//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending
  /// operands to 32 bits, replacing \p I with an equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
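
  // For illustration (a sketch of the steps above, not code from the pass):
  // a uniform
  //   %r = add i16 %a, %b
  // becomes
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add nuw nsw i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16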

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending
  /// operands to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending
  /// operands to 32 bits, replacing \p I with a 32 bit 'select' operation,
  /// and truncating the result of the 32 bit 'select' operation back to
  /// \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
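
  // Sketch of the bitreverse promotion for a hypothetical uniform i8 value
  // (names made up for illustration):
  //   %r = call i8 @llvm.bitreverse.i8(i8 %x)
  // becomes
  //   %x32 = zext i8 %x to i32
  //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  //   %s   = lshr i32 %r32, 24    ; shift amount = 32 - 8
  //   %r   = trunc i32 %s to i8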

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, Instruction::BinaryOps Opc,
                        Value *Num, Value *Den) const;

  /// Check whether a scalar load can be widened.
  ///
  /// \details Uniform, small type loads from constant memory are widened to a
  /// full 32 bits and the result is then truncated, to allow a scalar load
  /// instead of a vector load.
  ///
  /// \returns True if the load can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
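//
// For example (an illustrative sketch, not a test case): with !fpmath
// metadata requesting 2.5 ulp or looser,
//   %d = fdiv float %a, %b, !fpmath !0
// can become
//   %d = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
// unless shouldKeepFDivF32() decides the plain fdiv should be kept.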
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

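// Multiply two i32 values as a full i64 product and return its low and high
// 32-bit halves as a (Lo, Hi) pair.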
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
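// (A single-precision float has a 24-bit significand, so requiring at least
// 9 sign bits in each 32-bit operand below guarantees the quotient fits in
// 24 bits and this expansion is exact.)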
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      { FQNeg, FB, FA }, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

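// Lower a div/rem of 32 bits or less to an inline sequence: first try the
// 24-bit floating-point expansion above, and otherwise form the quotient
// from a scaled reciprocal estimate of Den plus an error-correction term
// (see the step-by-step comments below). Signed cases are reduced to
// unsigned ones by stripping the operand signs first and reapplying the
// result sign at the end.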
Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            Instruction::BinaryOps Opc,
                                            Value *Num, Value *Den) const {
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
        Value *NumEltI = Builder.CreateExtractElement(Num, I);
        Value *DenEltI = Builder.CreateExtractElement(Den, I);
        Value *NewElt = expandDivRem32(Builder, Opc, NumEltI, DenEltI);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltI, DenEltI);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, I);
      }
    } else {
      NewDiv = expandDivRem32(Builder, Opc, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

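// Widen sub-dword, uniform loads from the constant address space to 32 bits
// so that a scalar load can be selected. For example (a sketch with made-up
// names; the address space number is illustrative):
//   %v = load i8, i8 addrspace(4)* %p, align 4
// becomes
//   %c = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
//   %w = load i32, i32 addrspace(4)* %c
//   %v = trunc i32 %w to i8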
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);
  AMDGPUASI = TM.getAMDGPUAS();

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
