LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetTransformInfo.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2017-09-14 15:23:50

                 Hit    Total    Coverage
  Lines:         203      218      93.1 %
  Functions:      22       22     100.0 %

          Line data    Source code
       1             : //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // \file
      11             : // This file implements a TargetTransformInfo analysis pass specific to the
      12             : // AMDGPU target machine. It uses the target's detailed information to provide
      13             : // more precise answers to certain TTI queries, while letting the target
      14             : // independent and default TTI implementations handle the rest.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : 
      18             : #include "AMDGPUTargetTransformInfo.h"
      19             : #include "AMDGPUSubtarget.h"
      20             : #include "llvm/ADT/STLExtras.h"
      21             : #include "llvm/Analysis/LoopInfo.h"
      22             : #include "llvm/Analysis/TargetTransformInfo.h"
      23             : #include "llvm/Analysis/ValueTracking.h"
      24             : #include "llvm/CodeGen/ISDOpcodes.h"
      25             : #include "llvm/CodeGen/MachineValueType.h"
      26             : #include "llvm/CodeGen/ValueTypes.h"
      27             : #include "llvm/IR/Argument.h"
      28             : #include "llvm/IR/Attributes.h"
      29             : #include "llvm/IR/BasicBlock.h"
      30             : #include "llvm/IR/CallingConv.h"
      31             : #include "llvm/IR/DataLayout.h"
      32             : #include "llvm/IR/DerivedTypes.h"
      33             : #include "llvm/IR/Function.h"
      34             : #include "llvm/IR/Instruction.h"
      35             : #include "llvm/IR/Instructions.h"
      36             : #include "llvm/IR/IntrinsicInst.h"
      37             : #include "llvm/IR/Module.h"
      38             : #include "llvm/IR/PatternMatch.h"
      39             : #include "llvm/IR/Type.h"
      40             : #include "llvm/IR/Value.h"
      41             : #include "llvm/MC/SubtargetFeature.h"
      42             : #include "llvm/Support/Casting.h"
      43             : #include "llvm/Support/CommandLine.h"
      44             : #include "llvm/Support/Debug.h"
      45             : #include "llvm/Support/ErrorHandling.h"
      46             : #include "llvm/Support/raw_ostream.h"
      47             : #include "llvm/Target/TargetMachine.h"
      48             : #include <algorithm>
      49             : #include <cassert>
      50             : #include <limits>
      51             : #include <utility>
      52             : 
      53             : using namespace llvm;
      54             : 
      55             : #define DEBUG_TYPE "AMDGPUtti"
      56             : 
      57       72306 : static cl::opt<unsigned> UnrollThresholdPrivate(
      58             :   "amdgpu-unroll-threshold-private",
      59      216918 :   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
      60      289224 :   cl::init(2500), cl::Hidden);
      61             : 
      62       72306 : static cl::opt<unsigned> UnrollThresholdLocal(
      63             :   "amdgpu-unroll-threshold-local",
      64      216918 :   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
      65      289224 :   cl::init(1000), cl::Hidden);
      66             : 
      67       72306 : static cl::opt<unsigned> UnrollThresholdIf(
      68             :   "amdgpu-unroll-threshold-if",
      69      216918 :   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
      70      289224 :   cl::init(150), cl::Hidden);
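                     : // These thresholds are ordinary cl::opt flags, so they can be tuned without
                     : // rebuilding. A minimal sketch (values are illustrative; the -mllvm prefix is
                     : // only needed when the options are passed through the clang driver):
                     : //   opt -mtriple=amdgcn-- -amdgpu-unroll-threshold-private=4000 ...
                     : //   clang ... -mllvm -amdgpu-unroll-threshold-local=2000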
      71             : 
      72           6 : static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
      73             :                               unsigned Depth = 0) {
      74           6 :   const Instruction *I = dyn_cast<Instruction>(Cond);
      75             :   if (!I)
      76             :     return false;
      77             : 
      78          24 :   for (const Value *V : I->operand_values()) {
      79          12 :     if (!L->contains(I))
      80           0 :       continue;
      81           4 :     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      82           8 :       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
      83           0 :                   return SubLoop->contains(PHI); }))
      84             :         return true;
      85           2 :     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      86             :       return true;
      87             :   }
      88             :   return false;
      89             : }
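                     : // A sketch of the IR shape dependsOnLocalPhi looks for (names are
                     : // illustrative): a condition that is fed, directly or through a few operands,
                     : // by a PHI of this loop rather than of one of its subloops.
                     : //   %p = phi i32 [ 0, %preheader ], [ %p.next, %latch ]
                     : //   %c = icmp eq i32 %p, %limit
                     : //   br i1 %c, label %then, label %else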
      90             : 
      91          17 : void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
      92             :                                             TTI::UnrollingPreferences &UP) {
      93          17 :   UP.Threshold = 300; // Twice the default.
      94          17 :   UP.MaxCount = std::numeric_limits<unsigned>::max();
      95          17 :   UP.Partial = true;
      96             : 
      97             :   // TODO: Do we want runtime unrolling?
      98             : 
       99             :   // Maximum alloca size that can fit in registers. Reserve 16 registers.
     100          17 :   const unsigned MaxAlloca = (256 - 16) * 4;
     101          17 :   unsigned ThresholdPrivate = UnrollThresholdPrivate;
     102          17 :   unsigned ThresholdLocal = UnrollThresholdLocal;
     103          17 :   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
     104          34 :   AMDGPUAS ASST = ST->getAMDGPUAS();
     105          95 :   for (const BasicBlock *BB : L->getBlocks()) {
     106          32 :     const DataLayout &DL = BB->getModule()->getDataLayout();
     107          32 :     unsigned LocalGEPsSeen = 0;
     108             : 
     109          64 :     if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
     110           0 :                return SubLoop->contains(BB); }))
     111           0 :         continue; // Block belongs to an inner loop.
     112             : 
     113         241 :     for (const Instruction &I : *BB) {
      114             :       // Unroll a loop which contains an "if" statement whose condition is
      115             :       // defined by a PHI belonging to the loop. This may help to eliminate the
      116             :       // if region and potentially even the PHI itself, saving on both divergence
      117             :       // and registers used for the PHI.
      118             :       // Add a small bonus for each such "if" statement.
     119          42 :       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
     120          54 :         if (UP.Threshold < MaxBoost && Br->isConditional()) {
     121          53 :           if (L->isLoopExiting(Br->getSuccessor(0)) ||
     122          18 :               L->isLoopExiting(Br->getSuccessor(1)))
     123          12 :             continue;
     124           4 :           if (dependsOnLocalPhi(L, Br->getCondition())) {
     125           4 :             UP.Threshold += UnrollThresholdIf;
     126             :             DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
     127             :                          << " for loop:\n" << *L << " due to " << *Br << '\n');
     128           4 :             if (UP.Threshold >= MaxBoost)
     129           5 :               return;
     130             :           }
     131             :         }
     132          15 :         continue;
     133             :       }
     134             : 
     135          24 :       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
     136          94 :       if (!GEP)
     137          94 :         continue;
     138             : 
     139          24 :       unsigned AS = GEP->getAddressSpace();
     140          24 :       unsigned Threshold = 0;
     141          24 :       if (AS == ASST.PRIVATE_ADDRESS)
     142          13 :         Threshold = ThresholdPrivate;
     143          20 :       else if (AS == ASST.LOCAL_ADDRESS)
     144           2 :         Threshold = ThresholdLocal;
     145             :       else
     146           9 :         continue;
     147             : 
     148          15 :       if (UP.Threshold >= Threshold)
     149           0 :         continue;
     150             : 
     151          15 :       if (AS == ASST.PRIVATE_ADDRESS) {
     152          13 :         const Value *Ptr = GEP->getPointerOperand();
     153             :         const AllocaInst *Alloca =
     154          20 :             dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
     155          14 :         if (!Alloca || !Alloca->isStaticAlloca())
     156           7 :           continue;
     157           6 :         Type *Ty = Alloca->getAllocatedType();
     158          12 :         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
     159           6 :         if (AllocaSize > MaxAlloca)
     160           1 :           continue;
     161           2 :       } else if (AS == ASST.LOCAL_ADDRESS) {
     162           2 :         LocalGEPsSeen++;
      163             :         // Inhibit unrolling for local memory if we have seen addressing that is
      164             :         // not based on a variable; most likely we will be unable to combine it.
      165             :         // Do not unroll too-deep inner loops for local memory, to give an outer
      166             :         // loop a chance to be unrolled for a more important reason.
     167           6 :         if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
     168           4 :             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
     169           2 :              !isa<Argument>(GEP->getPointerOperand())))
     170           0 :           continue;
     171             :       }
     172             : 
     173             :       // Check if GEP depends on a value defined by this loop itself.
     174           7 :       bool HasLoopDef = false;
     175          26 :       for (const Value *Op : GEP->operands()) {
     176          12 :         const Instruction *Inst = dyn_cast<Instruction>(Op);
     177          24 :         if (!Inst || L->isLoopInvariant(Op))
     178          12 :           continue;
     179             : 
     180          14 :         if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
     181           0 :              return SubLoop->contains(Inst); }))
     182           0 :           continue;
     183             :         HasLoopDef = true;
     184             :         break;
     185             :       }
     186           7 :       if (!HasLoopDef)
     187           0 :         continue;
     188             : 
     189             :       // We want to do whatever we can to limit the number of alloca
      190             :       // instructions that make it through to the code generator. Allocas
     191             :       // require us to use indirect addressing, which is slow and prone to
     192             :       // compiler bugs.  If this loop does an address calculation on an
     193             :       // alloca ptr, then we want to use a higher than normal loop unroll
     194             :       // threshold. This will give SROA a better chance to eliminate these
     195             :       // allocas.
     196             :       //
     197             :       // We also want to have more unrolling for local memory to let ds
     198             :       // instructions with different offsets combine.
     199             :       //
     200             :       // Don't use the maximum allowed value here as it will make some
     201             :       // programs way too big.
     202           7 :       UP.Threshold = Threshold;
     203             :       DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
     204             :                    << *L << " due to " << *GEP << '\n');
     205           7 :       if (UP.Threshold >= MaxBoost)
     206             :         return;
     207             :     }
     208             :   }
     209             : }
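                     : // A shape the private-memory boost above is aimed at (a C-like sketch, not
                     : // taken from the tests): a small static alloca addressed by a loop-varying
                     : // index.
                     : //   float tmp[16];              // static alloca in private (scratch) memory
                     : //   for (int i = 0; i < n; ++i)
                     : //     tmp[i & 15] += in[i];     // GEP into the alloca depends on the IV
                     : // Fully unrolling such a loop turns every index into a constant, which gives
                     : // SROA a chance to replace tmp[] with registers instead of scratch accesses.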
     210             : 
     211        2123 : unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
     212             :   // The concept of vector registers doesn't really exist. Some packed vector
     213             :   // operations operate on the normal 32-bit registers.
     214             : 
     215             :   // Number of VGPRs on SI.
     216        2123 :   if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
     217             :     return 256;
     218             : 
     219         196 :   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
     220             : }
     221             : 
     222        2123 : unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
     223             :   // This is really the number of registers to fill when vectorizing /
     224             :   // interleaving loops, so we lie to avoid trying to use all registers.
     225        2123 :   return getHardwareNumberOfRegisters(Vec) >> 3;
     226             : }
     227             : 
     228          34 : unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
     229          34 :   return 32;
     230             : }
     231             : 
     232          22 : unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
     233          22 :   return 32;
     234             : }
     235             : 
     236       24522 : unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
     237       49044 :   AMDGPUAS AS = ST->getAMDGPUAS();
     238       24522 :   if (AddrSpace == AS.GLOBAL_ADDRESS ||
     239        5253 :       AddrSpace == AS.CONSTANT_ADDRESS ||
     240             :       AddrSpace == AS.FLAT_ADDRESS)
     241             :     return 128;
     242        4837 :   if (AddrSpace == AS.LOCAL_ADDRESS ||
     243             :       AddrSpace == AS.REGION_ADDRESS)
     244             :     return 64;
     245        1227 :   if (AddrSpace == AS.PRIVATE_ADDRESS)
     246         942 :     return 8 * ST->getMaxPrivateElementSize();
     247             : 
     248         570 :   if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
     249             :       (AddrSpace == AS.PARAM_D_ADDRESS ||
     250         285 :       AddrSpace == AS.PARAM_I_ADDRESS ||
     251             :       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
     252             :       AddrSpace <= AS.CONSTANT_BUFFER_15)))
     253             :     return 128;
     254           0 :   llvm_unreachable("unhandled address space");
     255             : }
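                     : // These widths are upper bounds, in bits, on a single vectorized memory
                     : // access and are consumed by passes such as the load/store vectorizer. For
                     : // example, the 128-bit limit for global/constant/flat memory allows at most a
                     : // <4 x i32> or <2 x i64> access, while the private-memory limit tracks the
                     : // subtarget's maximum private element size.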
     256             : 
     257        1077 : bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
     258             :                                                unsigned Alignment,
     259             :                                                unsigned AddrSpace) const {
     260             :   // We allow vectorization of flat stores, even though we may need to decompose
     261             :   // them later if they may access private memory. We don't have enough context
     262             :   // here, and legalization can handle it.
     263        2154 :   if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
     264         173 :     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
     265          64 :       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
     266             :   }
     267             :   return true;
     268             : }
     269             : 
     270         716 : bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
     271             :                                                 unsigned Alignment,
     272             :                                                 unsigned AddrSpace) const {
     273         716 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     274             : }
     275             : 
     276         361 : bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
     277             :                                                  unsigned Alignment,
     278             :                                                  unsigned AddrSpace) const {
     279         361 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     280             : }
     281             : 
     282           7 : unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
     283             :   // Disable unrolling if the loop is not vectorized.
     284             :   // TODO: Enable this again.
     285           7 :   if (VF == 1)
     286             :     return 1;
     287             : 
     288           1 :   return 8;
     289             : }
     290             : 
     291         267 : int AMDGPUTTIImpl::getArithmeticInstrCost(
     292             :     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     293             :     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
     294             :     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
     295         267 :   EVT OrigTy = TLI->getValueType(DL, Ty);
     296         267 :   if (!OrigTy.isSimple()) {
     297          93 :     return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
     298          62 :                                          Opd1PropInfo, Opd2PropInfo);
     299             :   }
     300             : 
     301             :   // Legalize the type.
     302         236 :   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
     303         236 :   int ISD = TLI->InstructionOpcodeToISD(Opcode);
     304             : 
      305             :   // Because we don't have any legal vector operations, only legal vector
      306             :   // types, we need to account for split vectors.
     307         236 :   unsigned NElts = LT.second.isVector() ?
     308         236 :     LT.second.getVectorNumElements() : 1;
     309             : 
     310         236 :   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
     311             : 
     312         236 :   switch (ISD) {
     313          12 :   case ISD::SHL:
     314             :   case ISD::SRL:
     315             :   case ISD::SRA:
     316          12 :     if (SLT == MVT::i64)
     317          12 :       return get64BitInstrCost() * LT.first * NElts;
     318             : 
     319             :     // i32
     320           6 :     return getFullRateInstrCost() * LT.first * NElts;
     321          53 :   case ISD::ADD:
     322             :   case ISD::SUB:
     323             :   case ISD::AND:
     324             :   case ISD::OR:
     325             :   case ISD::XOR:
     326          53 :     if (SLT == MVT::i64){
      327             :       // i64 and, or, and xor are typically split into 2 VALU instructions.
     328          26 :       return 2 * getFullRateInstrCost() * LT.first * NElts;
     329             :     }
     330             : 
     331          27 :     return LT.first * NElts * getFullRateInstrCost();
     332             :   case ISD::MUL: {
     333           9 :     const int QuarterRateCost = getQuarterRateInstrCost();
     334           9 :     if (SLT == MVT::i64) {
     335           5 :       const int FullRateCost = getFullRateInstrCost();
     336           5 :       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
     337             :     }
     338             : 
     339             :     // i32
     340           4 :     return QuarterRateCost * NElts * LT.first;
     341             :   }
     342          78 :   case ISD::FADD:
     343             :   case ISD::FSUB:
     344             :   case ISD::FMUL:
     345          78 :     if (SLT == MVT::f64)
     346          36 :       return LT.first * NElts * get64BitInstrCost();
     347             : 
     348          60 :     if (SLT == MVT::f32 || SLT == MVT::f16)
     349          60 :       return LT.first * NElts * getFullRateInstrCost();
     350             :     break;
     351          84 :   case ISD::FDIV:
     352             :   case ISD::FREM:
      353             :     // FIXME: frem should be handled separately. The fdiv in it accounts for
      354             :     // most of the cost, but the current lowering is also not entirely correct.
     355          84 :     if (SLT == MVT::f64) {
     356          48 :       int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
     357             :       // Add cost of workaround.
     358          24 :       if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
     359           8 :         Cost += 3 * getFullRateInstrCost();
     360             : 
     361          24 :       return LT.first * Cost * NElts;
     362          60 :     }
     363             : 
     364         204 :     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
     365             :       // TODO: This is more complicated, unsafe flags etc.
     366          46 :       if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
     367           2 :           (SLT == MVT::f16 && ST->has16BitInsts())) {
     368          18 :         return LT.first * getQuarterRateInstrCost() * NElts;
     369             :       }
     370             :     }
     371             : 
     372          42 :     if (SLT == MVT::f16 && ST->has16BitInsts()) {
     373             :       // 2 x v_cvt_f32_f16
     374             :       // f32 rcp
     375             :       // f32 fmul
     376             :       // v_cvt_f16_f32
     377             :       // f16 div_fixup
     378           3 :       int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
     379           3 :       return LT.first * Cost * NElts;
     380             :     }
     381             : 
     382          39 :     if (SLT == MVT::f32 || SLT == MVT::f16) {
     383          39 :       int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
     384             : 
     385          39 :       if (!ST->hasFP32Denormals()) {
     386             :         // FP mode switches.
     387          24 :         Cost += 2 * getFullRateInstrCost();
     388             :       }
     389             : 
     390          39 :       return LT.first * NElts * Cost;
     391           0 :     }
     392             :     break;
     393             :   default:
     394             :     break;
     395             :   }
     396             : 
     397           0 :   return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
     398           0 :                                        Opd1PropInfo, Opd2PropInfo);
     399             : }
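                     : // Worked example (a sketch, assuming i64 is already legal on the subtarget so
                     : // that LT.first == 1 and NElts == 1): a scalar i64 'xor' takes the ISD::XOR
                     : // case above and is costed at 2 * getFullRateInstrCost(), i.e. one full-rate
                     : // 32-bit VALU operation for each half of the 64-bit value.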
     400             : 
     401         310 : unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
     402             :   // XXX - For some reason this isn't called for switch.
     403         310 :   switch (Opcode) {
     404             :   case Instruction::Br:
     405             :   case Instruction::Ret:
     406             :     return 10;
     407          16 :   default:
     408          16 :     return BaseT::getCFInstrCost(Opcode);
     409             :   }
     410             : }
     411             : 
     412         464 : int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     413             :                                       unsigned Index) {
     414         464 :   switch (Opcode) {
     415         464 :   case Instruction::ExtractElement:
     416             :   case Instruction::InsertElement: {
     417             :     unsigned EltSize
     418         928 :       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     419         464 :     if (EltSize < 32) {
     420         115 :       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
     421             :         return 0;
     422         138 :       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     423             :     }
     424             : 
     425             :     // Extracts are just reads of a subregister, so are free. Inserts are
     426             :     // considered free because we don't want to have any cost for scalarizing
     427             :     // operations, and we don't have to copy into a different register class.
     428             : 
     429             :     // Dynamic indexing isn't free and is best avoided.
     430         349 :     return Index == ~0u ? 2 : 0;
     431             :   }
     432           0 :   default:
     433           0 :     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     434             :   }
     435             : }
     436             : 
     437       41760 : static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
     438       41760 :   switch (I->getIntrinsicID()) {
     439             :   case Intrinsic::amdgcn_workitem_id_x:
     440             :   case Intrinsic::amdgcn_workitem_id_y:
     441             :   case Intrinsic::amdgcn_workitem_id_z:
     442             :   case Intrinsic::amdgcn_interp_mov:
     443             :   case Intrinsic::amdgcn_interp_p1:
     444             :   case Intrinsic::amdgcn_interp_p2:
     445             :   case Intrinsic::amdgcn_mbcnt_hi:
     446             :   case Intrinsic::amdgcn_mbcnt_lo:
     447             :   case Intrinsic::r600_read_tidig_x:
     448             :   case Intrinsic::r600_read_tidig_y:
     449             :   case Intrinsic::r600_read_tidig_z:
     450             :   case Intrinsic::amdgcn_atomic_inc:
     451             :   case Intrinsic::amdgcn_atomic_dec:
     452             :   case Intrinsic::amdgcn_image_atomic_swap:
     453             :   case Intrinsic::amdgcn_image_atomic_add:
     454             :   case Intrinsic::amdgcn_image_atomic_sub:
     455             :   case Intrinsic::amdgcn_image_atomic_smin:
     456             :   case Intrinsic::amdgcn_image_atomic_umin:
     457             :   case Intrinsic::amdgcn_image_atomic_smax:
     458             :   case Intrinsic::amdgcn_image_atomic_umax:
     459             :   case Intrinsic::amdgcn_image_atomic_and:
     460             :   case Intrinsic::amdgcn_image_atomic_or:
     461             :   case Intrinsic::amdgcn_image_atomic_xor:
     462             :   case Intrinsic::amdgcn_image_atomic_inc:
     463             :   case Intrinsic::amdgcn_image_atomic_dec:
     464             :   case Intrinsic::amdgcn_image_atomic_cmpswap:
     465             :   case Intrinsic::amdgcn_buffer_atomic_swap:
     466             :   case Intrinsic::amdgcn_buffer_atomic_add:
     467             :   case Intrinsic::amdgcn_buffer_atomic_sub:
     468             :   case Intrinsic::amdgcn_buffer_atomic_smin:
     469             :   case Intrinsic::amdgcn_buffer_atomic_umin:
     470             :   case Intrinsic::amdgcn_buffer_atomic_smax:
     471             :   case Intrinsic::amdgcn_buffer_atomic_umax:
     472             :   case Intrinsic::amdgcn_buffer_atomic_and:
     473             :   case Intrinsic::amdgcn_buffer_atomic_or:
     474             :   case Intrinsic::amdgcn_buffer_atomic_xor:
     475             :   case Intrinsic::amdgcn_buffer_atomic_cmpswap:
     476             :   case Intrinsic::amdgcn_ps_live:
     477             :   case Intrinsic::amdgcn_ds_swizzle:
     478             :     return true;
     479       27695 :   default:
     480       27695 :     return false;
     481             :   }
     482             : }
     483             : 
     484      129756 : static bool isArgPassedInSGPR(const Argument *A) {
     485      129756 :   const Function *F = A->getParent();
     486             : 
     487             :   // Arguments to compute shaders are never a source of divergence.
     488      129756 :   CallingConv::ID CC = F->getCallingConv();
     489             :   switch (CC) {
     490             :   case CallingConv::AMDGPU_KERNEL:
     491             :   case CallingConv::SPIR_KERNEL:
     492             :     return true;
     493        9789 :   case CallingConv::AMDGPU_VS:
     494             :   case CallingConv::AMDGPU_HS:
     495             :   case CallingConv::AMDGPU_GS:
     496             :   case CallingConv::AMDGPU_PS:
     497             :   case CallingConv::AMDGPU_CS:
     498             :     // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
     499             :     // Everything else is in VGPRs.
     500       26711 :     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
     501       16922 :            F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
     502        4281 :   default:
     503             :     // TODO: Should calls support inreg for SGPR inputs?
     504        4281 :     return false;
     505             :   }
     506             : }
     507             : 
     508             : /// \returns true if the result of the value could potentially be
     509             : /// different across workitems in a wavefront.
     510      510194 : bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
     511      639950 :   if (const Argument *A = dyn_cast<Argument>(V))
     512      129756 :     return !isArgPassedInSGPR(A);
     513             : 
     514             :   // Loads from the private address space are divergent, because threads
     515             :   // can execute the load instruction with the same inputs and get different
     516             :   // results.
     517             :   //
     518             :   // All other loads are not divergent, because if threads issue loads with the
     519             :   // same arguments, they will always get the same result.
     520      426121 :   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
     521       91366 :     return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
     522             : 
     523             :   // Atomics are divergent because they are executed sequentially: when an
     524             :   // atomic operation refers to the same address in each thread, then each
     525             :   // thread after the first sees the value written by the previous thread as
      526             :   // the original value.
     527      664533 :   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
     528             :     return true;
     529             : 
     530      370648 :   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
     531       41760 :     return isIntrinsicSourceOfDivergence(Intrinsic);
     532             : 
     533             :   // Assume all function calls are a source of divergence.
     534      567199 :   if (isa<CallInst>(V) || isa<InvokeInst>(V))
     535             :     return true;
     536             : 
     537             :   return false;
     538             : }
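                     : // To make the rules above concrete: in an amdgpu_kernel function a value like
                     : //   %tid = call i32 @llvm.amdgcn.workitem.id.x()
                     : // is divergent (every lane sees its own id), the kernel's formal arguments are
                     : // uniform because they are passed in SGPRs, and a load is treated as divergent
                     : // only when its pointer is in the private address space.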
     539             : 
     540      141445 : bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
     541       19677 :   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     542       19677 :     switch (Intrinsic->getIntrinsicID()) {
     543             :     default:
     544             :       return false;
     545          26 :     case Intrinsic::amdgcn_readfirstlane:
     546             :     case Intrinsic::amdgcn_readlane:
     547          26 :       return true;
     548             :     }
     549             :   }
     550             :   return false;
     551             : }
     552             : 
     553          12 : unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     554             :                                        Type *SubTp) {
     555          12 :   if (ST->hasVOP3PInsts()) {
     556           6 :     VectorType *VT = cast<VectorType>(Tp);
     557          12 :     if (VT->getNumElements() == 2 &&
     558           6 :         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      559             :       // With op_sel, VOP3P instructions can freely access either the low or
      560             :       // the high half of a register, so any swizzle is free.
     561             : 
     562             :       switch (Kind) {
     563             :       case TTI::SK_Broadcast:
     564             :       case TTI::SK_Reverse:
     565             :       case TTI::SK_PermuteSingleSrc:
     566             :         return 0;
     567             :       default:
     568             :         break;
     569             :       }
     570             :     }
     571             :   }
     572             : 
     573           7 :   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
     574             : }
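                     : // Concretely: on a subtarget with VOP3P instructions, broadcasting, reversing,
                     : // or arbitrarily permuting a two-element 16-bit vector such as <2 x half> is
                     : // reported as free, since op_sel lets each operand read either 16-bit half of
                     : // a 32-bit register; all other cases fall back to the generic cost.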
     575             : 
     576          15 : bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
     577             :                                         const Function *Callee) const {
     578          15 :   const TargetMachine &TM = getTLI()->getTargetMachine();
     579             :   const FeatureBitset &CallerBits =
     580          15 :     TM.getSubtargetImpl(*Caller)->getFeatureBits();
     581             :   const FeatureBitset &CalleeBits =
     582          15 :     TM.getSubtargetImpl(*Callee)->getFeatureBits();
     583             : 
     584          60 :   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
     585          60 :   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
     586          15 :   return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
     587      216918 : }

Generated by: LCOV version 1.13