LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetTransformInfo.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-07-13 00:08:38
Coverage:            Hit    Total    Coverage
  Lines:             231    260      88.8 %
  Functions:         36     39       92.3 %
Legend: Lines: hit / not hit

          Line data    Source code
       1             : //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // \file
      11             : // This file implements a TargetTransformInfo analysis pass specific to the
      12             : // AMDGPU target machine. It uses the target's detailed information to provide
       13             : // more precise answers to certain TTI queries, while letting the
       14             : // target-independent and default TTI implementations handle the rest.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : 
      18             : #include "AMDGPUTargetTransformInfo.h"
      19             : #include "AMDGPUSubtarget.h"
      20             : #include "Utils/AMDGPUBaseInfo.h"
      21             : #include "llvm/ADT/STLExtras.h"
      22             : #include "llvm/Analysis/LoopInfo.h"
      23             : #include "llvm/Analysis/TargetTransformInfo.h"
      24             : #include "llvm/Analysis/ValueTracking.h"
      25             : #include "llvm/CodeGen/ISDOpcodes.h"
      26             : #include "llvm/CodeGen/ValueTypes.h"
      27             : #include "llvm/IR/Argument.h"
      28             : #include "llvm/IR/Attributes.h"
      29             : #include "llvm/IR/BasicBlock.h"
      30             : #include "llvm/IR/CallingConv.h"
      31             : #include "llvm/IR/DataLayout.h"
      32             : #include "llvm/IR/DerivedTypes.h"
      33             : #include "llvm/IR/Function.h"
      34             : #include "llvm/IR/Instruction.h"
      35             : #include "llvm/IR/Instructions.h"
      36             : #include "llvm/IR/IntrinsicInst.h"
      37             : #include "llvm/IR/Module.h"
      38             : #include "llvm/IR/PatternMatch.h"
      39             : #include "llvm/IR/Type.h"
      40             : #include "llvm/IR/Value.h"
      41             : #include "llvm/MC/SubtargetFeature.h"
      42             : #include "llvm/Support/Casting.h"
      43             : #include "llvm/Support/CommandLine.h"
      44             : #include "llvm/Support/Debug.h"
      45             : #include "llvm/Support/ErrorHandling.h"
      46             : #include "llvm/Support/MachineValueType.h"
      47             : #include "llvm/Support/raw_ostream.h"
      48             : #include "llvm/Target/TargetMachine.h"
      49             : #include <algorithm>
      50             : #include <cassert>
      51             : #include <limits>
      52             : #include <utility>
      53             : 
      54             : using namespace llvm;
      55             : 
      56             : #define DEBUG_TYPE "AMDGPUtti"
      57             : 
      58       99743 : static cl::opt<unsigned> UnrollThresholdPrivate(
      59             :   "amdgpu-unroll-threshold-private",
      60       99743 :   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
      61      299229 :   cl::init(2500), cl::Hidden);
      62             : 
      63       99743 : static cl::opt<unsigned> UnrollThresholdLocal(
      64             :   "amdgpu-unroll-threshold-local",
      65       99743 :   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
      66      299229 :   cl::init(1000), cl::Hidden);
      67             : 
      68       99743 : static cl::opt<unsigned> UnrollThresholdIf(
      69             :   "amdgpu-unroll-threshold-if",
      70       99743 :   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
      71      299229 :   cl::init(150), cl::Hidden);
      72             : 
      73           8 : static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
      74             :                               unsigned Depth = 0) {
      75             :   const Instruction *I = dyn_cast<Instruction>(Cond);
      76             :   if (!I)
      77             :     return false;
      78             : 
      79          16 :   for (const Value *V : I->operand_values()) {
      80           8 :     if (!L->contains(I))
      81           0 :       continue;
      82             :     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      83           6 :       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
      84             :                   return SubLoop->contains(PHI); }))
      85             :         return true;
      86           2 :     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      87             :       return true;
      88             :   }
      89             :   return false;
      90             : }
      91             : 
      92          20 : void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
      93             :                                             TTI::UnrollingPreferences &UP) {
      94          20 :   UP.Threshold = 300; // Twice the default.
      95          20 :   UP.MaxCount = std::numeric_limits<unsigned>::max();
      96          20 :   UP.Partial = true;
      97             : 
      98             :   // TODO: Do we want runtime unrolling?
      99             : 
      100             :   // Maximum alloca size that can fit in registers. Reserve 16 registers.
     101             :   const unsigned MaxAlloca = (256 - 16) * 4;
     102          20 :   unsigned ThresholdPrivate = UnrollThresholdPrivate;
     103          20 :   unsigned ThresholdLocal = UnrollThresholdLocal;
     104          20 :   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
     105          40 :   const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
     106          88 :   for (const BasicBlock *BB : L->getBlocks()) {
     107          37 :     const DataLayout &DL = BB->getModule()->getDataLayout();
     108             :     unsigned LocalGEPsSeen = 0;
     109             : 
     110          37 :     if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
     111             :                return SubLoop->contains(BB); }))
     112           0 :         continue; // Block belongs to an inner loop.
     113             : 
     114         223 :     for (const Instruction &I : *BB) {
      115             :       // Unroll a loop which contains an "if" statement whose condition is
      116             :       // defined by a PHI belonging to the loop. This may help to eliminate
      117             :       // the if region and potentially even the PHI itself, saving on both
      118             :       // divergence and registers used for the PHI.
      119             :       // Add a small bonus for each such "if" statement.
     120          19 :       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
     121          68 :         if (UP.Threshold < MaxBoost && Br->isConditional()) {
     122          71 :           if (L->isLoopExiting(Br->getSuccessor(0)) ||
     123          14 :               L->isLoopExiting(Br->getSuccessor(1)))
     124          15 :             continue;
     125           6 :           if (dependsOnLocalPhi(L, Br->getCondition())) {
     126           6 :             UP.Threshold += UnrollThresholdIf;
     127             :             LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
     128             :                               << " for loop:\n"
     129             :                               << *L << " due to " << *Br << '\n');
     130           6 :             if (UP.Threshold >= MaxBoost)
     131             :               return;
     132             :           }
     133             :         }
     134          19 :         continue;
     135             :       }
     136             : 
     137             :       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
     138         125 :       if (!GEP)
     139         125 :         continue;
     140             : 
     141             :       unsigned AS = GEP->getAddressSpace();
     142             :       unsigned Threshold = 0;
     143          30 :       if (AS == ASST.PRIVATE_ADDRESS)
     144           5 :         Threshold = ThresholdPrivate;
     145          48 :       else if (AS == ASST.LOCAL_ADDRESS)
     146           2 :         Threshold = ThresholdLocal;
     147             :       else
     148          23 :         continue;
     149             : 
     150           7 :       if (UP.Threshold >= Threshold)
     151           0 :         continue;
     152             : 
     153           7 :       if (AS == ASST.PRIVATE_ADDRESS) {
     154             :         const Value *Ptr = GEP->getPointerOperand();
     155             :         const AllocaInst *Alloca =
     156             :             dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
     157           6 :         if (!Alloca || !Alloca->isStaticAlloca())
     158           1 :           continue;
     159           4 :         Type *Ty = Alloca->getAllocatedType();
     160           8 :         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
     161           4 :         if (AllocaSize > MaxAlloca)
     162           1 :           continue;
     163           2 :       } else if (AS == ASST.LOCAL_ADDRESS) {
     164           2 :         LocalGEPsSeen++;
      165             :         // Inhibit unrolling for local memory if we have seen addressing that
      166             :         // is not to a variable; most likely we will be unable to combine it.
      167             :         // Do not unroll too-deep inner loops for local memory, to give an
      168             :         // outer loop a chance to be unrolled for a more important reason.
     169           6 :         if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
     170           2 :             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
     171             :              !isa<Argument>(GEP->getPointerOperand())))
     172           0 :           continue;
     173             :       }
     174             : 
     175             :       // Check if GEP depends on a value defined by this loop itself.
     176             :       bool HasLoopDef = false;
     177          26 :       for (const Value *Op : GEP->operands()) {
     178             :         const Instruction *Inst = dyn_cast<Instruction>(Op);
     179          16 :         if (!Inst || L->isLoopInvariant(Op))
     180           8 :           continue;
     181             : 
     182           5 :         if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
     183             :              return SubLoop->contains(Inst); }))
     184           0 :           continue;
     185             :         HasLoopDef = true;
     186             :         break;
     187             :       }
     188           5 :       if (!HasLoopDef)
     189           0 :         continue;
     190             : 
     191             :       // We want to do whatever we can to limit the number of alloca
      192             :       // instructions that make it through to the code generator. Allocas
      193             :       // require us to use indirect addressing, which is slow and prone to
      194             :       // compiler bugs. If this loop does an address calculation on an
      195             :       // alloca pointer, then we want to use a higher-than-normal loop unroll
     196             :       // threshold. This will give SROA a better chance to eliminate these
     197             :       // allocas.
     198             :       //
     199             :       // We also want to have more unrolling for local memory to let ds
     200             :       // instructions with different offsets combine.
     201             :       //
     202             :       // Don't use the maximum allowed value here as it will make some
     203             :       // programs way too big.
     204           5 :       UP.Threshold = Threshold;
     205             :       LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
     206             :                         << " for loop:\n"
     207             :                         << *L << " due to " << *GEP << '\n');
     208           5 :       if (UP.Threshold >= MaxBoost)
     209             :         return;
     210             :     }
     211             :   }
     212             : }
     213             : 
     214        2483 : unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
     215             :   // The concept of vector registers doesn't really exist. Some packed vector
     216             :   // operations operate on the normal 32-bit registers.
     217        2483 :   return 256;
     218             : }
     219             : 
     220        2483 : unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
     221             :   // This is really the number of registers to fill when vectorizing /
     222             :   // interleaving loops, so we lie to avoid trying to use all registers.
     223        2483 :   return getHardwareNumberOfRegisters(Vec) >> 3;
     224             : }
     225             : 
     226          89 : unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
     227          89 :   return 32;
     228             : }
     229             : 
     230          75 : unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
     231          75 :   return 32;
     232             : }
     233             : 
     234       12386 : unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
     235             :                                             unsigned ChainSizeInBytes,
     236             :                                             VectorType *VecTy) const {
     237       12386 :   unsigned VecRegBitWidth = VF * LoadSize;
     238       12386 :   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
     239             :     // TODO: Support element-size less than 32bit?
     240        2051 :     return 128 / LoadSize;
     241             : 
     242             :   return VF;
     243             : }
     244             : 
     245        3520 : unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
     246             :                                              unsigned ChainSizeInBytes,
     247             :                                              VectorType *VecTy) const {
     248        3520 :   unsigned VecRegBitWidth = VF * StoreSize;
     249        3520 :   if (VecRegBitWidth > 128)
     250        3219 :     return 128 / StoreSize;
     251             : 
     252             :   return VF;
     253             : }
     254             : 
     255       61003 : unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
     256       61003 :   AMDGPUAS AS = ST->getAMDGPUAS();
     257      122006 :   if (AddrSpace == AS.GLOBAL_ADDRESS ||
     258       67415 :       AddrSpace == AS.CONSTANT_ADDRESS ||
     259             :       AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
     260             :     return 512;
     261             :   }
     262             : 
     263        6298 :   if (AddrSpace == AS.FLAT_ADDRESS ||
     264         928 :       AddrSpace == AS.LOCAL_ADDRESS ||
     265             :       AddrSpace == AS.REGION_ADDRESS)
     266             :     return 128;
     267             : 
     268         896 :   if (AddrSpace == AS.PRIVATE_ADDRESS)
     269         896 :     return 8 * ST->getMaxPrivateElementSize();
     270             : 
     271           0 :   llvm_unreachable("unhandled address space");
     272             : }
     273             : 
     274        8698 : bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
     275             :                                                unsigned Alignment,
     276             :                                                unsigned AddrSpace) const {
     277             :   // We allow vectorization of flat stores, even though we may need to decompose
     278             :   // them later if they may access private memory. We don't have enough context
     279             :   // here, and legalization can handle it.
     280       17396 :   if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
     281         172 :     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
     282          64 :       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
     283             :   }
     284             :   return true;
     285             : }
     286             : 
     287        8264 : bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
     288             :                                                 unsigned Alignment,
     289             :                                                 unsigned AddrSpace) const {
     290        8264 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     291             : }
     292             : 
     293         434 : bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
     294             :                                                  unsigned Alignment,
     295             :                                                  unsigned AddrSpace) const {
     296         434 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     297             : }
     298             : 
     299           8 : unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
     300             :   // Disable unrolling if the loop is not vectorized.
     301             :   // TODO: Enable this again.
     302           8 :   if (VF == 1)
     303             :     return 1;
     304             : 
     305           2 :   return 8;
     306             : }
     307             : 
     308        7184 : bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
     309             :                                        MemIntrinsicInfo &Info) const {
     310             :   switch (Inst->getIntrinsicID()) {
     311         749 :   case Intrinsic::amdgcn_atomic_inc:
     312             :   case Intrinsic::amdgcn_atomic_dec:
     313             :   case Intrinsic::amdgcn_ds_fadd:
     314             :   case Intrinsic::amdgcn_ds_fmin:
     315             :   case Intrinsic::amdgcn_ds_fmax: {
     316         749 :     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
     317             :     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
     318         749 :     if (!Ordering || !Volatile)
     319             :       return false; // Invalid.
     320             : 
     321         731 :     unsigned OrderingVal = Ordering->getZExtValue();
     322         731 :     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
     323             :       return false;
     324             : 
     325         731 :     Info.PtrVal = Inst->getArgOperand(0);
     326         731 :     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
     327         731 :     Info.ReadMem = true;
     328         731 :     Info.WriteMem = true;
     329         731 :     Info.IsVolatile = !Volatile->isNullValue();
     330         731 :     return true;
     331             :   }
     332             :   default:
     333             :     return false;
     334             :   }
     335             : }
     336             : 
     337         313 : int GCNTTIImpl::getArithmeticInstrCost(
     338             :     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     339             :     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
     340             :     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
     341         313 :   EVT OrigTy = TLI->getValueType(DL, Ty);
     342         313 :   if (!OrigTy.isSimple()) {
     343         102 :     return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
     344          68 :                                          Opd1PropInfo, Opd2PropInfo);
     345             :   }
     346             : 
     347             :   // Legalize the type.
     348         279 :   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
     349         279 :   int ISD = TLI->InstructionOpcodeToISD(Opcode);
     350             : 
      351             :   // Because we don't have any legal vector operations, only legal vector
      352             :   // types, we need to account for split vectors.
     353         279 :   unsigned NElts = LT.second.isVector() ?
     354             :     LT.second.getVectorNumElements() : 1;
     355             : 
     356             :   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
     357             : 
     358         279 :   switch (ISD) {
     359          12 :   case ISD::SHL:
     360             :   case ISD::SRL:
     361             :   case ISD::SRA:
     362          12 :     if (SLT == MVT::i64)
     363          12 :       return get64BitInstrCost() * LT.first * NElts;
     364             : 
     365             :     // i32
     366           6 :     return getFullRateInstrCost() * LT.first * NElts;
     367          67 :   case ISD::ADD:
     368             :   case ISD::SUB:
     369             :   case ISD::AND:
     370             :   case ISD::OR:
     371             :   case ISD::XOR:
     372          67 :     if (SLT == MVT::i64){
     373             :       // and, or and xor are typically split into 2 VALU instructions.
     374          26 :       return 2 * getFullRateInstrCost() * LT.first * NElts;
     375             :     }
     376             : 
     377          41 :     return LT.first * NElts * getFullRateInstrCost();
     378             :   case ISD::MUL: {
     379             :     const int QuarterRateCost = getQuarterRateInstrCost();
     380           9 :     if (SLT == MVT::i64) {
     381             :       const int FullRateCost = getFullRateInstrCost();
     382           5 :       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
     383             :     }
     384             : 
     385             :     // i32
     386           4 :     return QuarterRateCost * NElts * LT.first;
     387             :   }
     388         107 :   case ISD::FADD:
     389             :   case ISD::FSUB:
     390             :   case ISD::FMUL:
     391         107 :     if (SLT == MVT::f64)
     392          36 :       return LT.first * NElts * get64BitInstrCost();
     393             : 
     394          89 :     if (SLT == MVT::f32 || SLT == MVT::f16)
     395          89 :       return LT.first * NElts * getFullRateInstrCost();
     396             :     break;
     397          84 :   case ISD::FDIV:
     398             :   case ISD::FREM:
      399             :     // FIXME: frem should be handled separately. The fdiv in it accounts for
      400             :     // most of the cost, but the current lowering is also not entirely correct.
     401          84 :     if (SLT == MVT::f64) {
     402          48 :       int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
     403             :       // Add cost of workaround.
     404          24 :       if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
     405           8 :         Cost += 3 * getFullRateInstrCost();
     406             : 
     407          24 :       return LT.first * Cost * NElts;
     408          60 :     }
     409             : 
     410         150 :     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
     411             :       // TODO: This is more complicated, unsafe flags etc.
     412          46 :       if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
     413           2 :           (SLT == MVT::f16 && ST->has16BitInsts())) {
     414          18 :         return LT.first * getQuarterRateInstrCost() * NElts;
     415             :       }
     416             :     }
     417             : 
     418          42 :     if (SLT == MVT::f16 && ST->has16BitInsts()) {
     419             :       // 2 x v_cvt_f32_f16
     420             :       // f32 rcp
     421             :       // f32 fmul
     422             :       // v_cvt_f16_f32
     423             :       // f16 div_fixup
     424             :       int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
     425           3 :       return LT.first * Cost * NElts;
     426             :     }
     427             : 
     428          39 :     if (SLT == MVT::f32 || SLT == MVT::f16) {
     429             :       int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
     430             : 
     431          39 :       if (!ST->hasFP32Denormals()) {
     432             :         // FP mode switches.
     433             :         Cost += 2 * getFullRateInstrCost();
     434             :       }
     435             : 
     436          39 :       return LT.first * NElts * Cost;
     437           0 :     }
     438             :     break;
     439             :   default:
     440             :     break;
     441             :   }
     442             : 
     443           0 :   return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
     444           0 :                                        Opd1PropInfo, Opd2PropInfo);
     445             : }
     446             : 
     447         314 : unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
     448             :   // XXX - For some reason this isn't called for switch.
     449         314 :   switch (Opcode) {
     450             :   case Instruction::Br:
     451             :   case Instruction::Ret:
     452             :     return 10;
     453          17 :   default:
     454          17 :     return BaseT::getCFInstrCost(Opcode);
     455             :   }
     456             : }
     457             : 
     458          24 : int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
     459             :                                               bool IsPairwise) {
     460          24 :   EVT OrigTy = TLI->getValueType(DL, Ty);
     461             : 
      462             :   // Compute the cost on targets that have packed math instructions (which
      463             :   // support 16-bit types only).
     464          12 :   if (IsPairwise ||
     465          30 :       !ST->hasVOP3PInsts() ||
     466             :       OrigTy.getScalarSizeInBits() != 16)
     467          19 :     return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
     468             : 
     469           5 :   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
     470           5 :   return LT.first * getFullRateInstrCost();
     471             : }
     472             : 
     473          38 : int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
     474             :                                           bool IsPairwise,
     475             :                                           bool IsUnsigned) {
     476          38 :   EVT OrigTy = TLI->getValueType(DL, Ty);
     477             : 
      478             :   // Compute the cost on targets that have packed math instructions (which
      479             :   // support 16-bit types only).
     480          19 :   if (IsPairwise ||
     481          50 :       !ST->hasVOP3PInsts() ||
     482             :       OrigTy.getScalarSizeInBits() != 16)
     483          31 :     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
     484             : 
     485           7 :   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
     486           7 :   return LT.first * getHalfRateInstrCost();
     487             : }
     488             : 
     489        1373 : int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     490             :                                       unsigned Index) {
     491        1373 :   switch (Opcode) {
     492        1373 :   case Instruction::ExtractElement:
     493             :   case Instruction::InsertElement: {
     494             :     unsigned EltSize
     495        1373 :       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     496        1373 :     if (EltSize < 32) {
     497         768 :       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
     498             :         return 0;
     499         608 :       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     500             :     }
     501             : 
     502             :     // Extracts are just reads of a subregister, so are free. Inserts are
     503             :     // considered free because we don't want to have any cost for scalarizing
     504             :     // operations, and we don't have to copy into a different register class.
     505             : 
     506             :     // Dynamic indexing isn't free and is best avoided.
     507         605 :     return Index == ~0u ? 2 : 0;
     508             :   }
     509           0 :   default:
     510           0 :     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     511             :   }
     512             : }
     513             : 
     514             : 
     515             : 
     516      205348 : static bool isArgPassedInSGPR(const Argument *A) {
     517      205348 :   const Function *F = A->getParent();
     518             : 
     519             :   // Arguments to compute shaders are never a source of divergence.
     520             :   CallingConv::ID CC = F->getCallingConv();
     521             :   switch (CC) {
     522             :   case CallingConv::AMDGPU_KERNEL:
     523             :   case CallingConv::SPIR_KERNEL:
     524             :     return true;
     525       27153 :   case CallingConv::AMDGPU_VS:
     526             :   case CallingConv::AMDGPU_LS:
     527             :   case CallingConv::AMDGPU_HS:
     528             :   case CallingConv::AMDGPU_ES:
     529             :   case CallingConv::AMDGPU_GS:
     530             :   case CallingConv::AMDGPU_PS:
     531             :   case CallingConv::AMDGPU_CS:
     532             :     // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
     533             :     // Everything else is in VGPRs.
     534       72735 :     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
     535       45582 :            F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
     536       13185 :   default:
     537             :     // TODO: Should calls support inreg for SGPR inputs?
     538       13185 :     return false;
     539             :   }
     540             : }
     541             : 
     542             : /// \returns true if the result of the value could potentially be
     543             : /// different across workitems in a wavefront.
     544     1325008 : bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     545             :   if (const Argument *A = dyn_cast<Argument>(V))
     546      205348 :     return !isArgPassedInSGPR(A);
     547             : 
     548             :   // Loads from the private address space are divergent, because threads
     549             :   // can execute the load instruction with the same inputs and get different
     550             :   // results.
     551             :   //
     552             :   // All other loads are not divergent, because if threads issue loads with the
     553             :   // same arguments, they will always get the same result.
     554             :   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
     555      284998 :     return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
     556             : 
     557             :   // Atomics are divergent because they are executed sequentially: when an
     558             :   // atomic operation refers to the same address in each thread, then each
     559             :   // thread after the first sees the value written by the previous thread as
      560             :   // its original value.
     561             :   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
     562             :     return true;
     563             : 
     564             :   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
     565      119671 :     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
     566             : 
     567             :   // Assume all function calls are a source of divergence.
     568             :   if (isa<CallInst>(V) || isa<InvokeInst>(V))
     569             :     return true;
     570             : 
     571             :   return false;
     572             : }
     573             : 
     574      234654 : bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
     575             :   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     576       40236 :     switch (Intrinsic->getIntrinsicID()) {
     577             :     default:
     578             :       return false;
     579          31 :     case Intrinsic::amdgcn_readfirstlane:
     580             :     case Intrinsic::amdgcn_readlane:
     581          31 :       return true;
     582             :     }
     583             :   }
     584             :   return false;
     585             : }
     586             : 
     587          85 : unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     588             :                                        Type *SubTp) {
     589          85 :   if (ST->hasVOP3PInsts()) {
     590             :     VectorType *VT = cast<VectorType>(Tp);
     591          49 :     if (VT->getNumElements() == 2 &&
     592           9 :         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      593             :       // With op_sel, VOP3P instructions can freely access the low half or
      594             :       // high half of a register, so any swizzle is free.
     595             : 
     596             :       switch (Kind) {
     597             :       case TTI::SK_Broadcast:
     598             :       case TTI::SK_Reverse:
     599             :       case TTI::SK_PermuteSingleSrc:
     600             :         return 0;
     601             :       default:
     602             :         break;
     603             :       }
     604             :     }
     605             :   }
     606             : 
     607          81 :   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
     608             : }
     609             : 
     610         120 : bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
     611             :                                         const Function *Callee) const {
     612         120 :   const TargetMachine &TM = getTLI()->getTargetMachine();
     613             :   const FeatureBitset &CallerBits =
     614         120 :     TM.getSubtargetImpl(*Caller)->getFeatureBits();
     615             :   const FeatureBitset &CalleeBits =
     616         120 :     TM.getSubtargetImpl(*Callee)->getFeatureBits();
     617             : 
     618         120 :   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
     619         120 :   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
     620         120 :   return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
     621             : }
     622             : 
     623          17 : void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     624             :                                          TTI::UnrollingPreferences &UP) {
     625          17 :   CommonTTI.getUnrollingPreferences(L, SE, UP);
     626          17 : }
     627             : 
     628         220 : unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
     629         220 :   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
     630             : }
     631             : 
     632         220 : unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
     633         220 :   return getHardwareNumberOfRegisters(Vec);
     634             : }
     635             : 
     636           2 : unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
     637           2 :   return 32;
     638             : }
     639             : 
     640           2 : unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
     641           2 :   return 32;
     642             : }
     643             : 
     644        3795 : unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
     645        3795 :   AMDGPUAS AS = ST->getAMDGPUAS();
     646        7590 :   if (AddrSpace == AS.GLOBAL_ADDRESS ||
     647        3795 :       AddrSpace == AS.CONSTANT_ADDRESS)
     648             :     return 128;
     649         913 :   if (AddrSpace == AS.LOCAL_ADDRESS ||
     650             :       AddrSpace == AS.REGION_ADDRESS)
     651             :     return 64;
     652         423 :   if (AddrSpace == AS.PRIVATE_ADDRESS)
     653             :     return 32;
     654             : 
     655         285 :   if ((AddrSpace == AS.PARAM_D_ADDRESS ||
     656         285 :       AddrSpace == AS.PARAM_I_ADDRESS ||
     657             :       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
     658             :       AddrSpace <= AS.CONSTANT_BUFFER_15)))
     659             :     return 128;
     660           0 :   llvm_unreachable("unhandled address space");
     661             : }
     662             : 
     663         118 : bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
     664             :                                              unsigned Alignment,
     665             :                                              unsigned AddrSpace) const {
     666             :   // We allow vectorization of flat stores, even though we may need to decompose
     667             :   // them later if they may access private memory. We don't have enough context
     668             :   // here, and legalization can handle it.
     669         236 :   if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
     670             :     return false;
     671         113 :   return true;
     672             : }
     673             : 
     674          80 : bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
     675             :                                               unsigned Alignment,
     676             :                                               unsigned AddrSpace) const {
     677          80 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     678             : }
     679             : 
     680          38 : bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
     681             :                                                unsigned Alignment,
     682             :                                                unsigned AddrSpace) const {
     683          38 :   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
     684             : }
     685             : 
     686           0 : unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
     687             :   // Disable unrolling if the loop is not vectorized.
     688             :   // TODO: Enable this again.
     689           0 :   if (VF == 1)
     690             :     return 1;
     691             : 
     692           0 :   return 8;
     693             : }
     694             : 
     695           0 : unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
     696             :   // XXX - For some reason this isn't called for switch.
     697           0 :   switch (Opcode) {
     698             :   case Instruction::Br:
     699             :   case Instruction::Ret:
     700             :     return 10;
     701           0 :   default:
     702           0 :     return BaseT::getCFInstrCost(Opcode);
     703             :   }
     704             : }
     705             : 
     706           0 : int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     707             :                                     unsigned Index) {
     708           0 :   switch (Opcode) {
     709           0 :   case Instruction::ExtractElement:
     710             :   case Instruction::InsertElement: {
     711             :     unsigned EltSize
     712           0 :       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     713           0 :     if (EltSize < 32) {
     714           0 :       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     715             :     }
     716             : 
     717             :     // Extracts are just reads of a subregister, so are free. Inserts are
     718             :     // considered free because we don't want to have any cost for scalarizing
     719             :     // operations, and we don't have to copy into a different register class.
     720             : 
     721             :     // Dynamic indexing isn't free and is best avoided.
     722           0 :     return Index == ~0u ? 2 : 0;
     723             :   }
     724           0 :   default:
     725           0 :     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
     726             :   }
     727             : }
     728             : 
     729           3 : void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     730             :                                           TTI::UnrollingPreferences &UP) {
     731           3 :   CommonTTI.getUnrollingPreferences(L, SE, UP);
     732      299232 : }

Generated by: LCOV version 1.13