LCOV - code coverage report
File: lib/Target/ARM/ARMTargetTransformInfo.cpp
Test: llvm-toolchain.info (2018-10-20 13:21:21)
Line coverage: 141 of 148 (95.3 %); function coverage: 13 of 13 (100.0 %)

//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the whitelist must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the caller's.
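  // Illustrative example (hypothetical whitelist contents): if +crc were
  // whitelisted, a caller built with +crc could inline a callee built
  // without it, since the callee's whitelisted bits are a subset of the
  // caller's; any difference in a non-whitelisted feature still blocks
  // inlining via the exact-match check above.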
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
                     (CalleeBits & InlineFeatureWhitelist);
  return MatchExact && MatchSubset;
}

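// Estimate the cost, in instructions, of materializing the immediate Imm.
// For example: 255 costs 1 in ARM, Thumb-2, and Thumb-1 mode alike, while a
// value with 64 or more significant bits gets a conservative cost of 4.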
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
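  // For example, 'udiv i32 %x, 10' is typically lowered by ISel to a
  // multiply-by-magic-constant plus shift sequence, so the divisor itself
  // never needs to be materialized; hence the cost of 0 here.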
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  if (Opcode == Instruction::And)
    // Conversion to BIC is free, and means we can use ~Imm instead.
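    // For example, 'and r0, r0, #0xfffffffe' can be emitted as
    // 'bic r0, r0, #1', so the cheaper of Imm and ~Imm is what matters.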
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
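    // For example, 'add r0, r0, #-1' becomes 'sub r0, r0, #1'.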
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  return getIntImmCost(Imm, Ty);
}

int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Single to/from double precision conversions.
  static const CostTblEntry NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,   MVT::v2f64, 2 },
    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
  };

  if (Src->isVectorTy() && ST->hasNEON() &&
      (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if ((Opcode == Instruction::InsertElement ||
       Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
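    // For example, an integer lane extract such as 'vmov.32 r0, d0[0]' moves
    // data from the NEON to the core register file, which stalls on some
    // cores while the two pipelines synchronize.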
    if (ValTy->getVectorElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
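  // For example, 'select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b' becomes a
  // single 'vbsl q0, q1, q2' once the condition has been widened to a
  // full-width mask.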
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
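  // For example (illustrative), a non-constant stride such as A[i * n] needs
  // explicit per-iteration address arithmetic, whereas consecutive A[i]
  // accesses can fold into post-incremented addressing like
  // 'vld1.32 {d0}, [r0]!'.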
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (Ty->isVectorTy() && SE &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // We only handle costs of reverse and select shuffles for now.
  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  if (Kind == TTI::SK_Reverse) {
    static const CostTblEntry NEONShuffleTbl[] = {
        // A reverse shuffle costs one instruction if we are shuffling within
        // a double word (vrev) or two if we shuffle a quad word (vrev, vext).
        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},

        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

    if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
                                            LT.second))
      return LT.first * Entry->Cost;

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }
  if (Kind == TTI::SK_Select) {
    static const CostTblEntry NEONSelShuffleTbl[] = {
        // Select shuffle cost table for ARM. Cost is the number of instructions
        // required to create the shuffled vector.

        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                            ISD::VECTOR_SHUFFLE, LT.second))
      return LT.first * Entry->Cost;
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

int ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  const unsigned FunctionCallDivCost = 20;
  const unsigned ReciprocalDivCost = 10;
  static const CostTblEntry CostTbl[] = {
    // Division.
    // These costs are somewhat arbitrary. Choose a cost of 20 to indicate
    // that vectorizing division (added function call) is going to be very
    // expensive.
    // Double register types.
    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
    // Quad register types.
    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
    // Multiplication.
  };

  if (ST->hasNEON())
    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                           Opd1PropInfo, Opd2PropInfo);

  // This is somewhat of a hack. The problem that we are facing is that SROA
  // creates a sequence of shift, and, or instructions to construct values.
  // These sequences are recognized by the ISel and have zero-cost. Not so for
  // the vectorized code. Because we have support for v2i64 but not i64 those
  // sequences look particularly beneficial to vectorize.
  // To work around this we increase the cost of v2i64 operations to make them
  // seem less beneficial.
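  // For example, a scalar i64 built as (a << 32) | b is folded away by ISel,
  // but the equivalent shift/or on <2 x i64> are real NEON instructions, so
  // without this bump the vector form would look too attractive.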
  if (LT.second == MVT::v2i64 &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
    Cost += 4;

  return Cost;
}

int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst1/vld1 vs. 1 uop for vldr/vstr.
    return LT.first * 4;
  }
  return LT.first;
}

int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace,
                                           bool IsMasked) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN don't support vector types with i64/f64 elements.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !IsMasked) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
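    // For example (illustrative), a factor-2 interleave of <8 x i16> has the
    // subvector type <4 x i16>, a legal 64-bit type matched by a single
    // vld2.16/vst2.16, so the cost below is Factor * 1 = 2.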
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, IsMasked);
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  // Currently, only enable these preferences for M-class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->optForSize())
    return;

  // Only enable on Thumb-2 targets.
  if (!ST->isThumb2())
    return;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Allow at most one exit other than the latch. This acts as an early exit
  // and mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  unsigned Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        ImmutableCallSite CS(&I);
        if (const Function *F = CS.getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }
      SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                            I.value_op_end());
      Cost += getUserCost(&I, Operands);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = 4;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

Generated by: LCOV version 1.13