Bug Summary

File: llvm/include/llvm/CodeGen/BasicTTIImpl.h
Warning: line 1076, column 11
Called C++ object pointer is null
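
In short: the path below reaches ARMTTIImpl::getCmpSelInstrCost with a null CondTy (and a null I), none of the target-specific branches handles that case, and CondTy is forwarded as the third argument to BaseT::getCmpSelInstrCost, where the member call at BasicTTIImpl.h line 1076 is made on the null pointer. A reduced, hypothetical sketch of this defect shape, with all LLVM-specific types stripped out (names are illustrative, not LLVM's):

struct Base {
  // Stand-in for BasicTTIImplBase::getCmpSelInstrCost: uses CondTy unconditionally.
  int getCmpSelCost(int *CondTy) { return *CondTy; }
};

struct Derived : Base {
  // Stand-in for ARMTTIImpl::getCmpSelInstrCost: CondTy may legitimately be null.
  int getCmpSelCost(int *CondTy /* may be nullptr */) {
    // ...target-specific special cases, none of which fires on this path...
    return Base::getCmpSelCost(CondTy); // null can reach the dereference here
  }
};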

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include <algorithm>
37#include <cassert>
38#include <cstdint>
39#include <utility>
40
41using namespace llvm;
42
43#define DEBUG_TYPE "armtti"
44
45static cl::opt<bool> EnableMaskedLoadStores(
46 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47 cl::desc("Enable the generation of masked loads and stores"));
48
49static cl::opt<bool> DisableLowOverheadLoops(
50 "disable-arm-loloops", cl::Hidden, cl::init(false),
51 cl::desc("Disable the generation of low-overhead loops"));
52
53static cl::opt<bool>
54 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55 cl::desc("Enable the generation of WLS loops"));
56
57extern cl::opt<TailPredication::Mode> EnableTailPredication;
58
59extern cl::opt<bool> EnableMaskedGatherScatters;
60
61extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62
63/// Convert a vector load intrinsic into a simple llvm load instruction.
64/// This is beneficial when the underlying object being addressed comes
65/// from a constant, since we get constant-folding for free.
66static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67 InstCombiner::BuilderTy &Builder) {
68 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69
70 if (!IntrAlign)
71 return nullptr;
72
73 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74 ? MemAlign
75 : IntrAlign->getLimitedValue();
76
77 if (!isPowerOf2_32(Alignment))
78 return nullptr;
79
80 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81 PointerType::get(II.getType(), 0));
82 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83}
84
85bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86 const Function *Callee) const {
87 const TargetMachine &TM = getTLI()->getTargetMachine();
88 const FeatureBitset &CallerBits =
89 TM.getSubtargetImpl(*Caller)->getFeatureBits();
90 const FeatureBitset &CalleeBits =
91 TM.getSubtargetImpl(*Callee)->getFeatureBits();
92
93 // To inline a callee, all features not in the allowed list must match exactly.
94 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95 (CalleeBits & ~InlineFeaturesAllowed);
96 // For features in the allowed list, the callee's features must be a subset of
97 // the caller's.
98 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99 (CalleeBits & InlineFeaturesAllowed);
100 return MatchExact && MatchSubset;
101}
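
Read as a pure bitset exercise, the two checks above say: outside InlineFeaturesAllowed the caller and callee must agree exactly, and inside it the callee may only rely on features the caller also has. A minimal standalone sketch of the same logic, using a made-up 8-bit feature set instead of ARM's real FeatureBitset:

#include <bitset>

using FeatureBits = std::bitset<8>; // stand-in for llvm::FeatureBitset

bool areInlineCompatibleSketch(FeatureBits CallerBits, FeatureBits CalleeBits,
                               FeatureBits InlineFeaturesAllowed) {
  // Features outside the allowed list must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // Features inside the allowed list: the callee must use a subset of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}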
102
103TTI::AddressingModeKind
104ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 if (ST->hasMVEIntegerOps())
107 return TTI::AMK_PostIndexed;
108
109 if (L->getHeader()->getParent()->hasOptSize())
110 return TTI::AMK_None;
111
112 if (ST->isMClass() && ST->isThumb2() &&
113 L->getNumBlocks() == 1)
114 return TTI::AMK_PreIndexed;
115
116 return TTI::AMK_None;
117}
118
119Optional<Instruction *>
120ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121 using namespace PatternMatch;
122 Intrinsic::ID IID = II.getIntrinsicID();
123 switch (IID) {
124 default:
125 break;
126 case Intrinsic::arm_neon_vld1: {
127 Align MemAlign =
128 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129 &IC.getAssumptionCache(), &IC.getDominatorTree());
130 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131 return IC.replaceInstUsesWith(II, V);
132 }
133 break;
134 }
135
136 case Intrinsic::arm_neon_vld2:
137 case Intrinsic::arm_neon_vld3:
138 case Intrinsic::arm_neon_vld4:
139 case Intrinsic::arm_neon_vld2lane:
140 case Intrinsic::arm_neon_vld3lane:
141 case Intrinsic::arm_neon_vld4lane:
142 case Intrinsic::arm_neon_vst1:
143 case Intrinsic::arm_neon_vst2:
144 case Intrinsic::arm_neon_vst3:
145 case Intrinsic::arm_neon_vst4:
146 case Intrinsic::arm_neon_vst2lane:
147 case Intrinsic::arm_neon_vst3lane:
148 case Intrinsic::arm_neon_vst4lane: {
149 Align MemAlign =
150 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151 &IC.getAssumptionCache(), &IC.getDominatorTree());
152 unsigned AlignArg = II.getNumArgOperands() - 1;
153 Value *AlignArgOp = II.getArgOperand(AlignArg);
154 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155 if (Align && *Align < MemAlign) {
156 return IC.replaceOperand(
157 II, AlignArg,
158 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159 false));
160 }
161 break;
162 }
163
164 case Intrinsic::arm_mve_pred_i2v: {
165 Value *Arg = II.getArgOperand(0);
166 Value *ArgArg;
167 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168 PatternMatch::m_Value(ArgArg))) &&
169 II.getType() == ArgArg->getType()) {
170 return IC.replaceInstUsesWith(II, ArgArg);
171 }
172 Constant *XorMask;
173 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174 PatternMatch::m_Value(ArgArg)),
175 PatternMatch::m_Constant(XorMask))) &&
176 II.getType() == ArgArg->getType()) {
177 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178 if (CI->getValue().trunc(16).isAllOnesValue()) {
179 auto TrueVector = IC.Builder.CreateVectorSplat(
180 cast<FixedVectorType>(II.getType())->getNumElements(),
181 IC.Builder.getTrue());
182 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183 }
184 }
185 }
186 KnownBits ScalarKnown(32);
187 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188 ScalarKnown, 0)) {
189 return &II;
190 }
191 break;
192 }
193 case Intrinsic::arm_mve_pred_v2i: {
194 Value *Arg = II.getArgOperand(0);
195 Value *ArgArg;
196 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197 PatternMatch::m_Value(ArgArg)))) {
198 return IC.replaceInstUsesWith(II, ArgArg);
199 }
200 if (!II.getMetadata(LLVMContext::MD_range)) {
201 Type *IntTy32 = Type::getInt32Ty(II.getContext());
202 Metadata *M[] = {
203 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206 return &II;
207 }
208 break;
209 }
210 case Intrinsic::arm_mve_vadc:
211 case Intrinsic::arm_mve_vadc_predicated: {
212 unsigned CarryOp =
213 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215        "Bad type for intrinsic!");
216
217 KnownBits CarryKnown(32);
218 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219 CarryKnown)) {
220 return &II;
221 }
222 break;
223 }
224 case Intrinsic::arm_mve_vmldava: {
225 Instruction *I = cast<Instruction>(&II);
226 if (I->hasOneUse()) {
227 auto *User = cast<Instruction>(*I->user_begin());
228 Value *OpZ;
229 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230 match(I->getOperand(3), m_Zero())) {
231 Value *OpX = I->getOperand(4);
232 Value *OpY = I->getOperand(5);
233 Type *OpTy = OpX->getType();
234
235 IC.Builder.SetInsertPoint(User);
236 Value *V =
237 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238 {I->getOperand(0), I->getOperand(1),
239 I->getOperand(2), OpZ, OpX, OpY});
240
241 IC.replaceInstUsesWith(*User, V);
242 return IC.eraseInstFromFunction(*User);
243 }
244 }
245 return None;
246 }
247 }
248 return None;
249}
250
251InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252 TTI::TargetCostKind CostKind) {
253 assert(Ty->isIntegerTy());
254
255 unsigned Bits = Ty->getPrimitiveSizeInBits();
256 if (Bits == 0 || Imm.getActiveBits() >= 64)
257 return 4;
258
259 int64_t SImmVal = Imm.getSExtValue();
260 uint64_t ZImmVal = Imm.getZExtValue();
261 if (!ST->isThumb()) {
262 if ((SImmVal >= 0 && SImmVal < 65536) ||
263 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265 return 1;
266 return ST->hasV6T2Ops() ? 2 : 3;
267 }
268 if (ST->isThumb2()) {
269 if ((SImmVal >= 0 && SImmVal < 65536) ||
270 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272 return 1;
273 return ST->hasV6T2Ops() ? 2 : 3;
274 }
275 // Thumb1, any i8 imm cost 1.
276 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277 return 1;
278 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279 return 2;
280 // Load from constantpool.
281 return 3;
282}
283
284// Constants smaller than 256 fit in the immediate field of
285// Thumb1 instructions so we return a zero cost and 1 otherwise.
286InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287 const APInt &Imm, Type *Ty) {
288 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289 return 0;
290
291 return 1;
292}
293
294// Checks whether Inst is part of a min(max()) or max(min()) pattern
295// that will match to an SSAT instruction
296static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297 Value *LHS, *RHS;
298 ConstantInt *C;
299 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300
301 if (InstSPF == SPF_SMAX &&
302 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303 C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304
305 auto isSSatMin = [&](Value *MinInst) {
306 if (isa<SelectInst>(MinInst)) {
307 Value *MinLHS, *MinRHS;
308 ConstantInt *MinC;
309 SelectPatternFlavor MinSPF =
310 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311 if (MinSPF == SPF_SMIN &&
312 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313 MinC->getValue() == ((-Imm) - 1))
314 return true;
315 }
316 return false;
317 };
318
319 if (isSSatMin(Inst->getOperand(1)) ||
320 (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321 isSSatMin(*(++Inst->user_begin())))))
322 return true;
323 }
324 return false;
325}
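
For reference, a standalone example of the clamp this predicate is meant to recognise (constants chosen for illustration): with Imm = -128, Imm is negative, -Imm = 128 is a power of two, and the inner bound is (-Imm) - 1 = 127, i.e. the signed i8 range that a single SSAT #8 produces.

#include <algorithm>
#include <cstdint>

// Hypothetical source-level shape of the pattern: an smax against -128 wrapping
// an smin against 127. In compare+select form this is the min(max()) pair that
// isSSATMinMaxPattern inspects, letting the backend form a saturating SSAT.
int32_t clampToSignedByte(int32_t X) {
  return std::min(std::max(X, -128), 127);
}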
326
327InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328 const APInt &Imm, Type *Ty,
329 TTI::TargetCostKind CostKind,
330 Instruction *Inst) {
331 // Division by a constant can be turned into multiplication, but only if we
332 // know it's constant. So it's not so much that the immediate is cheap (it's
333 // not), but that the alternative is worse.
334 // FIXME: this is probably unneeded with GlobalISel.
335 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337 Idx == 1)
338 return 0;
339
340 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
341 // splitting any large offsets.
342 if (Opcode == Instruction::GetElementPtr && Idx != 0)
343 return 0;
344
345 if (Opcode == Instruction::And) {
346 // UXTB/UXTH
347 if (Imm == 255 || Imm == 65535)
348 return 0;
349 // Conversion to BIC is free, and means we can use ~Imm instead.
350 return std::min(getIntImmCost(Imm, Ty, CostKind),
351 getIntImmCost(~Imm, Ty, CostKind));
352 }
353
354 if (Opcode == Instruction::Add)
355 // Conversion to SUB is free, and means we can use -Imm instead.
356 return std::min(getIntImmCost(Imm, Ty, CostKind),
357 getIntImmCost(-Imm, Ty, CostKind));
358
359 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
360 Ty->getIntegerBitWidth() == 32) {
361 int64_t NegImm = -Imm.getSExtValue();
362 if (ST->isThumb2() && NegImm < 1<<12)
363 // icmp X, #-C -> cmn X, #C
364 return 0;
365 if (ST->isThumb() && NegImm < 1<<8)
366 // icmp X, #-C -> adds X, #C
367 return 0;
368 }
369
370 // xor a, -1 can always be folded to MVN
371 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
372 return 0;
373
374 // Ensures negative constant of min(max()) or max(min()) patterns that
375 // match to SSAT instructions don't get hoisted
376 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
377 Ty->getIntegerBitWidth() <= 32) {
378 if (isSSATMinMaxPattern(Inst, Imm) ||
379 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
380 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
381 return 0;
382 }
383
384 return getIntImmCost(Imm, Ty, CostKind);
385}
386
387InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
388 TTI::TargetCostKind CostKind,
389 const Instruction *I) {
390 if (CostKind == TTI::TCK_RecipThroughput &&
391 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
392 // FIXME: The vectorizer is highly sensitive to the cost of these
393 // instructions, which suggests that it may be using the costs incorrectly.
394 // But, for now, just make them free to avoid performance regressions for
395 // vector targets.
396 return 0;
397 }
398 return BaseT::getCFInstrCost(Opcode, CostKind, I);
399}
400
401InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
402 Type *Src,
403 TTI::CastContextHint CCH,
404 TTI::TargetCostKind CostKind,
405 const Instruction *I) {
406 int ISD = TLI->InstructionOpcodeToISD(Opcode);
407 assert(ISD && "Invalid opcode");
408
409 // TODO: Allow non-throughput costs that aren't binary.
410 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
411 if (CostKind != TTI::TCK_RecipThroughput)
412 return Cost == 0 ? 0 : 1;
413 return Cost;
414 };
415 auto IsLegalFPType = [this](EVT VT) {
416 EVT EltVT = VT.getScalarType();
417 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
418 (EltVT == MVT::f64 && ST->hasFP64()) ||
419 (EltVT == MVT::f16 && ST->hasFullFP16());
420 };
421
422 EVT SrcTy = TLI->getValueType(DL, Src);
423 EVT DstTy = TLI->getValueType(DL, Dst);
424
425 if (!SrcTy.isSimple() || !DstTy.isSimple())
426 return AdjustCost(
427 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
428
429 // Extending masked load/Truncating masked stores is expensive because we
430 // currently don't split them. This means that we'll likely end up
431 // loading/storing each element individually (hence the high cost).
432 if ((ST->hasMVEIntegerOps() &&
433 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
434 Opcode == Instruction::SExt)) ||
435 (ST->hasMVEFloatOps() &&
436 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
437 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
438 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
439 return 2 * DstTy.getVectorNumElements() *
440 ST->getMVEVectorCostFactor(CostKind);
441
442 // The extend of other kinds of load is free
443 if (CCH == TTI::CastContextHint::Normal ||
444 CCH == TTI::CastContextHint::Masked) {
445 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
446 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
447 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
448 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
449 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
450 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
451 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
452 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
453 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
454 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
455 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
456 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
457 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
458 };
459 if (const auto *Entry = ConvertCostTableLookup(
460 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
461 return AdjustCost(Entry->Cost);
462
463 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
464 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
465 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
466 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
467 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
468 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
469 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
470 // The following extend from a legal type to an illegal type, so we need to
471 // split the load. This introduces an extra load operation, but the
472 // extend is still "free".
473 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
474 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
475 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
476 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
477 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
478 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
479 };
480 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
481 if (const auto *Entry =
482 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
483 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
484 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
485 }
486
487 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
488 // FPExtends are similar but also require the VCVT instructions.
489 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
490 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
491 };
492 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
493 if (const auto *Entry =
494 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
495 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
496 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
497 }
498
499 // The truncate of a store is free. This is the mirror of extends above.
500 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
501 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
502 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
503 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
504 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
505 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
506 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
507 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
508 };
509 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
510 if (const auto *Entry =
511 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
512 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
513 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
514 }
515
516 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
517 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
518 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
519 };
520 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
521 if (const auto *Entry =
522 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
523 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
524 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
525 }
526 }
527
528 // NEON vector operations that can extend their inputs.
529 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
530 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
531 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
532 // vaddl
533 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
534 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
535 // vsubl
536 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
537 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
538 // vmull
539 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
540 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
541 // vshll
542 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
543 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
544 };
545
546 auto *User = cast<Instruction>(*I->user_begin());
547 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
548 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
549 DstTy.getSimpleVT(),
550 SrcTy.getSimpleVT())) {
551 return AdjustCost(Entry->Cost);
552 }
553 }
554
555 // Single to/from double precision conversions.
556 if (Src->isVectorTy() && ST->hasNEON() &&
557 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
558 DstTy.getScalarType() == MVT::f32) ||
559 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
560 DstTy.getScalarType() == MVT::f64))) {
561 static const CostTblEntry NEONFltDblTbl[] = {
562 // Vector fptrunc/fpext conversions.
563 {ISD::FP_ROUND, MVT::v2f64, 2},
564 {ISD::FP_EXTEND, MVT::v2f32, 2},
565 {ISD::FP_EXTEND, MVT::v4f32, 4}};
566
567 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
568 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
569 return AdjustCost(LT.first * Entry->Cost);
570 }
571
572 // Some arithmetic, load and store operations have specific instructions
573 // to cast up/down their types automatically at no extra cost.
574 // TODO: Get these tables to know at least what the related operations are.
575 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
576 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
577 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
578 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
579 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
580 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
581 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
582
583 // The number of vmovl instructions for the extension.
584 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
585 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
586 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
587 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
588 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
589 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
590 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
591 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
592 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
594 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
595 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
596 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
597 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
598 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
599 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
600 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
601 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
602
603 // Operations that we legalize using splitting.
604 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
605 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
606
607 // Vector float <-> i32 conversions.
608 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
609 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
610
611 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
612 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
613 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
614 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
615 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
616 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
617 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
618 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
619 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
620 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
621 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
622 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
623 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
624 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
625 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
626 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
627 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
628 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
629 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
630 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
631
632 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
633 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
634 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
635 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
636 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
637 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
638
639 // Vector double <-> i32 conversions.
640 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
641 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
642
643 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
644 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
645 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
646 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
647 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
648 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
649
650 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
651 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
652 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
653 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
654 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
655 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
656 };
657
658 if (SrcTy.isVector() && ST->hasNEON()) {
659 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
660 DstTy.getSimpleVT(),
661 SrcTy.getSimpleVT()))
662 return AdjustCost(Entry->Cost);
663 }
664
665 // Scalar float to integer conversions.
666 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
667 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
668 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
669 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
670 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
671 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
672 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
673 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
674 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
675 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
676 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
677 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
678 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
679 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
680 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
681 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
682 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
683 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
684 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
685 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
686 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
687 };
688 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
689 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
690 DstTy.getSimpleVT(),
691 SrcTy.getSimpleVT()))
692 return AdjustCost(Entry->Cost);
693 }
694
695 // Scalar integer to float conversions.
696 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
697 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
698 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
699 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
700 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
701 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
702 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
703 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
704 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
705 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
706 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
707 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
708 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
709 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
710 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
711 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
712 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
713 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
715 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
716 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
717 };
718
719 if (SrcTy.isInteger() && ST->hasNEON()) {
720 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
721 ISD, DstTy.getSimpleVT(),
722 SrcTy.getSimpleVT()))
723 return AdjustCost(Entry->Cost);
724 }
725
726 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
727 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
728 // are linearised so take more.
729 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
730 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
731 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
732 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
733 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
734 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
735 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
736 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
737 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
738 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
739 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
740 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
741 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
742 };
743
744 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
745 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
746 ISD, DstTy.getSimpleVT(),
747 SrcTy.getSimpleVT()))
748 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
749 }
750
751 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
752 // As a general rule, fp converts that were not matched above are scalarized
753 // and cost 1 vcvt for each lane, so long as the instruction is available.
754 // If not it will become a series of function calls.
755 const InstructionCost CallCost =
756 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
757 int Lanes = 1;
758 if (SrcTy.isFixedLengthVector())
759 Lanes = SrcTy.getVectorNumElements();
760
761 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
762 return Lanes;
763 else
764 return Lanes * CallCost;
765 }
766
767 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
768 SrcTy.isFixedLengthVector()) {
769 // Treat a truncate with a larger than legal source (128 bits for MVE) as
770 // expensive, 2 instructions per lane.
771 if ((SrcTy.getScalarType() == MVT::i8 ||
772 SrcTy.getScalarType() == MVT::i16 ||
773 SrcTy.getScalarType() == MVT::i32) &&
774 SrcTy.getSizeInBits() > 128 &&
775 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
776 return SrcTy.getVectorNumElements() * 2;
777 }
778
779 // Scalar integer conversion costs.
780 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
781 // i16 -> i64 requires two dependent operations.
782 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
783
784 // Truncates on i64 are assumed to be free.
785 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
786 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
787 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
788 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
789 };
790
791 if (SrcTy.isInteger()) {
792 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
793 DstTy.getSimpleVT(),
794 SrcTy.getSimpleVT()))
795 return AdjustCost(Entry->Cost);
796 }
797
798 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
799 ? ST->getMVEVectorCostFactor(CostKind)
800 : 1;
801 return AdjustCost(
802 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
803}
804
805InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
806 unsigned Index) {
807 // Penalize inserting into a D-subregister. We end up with a three times
808 // lower estimated throughput on swift.
809 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
810 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
811 return 3;
812
813 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
814 Opcode == Instruction::ExtractElement)) {
815 // Cross-class copies are expensive on many microarchitectures,
816 // so assume they are expensive by default.
817 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
818 return 3;
819
820 // Even if it's not a cross class copy, this likely leads to mixing
821 // of NEON and VFP code and should be therefore penalized.
822 if (ValTy->isVectorTy() &&
823 ValTy->getScalarSizeInBits() <= 32)
824 return std::max<InstructionCost>(
825 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
826 }
827
828 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
829 Opcode == Instruction::ExtractElement)) {
830 // Integer cross-lane moves are more expensive than float, which can
831 // sometimes just be vmovs. Integers involve being passed to GPR registers,
832 // causing more of a delay.
833 std::pair<InstructionCost, MVT> LT =
834 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
835 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
836 }
837
838 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
839}
840
841InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
842 Type *CondTy,
843 CmpInst::Predicate VecPred,
844 TTI::TargetCostKind CostKind,
845 const Instruction *I) {
846 int ISD = TLI->InstructionOpcodeToISD(Opcode);
847
848 // Thumb scalar code size cost for select.
849 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
40
Assuming 'CostKind' is not equal to TCK_CodeSize
850 ST->isThumb() && !ValTy->isVectorTy()) {
851 // Assume expensive structs.
852 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
853 return TTI::TCC_Expensive;
854
855 // Select costs can vary because they:
856 // - may require one or more conditional mov (including an IT),
857 // - can't operate directly on immediates,
858 // - require live flags, which we can't copy around easily.
859 InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
860
861 // Possible IT instruction for Thumb2, or more for Thumb1.
862 ++Cost;
863
864 // i1 values may need rematerialising by using mov immediates and/or
865 // flag setting instructions.
866 if (ValTy->isIntegerTy(1))
867 ++Cost;
868
869 return Cost;
870 }
871
872 // If this is a vector min/max/abs, use the cost of that intrinsic directly
873 // instead. Hopefully when min/max intrinsics are more prevalent this code
874 // will not be needed.
875 const Instruction *Sel = I;
876 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
40.1
'Opcode' is equal to ICmp
40.2
'Sel' is null
877 Sel->hasOneUse())
878 Sel = cast<Instruction>(Sel->user_back());
879 if (Sel && ValTy->isVectorTy() &&
40.3
'Sel' is null
880 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
881 const Value *LHS, *RHS;
882 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
883 unsigned IID = 0;
884 switch (SPF) {
885 case SPF_ABS:
886 IID = Intrinsic::abs;
887 break;
888 case SPF_SMIN:
889 IID = Intrinsic::smin;
890 break;
891 case SPF_SMAX:
892 IID = Intrinsic::smax;
893 break;
894 case SPF_UMIN:
895 IID = Intrinsic::umin;
896 break;
897 case SPF_UMAX:
898 IID = Intrinsic::umax;
899 break;
900 case SPF_FMINNUM:
901 IID = Intrinsic::minnum;
902 break;
903 case SPF_FMAXNUM:
904 IID = Intrinsic::maxnum;
905 break;
906 default:
907 break;
908 }
909 if (IID) {
910 // The ICmp is free, the select gets the cost of the min/max/etc
911 if (Sel != I)
912 return 0;
913 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
914 return getIntrinsicInstrCost(CostAttrs, CostKind);
915 }
916 }
917
918 // On NEON a vector select gets lowered to vbsl.
919 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
41
Assuming the condition is false
920 // Lowering of some vector selects is currently far from perfect.
921 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
922 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
923 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
924 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
925 };
926
927 EVT SelCondTy = TLI->getValueType(DL, CondTy);
928 EVT SelValTy = TLI->getValueType(DL, ValTy);
929 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
930 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
931 SelCondTy.getSimpleVT(),
932 SelValTy.getSimpleVT()))
933 return Entry->Cost;
934 }
935
936 std::pair<InstructionCost, MVT> LT =
937 TLI->getTypeLegalizationCost(DL, ValTy);
938 return LT.first;
939 }
940
941 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
42
Assuming the condition is true
43
Calling 'Type::isVectorTy'
47
Returning from 'Type::isVectorTy'
50
Taking true branch
942 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
47.1
'Opcode' is equal to ICmp
943 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
48
'ValTy' is a 'FixedVectorType'
49
Assuming the condition is true
944 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
51
'ValTy' is a 'FixedVectorType'
945 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
52
Assuming pointer value is null
53
Assuming null pointer is passed into cast
946 if (!VecCondTy)
53.1
'VecCondTy' is null
54
Taking true branch
947 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
55
The object is a 'FixedVectorType'
948
949 // If we don't have mve.fp any fp operations will need to be scalarized.
950 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
55.1
'Opcode' is not equal to FCmp
951 // One scalarization insert, one scalarization extract and the cost of the
952 // fcmps.
953 return BaseT::getScalarizationOverhead(VecValTy, false, true) +
954 BaseT::getScalarizationOverhead(VecCondTy, true, false) +
955 VecValTy->getNumElements() *
956 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
957 VecCondTy->getScalarType(), VecPred, CostKind,
958 I);
959 }
960
961 std::pair<InstructionCost, MVT> LT =
962 TLI->getTypeLegalizationCost(DL, ValTy);
963 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
964 // There are two types - the input that specifies the type of the compare
965 // and the output vXi1 type. Because we don't know how the output will be
966 // split, we may need an expensive shuffle to get two in sync. This has the
967 // effect of making larger than legal compares (v8i32 for example)
968 // expensive.
969 if (LT.second.getVectorNumElements() > 2) {
56
Assuming the condition is false
57
Taking false branch
970 if (LT.first > 1)
971 return LT.first * BaseCost +
972 BaseT::getScalarizationOverhead(VecCondTy, true, false);
973 return BaseCost;
974 }
975 }
976
977 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
978 // for "multiple beats" potentially needed by MVE instructions.
979 int BaseCost = 1;
980 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
58
Taking false branch
981 BaseCost = ST->getMVEVectorCostFactor(CostKind);
982
983 return BaseCost *
984 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
59
Passing null pointer value via 3rd parameter 'CondTy'
60
Calling 'BasicTTIImplBase::getCmpSelInstrCost'
985}
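
One way to keep a null CondTy from reaching the base implementation, shown here only as a hedged sketch and not necessarily the actual upstream fix, would be to synthesize the compare result type before the final fallback, mirroring what the MVE branch at line 947 already does for VecCondTy:

  // Sketch only: make CondTy non-null before handing it to the base class,
  // which the analyzer shows dereferencing it at BasicTTIImpl.h line 1076.
  if (!CondTy)
    CondTy = CmpInst::makeCmpResultType(ValTy);
  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);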
986
987InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
988 ScalarEvolution *SE,
989 const SCEV *Ptr) {
990 // Address computations in vectorized code with non-consecutive addresses will
991 // likely result in more instructions compared to scalar code where the
992 // computation can more often be merged into the index mode. The resulting
993 // extra micro-ops can significantly decrease throughput.
994 unsigned NumVectorInstToHideOverhead = 10;
995 int MaxMergeDistance = 64;
996
997 if (ST->hasNEON()) {
998 if (Ty->isVectorTy() && SE &&
999 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1000 return NumVectorInstToHideOverhead;
1001
1002 // In many cases the address computation is not merged into the instruction
1003 // addressing mode.
1004 return 1;
1005 }
1006 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1007}
1008
1009bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1010 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1011 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1012 // optimized, else LSR may block tail-predication.
1013 switch (II->getIntrinsicID()) {
1014 case Intrinsic::arm_mve_vctp8:
1015 case Intrinsic::arm_mve_vctp16:
1016 case Intrinsic::arm_mve_vctp32:
1017 case Intrinsic::arm_mve_vctp64:
1018 return true;
1019 default:
1020 break;
1021 }
1022 }
1023 return false;
1024}
1025
1026bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1027 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1028 return false;
1029
1030 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1031 // Don't support v2i1 yet.
1032 if (VecTy->getNumElements() == 2)
1033 return false;
1034
1035 // We don't support extending fp types.
1036 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1037 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1038 return false;
1039 }
1040
1041 unsigned EltWidth = DataTy->getScalarSizeInBits();
1042 return (EltWidth == 32 && Alignment >= 4) ||
1043 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1044}
1045
1046bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1047 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1048 return false;
1049
1050 // This method is called in 2 places:
1051 // - from the vectorizer with a scalar type, in which case we need to get
1052 // this as good as we can with the limited info we have (and rely on the cost
1053 // model for the rest).
1054 // - from the masked intrinsic lowering pass with the actual vector type.
1055 // For MVE, we have a custom lowering pass that will already have custom
1056 // legalised any gathers that we can to MVE intrinsics, and want to expand all
1057 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1058 // are here, we know we want to expand.
1059 if (isa<VectorType>(Ty))
1060 return false;
1061
1062 unsigned EltWidth = Ty->getScalarSizeInBits();
1063 return ((EltWidth == 32 && Alignment >= 4) ||
1064 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1065}
1066
1067/// Given a memcpy/memset/memmove instruction, return the number of memory
1068/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1069/// call is used.
1070int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1071 MemOp MOp;
1072 unsigned DstAddrSpace = ~0u;
1073 unsigned SrcAddrSpace = ~0u;
1074 const Function *F = I->getParent()->getParent();
1075
1076 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1077 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1078 // If 'size' is not a constant, a library call will be generated.
1079 if (!C)
1080 return -1;
1081
1082 const unsigned Size = C->getValue().getZExtValue();
1083 const Align DstAlign = *MC->getDestAlign();
1084 const Align SrcAlign = *MC->getSourceAlign();
1085
1086 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1087 /*IsVolatile*/ false);
1088 DstAddrSpace = MC->getDestAddressSpace();
1089 SrcAddrSpace = MC->getSourceAddressSpace();
1090 }
1091 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1092 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1093 // If 'size' is not a constant, a library call will be generated.
1094 if (!C)
1095 return -1;
1096
1097 const unsigned Size = C->getValue().getZExtValue();
1098 const Align DstAlign = *MS->getDestAlign();
1099
1100 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1101 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1102 DstAddrSpace = MS->getDestAddressSpace();
1103 }
1104 else
1105 llvm_unreachable("Expected a memcpy/move or memset!");
1106
1107 unsigned Limit, Factor = 2;
1108 switch(I->getIntrinsicID()) {
1109 case Intrinsic::memcpy:
1110 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1111 break;
1112 case Intrinsic::memmove:
1113 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1114 break;
1115 case Intrinsic::memset:
1116 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1117 Factor = 1;
1118 break;
1119 default:
1120 llvm_unreachable("Expected a memcpy/move or memset!");
1121 }
1122
1123 // MemOps will be populated with a list of data types that need to be
1124 // loaded and stored. That's why we multiply the number of elements by 2 to
1125 // get the cost for this memcpy.
1126 std::vector<EVT> MemOps;
1127 if (getTLI()->findOptimalMemOpLowering(
1128 MemOps, Limit, MOp, DstAddrSpace,
1129 SrcAddrSpace, F->getAttributes()))
1130 return MemOps.size() * Factor;
1131
1132 // If we can't find an optimal memop lowering, return the default cost
1133 return -1;
1134}
1135
1136InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1137 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1138
1139 // To model the cost of a library call, we assume 1 for the call, and
1140 // 3 for the argument setup.
1141 if (NumOps == -1)
1142 return 4;
1143 return NumOps;
1144}
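
As a worked illustration of the model above (the chunk count is invented; only the MemOps.size() * Factor arithmetic and the library-call cost of 4 come from the code):

// Hedged standalone sketch of the getNumMemOps / getMemcpyCost arithmetic.
int memcpyCostSketch(int NumChunks /* MemOps.size() */, bool LoweredToLibcall) {
  if (LoweredToLibcall)
    return 4;                // 1 for the call plus 3 for argument setup
  const int Factor = 2;      // memcpy/memmove: each chunk is one load + one store
  return NumChunks * Factor; // memset would use Factor = 1
}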
1145
1146InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1147 VectorType *Tp, ArrayRef<int> Mask,
1148 int Index, VectorType *SubTp) {
1149 Kind = improveShuffleKindFromMask(Kind, Mask);
1150 if (ST->hasNEON()) {
1151 if (Kind == TTI::SK_Broadcast) {
1152 static const CostTblEntry NEONDupTbl[] = {
1153 // VDUP handles these cases.
1154 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1155 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1156 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1157 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1158 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1159 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1160
1161 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1162 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1163 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1164 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1165
1166 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1167 if (const auto *Entry =
1168 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1169 return LT.first * Entry->Cost;
1170 }
1171 if (Kind == TTI::SK_Reverse) {
1172 static const CostTblEntry NEONShuffleTbl[] = {
1173 // Reverse shuffle cost one instruction if we are shuffling within a
1174 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1175 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1176 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1177 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1178 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1179 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1180 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1181
1182 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1183 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1184 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1185 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1186
1187 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1188 if (const auto *Entry =
1189 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1190 return LT.first * Entry->Cost;
1191 }
1192 if (Kind == TTI::SK_Select) {
1193 static const CostTblEntry NEONSelShuffleTbl[] = {
1194 // Select shuffle cost table for ARM. Cost is the number of
1195 // instructions
1196 // required to create the shuffled vector.
1197
1198 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1199 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1200 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1201 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1202
1203 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1204 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1205 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1206
1207 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1208
1209 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1210
1211 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1212 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1213 ISD::VECTOR_SHUFFLE, LT.second))
1214 return LT.first * Entry->Cost;
1215 }
1216 }
1217 if (ST->hasMVEIntegerOps()) {
1218 if (Kind == TTI::SK_Broadcast) {
1219 static const CostTblEntry MVEDupTbl[] = {
1220 // VDUP handles these cases.
1221 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1222 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1223 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1224 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1225 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1226
1227 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1228 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1229 LT.second))
1230 return LT.first * Entry->Cost *
1231 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1232 }
1233
1234 if (!Mask.empty()) {
1235 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1236 if (Mask.size() <= LT.second.getVectorNumElements() &&
1237 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1238 isVREVMask(Mask, LT.second, 64)))
1239 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1240 }
1241 }
1242
1243 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1244 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1245 : 1;
1246 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1247}
1248
1249InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1250 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1251 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1252 TTI::OperandValueProperties Opd1PropInfo,
1253 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1254 const Instruction *CxtI) {
1255 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1256 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1257 // Make operations on i1 relatively expensive as this often involves
1258 // combining predicates. AND and XOR should be easier to handle with IT
1259 // blocks.
1260 switch (ISDOpcode) {
1261 default:
1262 break;
1263 case ISD::AND:
1264 case ISD::XOR:
1265 return 2;
1266 case ISD::OR:
1267 return 3;
1268 }
1269 }
1270
1271 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1272
1273 if (ST->hasNEON()) {
1274 const unsigned FunctionCallDivCost = 20;
1275 const unsigned ReciprocalDivCost = 10;
1276 static const CostTblEntry CostTbl[] = {
1277 // Division.
1278 // These costs are somewhat random. Choose a cost of 20 to indicate that
1279 // vectorizing division (added function call) is going to be very expensive.
1280 // Double registers types.
1281 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1282 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1283 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1284 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1285 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1286 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1287 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1288 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1289 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1290 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1291 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1292 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1293 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1294 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1295 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1296 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1297 // Quad register types.
1298 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1299 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1300 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1301 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1302 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1303 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1304 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1305 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1306 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1307 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1308 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1309 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1310 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1311 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1312 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1313 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1314 // Multiplication.
1315 };
1316
1317 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1318 return LT.first * Entry->Cost;
1319
1320 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1321 Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1322
1323 // This is somewhat of a hack. The problem that we are facing is that SROA
1324 // creates a sequence of shift, and, or instructions to construct values.
1325 // These sequences are recognized by the ISel and have zero-cost. Not so for
1326 // the vectorized code. Because we have support for v2i64 but not i64 those
1327 // sequences look particularly beneficial to vectorize.
1328 // To work around this we increase the cost of v2i64 operations to make them
1329 // seem less beneficial.
1330 if (LT.second == MVT::v2i64 &&
1331 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1332 Cost += 4;
1333
1334 return Cost;
1335 }
1336
1337 // If this operation is a shift on arm/thumb2, it might well be folded into
1338 // the following instruction, hence having a cost of 0.
1339 auto LooksLikeAFreeShift = [&]() {
1340 if (ST->isThumb1Only() || Ty->isVectorTy())
1341 return false;
1342
1343 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1344 return false;
1345 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1346 return false;
1347
1348 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1349 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1350 case Instruction::Add:
1351 case Instruction::Sub:
1352 case Instruction::And:
1353 case Instruction::Xor:
1354 case Instruction::Or:
1355 case Instruction::ICmp:
1356 return true;
1357 default:
1358 return false;
1359 }
1360 };
1361 if (LooksLikeAFreeShift())
1362 return 0;
1363
1364 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1365 // for "multiple beats" potentially needed by MVE instructions.
1366 int BaseCost = 1;
1367 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1368 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1369
1370 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1371 // without treating floats as more expensive than scalars or increasing the
1372 // costs for custom operations. The result is also multiplied by the
1373 // MVEVectorCostFactor where appropriate.
1374 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1375 return LT.first * BaseCost;
1376
1377 // Else this is expand, assume that we need to scalarize this op.
1378 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1379 unsigned Num = VTy->getNumElements();
1380 InstructionCost Cost =
1381 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1382 // Return the cost of multiple scalar invocations plus the cost of
1383 // inserting and extracting the values.
1384 SmallVector<Type *> Tys(Args.size(), Ty);
1385 return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1386 }
1387
1388 return BaseCost;
1389}
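
When the operation has to be scalarized, the cost above is the scalarization overhead plus NumElements times the scalar cost. A minimal standalone sketch of that sum, with made-up per-element numbers standing in for the real TTI hooks (ScalarizationModel and its fields are illustrative helpers, not LLVM APIs):

#include <cstdio>

// Hypothetical per-element numbers; the real values come from the TTI hooks.
struct ScalarizationModel {
  unsigned ScalarOpCost;   // cost of one scalar div/rem/etc.
  unsigned InsertCost;     // cost of inserting one result element
  unsigned ExtractCost;    // cost of extracting one operand element
};

// Mirrors "BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost":
// every element of every operand is extracted, the scalar op runs Num times,
// and every result element is inserted back.
unsigned scalarizedVectorOpCost(unsigned NumElts, unsigned NumOperands,
                                const ScalarizationModel &M) {
  unsigned Overhead =
      NumElts * M.InsertCost + NumElts * NumOperands * M.ExtractCost;
  return Overhead + NumElts * M.ScalarOpCost;
}

int main() {
  ScalarizationModel M{/*ScalarOpCost=*/20, /*InsertCost=*/1, /*ExtractCost=*/1};
  // e.g. a <4 x i32> sdiv with two vector operands.
  std::printf("v4i32 sdiv ~ %u\n", scalarizedVectorOpCost(4, 2, M));
  return 0;
}
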
1390
1391InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1392 MaybeAlign Alignment,
1393 unsigned AddressSpace,
1394 TTI::TargetCostKind CostKind,
1395 const Instruction *I) {
1396 // TODO: Handle other cost kinds.
1397 if (CostKind != TTI::TCK_RecipThroughput)
1398 return 1;
1399
1400 // Type legalization can't handle structs
1401 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1402 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1403 CostKind);
1404
1405 if (ST->hasNEON() && Src->isVectorTy() &&
1406 (Alignment && *Alignment != Align(16)) &&
1407 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1408 // Unaligned loads/stores are extremely inefficient.
1409 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1410 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1411 return LT.first * 4;
1412 }
1413
1414 // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1415 // Same for stores.
1416 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1417 ((Opcode == Instruction::Load && I->hasOneUse() &&
1418 isa<FPExtInst>(*I->user_begin())) ||
1419 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1420 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1421 Type *DstTy =
1422 Opcode == Instruction::Load
1423 ? (*I->user_begin())->getType()
1424 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1425 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1426 DstTy->getScalarType()->isFloatTy())
1427 return ST->getMVEVectorCostFactor(CostKind);
1428 }
1429
1430 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1431 ? ST->getMVEVectorCostFactor(CostKind)
1432 : 1;
1433 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1434 CostKind, I);
1435}
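
The final return value above is just the base memory-op cost scaled by the MVE "beats" factor for vector accesses. A small standalone sketch of that scaling, where MVEVectorCostFactor = 2 and HasMVEIntegerOps are assumed example values rather than real subtarget queries:

#include <cstdio>

// Hypothetical stand-ins for the subtarget queries used above.
constexpr unsigned MVEVectorCostFactor = 2; // assumed "beats" per MVE op
constexpr bool HasMVEIntegerOps = true;

// Mirrors the tail of getMemoryOpCost: scalar accesses keep a cost of 1,
// vector accesses are scaled by the MVE factor.
unsigned memoryOpCost(bool SrcIsVector, unsigned BaseMemCost) {
  unsigned Scale = (HasMVEIntegerOps && SrcIsVector) ? MVEVectorCostFactor : 1;
  return Scale * BaseMemCost;
}

int main() {
  std::printf("scalar load: %u\n", memoryOpCost(false, 1));
  std::printf("vector load: %u\n", memoryOpCost(true, 1));
  return 0;
}
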
1436
1437InstructionCost
1438ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1439 unsigned AddressSpace,
1440 TTI::TargetCostKind CostKind) {
1441 if (ST->hasMVEIntegerOps()) {
1442 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1443 return ST->getMVEVectorCostFactor(CostKind);
1444 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1445 return ST->getMVEVectorCostFactor(CostKind);
1446 }
1447 if (!isa<FixedVectorType>(Src))
1448 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1449 CostKind);
1450 // Scalar cost, which is currently very high due to the inefficiency of the
1451 // generated code.
1452 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1453}
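
The masked-memory decision above reduces to: a legal MVE masked load/store costs one factor-scaled operation, anything else pays 8 per element. A standalone sketch with the legality test folded into a boolean parameter (maskedMemoryOpCost is an illustrative helper, not the LLVM API):

#include <cstdio>

// Simplified stand-in for ARMTTIImpl::getMaskedMemoryOpCost. "LegalMVEMasked"
// replaces the isLegalMaskedLoad/Store checks; "MVEFactor" replaces
// ST->getMVEVectorCostFactor(CostKind).
unsigned maskedMemoryOpCost(bool LegalMVEMasked, unsigned NumElements,
                            unsigned MVEFactor) {
  if (LegalMVEMasked)
    return MVEFactor;          // a single predicated vector access
  return NumElements * 8;      // scalarized fallback is deliberately expensive
}

int main() {
  std::printf("legal v4i32 masked load: %u\n", maskedMemoryOpCost(true, 4, 2));
  std::printf("scalarized v4i32:        %u\n", maskedMemoryOpCost(false, 4, 2));
  return 0;
}
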
1454
1455InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1456 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1457 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1458 bool UseMaskForCond, bool UseMaskForGaps) {
1459 assert(Factor >= 2 && "Invalid interleave factor")(static_cast<void> (0));
1460 assert(isa<VectorType>(VecTy) && "Expect a vector type")(static_cast<void> (0));
1461
1462 // vldN/vstN doesn't support vector types of i64/f64 element.
1463 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1464
1465 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1466 !UseMaskForCond && !UseMaskForGaps) {
1467 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1468 auto *SubVecTy =
1469 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1470
1471 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1472 // Accesses having vector types that are a multiple of 128 bits can be
1473 // matched to more than one vldN/vstN instruction.
1474 int BaseCost =
1475 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1476 if (NumElts % Factor == 0 &&
1477 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1478 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1479
1480 // Some smaller than legal interleaved patterns are cheap as we can make
1481 // use of the vmovn or vrev patterns to interleave a standard load. This is
1482 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1483 // promoted differently). The cost of 2 here is then a load and vrev or
1484 // vmovn.
1485 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1486 VecTy->isIntOrIntVectorTy() &&
1487 DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1488 return 2 * BaseCost;
1489 }
1490
1491 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1492 Alignment, AddressSpace, CostKind,
1493 UseMaskForCond, UseMaskForGaps);
1494}
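
On the vldN/vstN path above the result is Factor * BaseCost * number-of-128-bit accesses. A standalone sketch of that arithmetic; numInterleavedAccesses is a rough stand-in for TLI->getNumInterleavedAccesses and assumes the type already passed the legality check:

#include <algorithm>
#include <cstdio>

// Rough stand-in for TLI->getNumInterleavedAccesses: how many 128-bit vldN/vstN
// operations are needed to cover one de-interleaved sub-vector.
unsigned numInterleavedAccesses(unsigned SubVecBits) {
  return std::max(1u, (SubVecBits + 127) / 128);
}

// Mirrors "Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL)".
unsigned interleavedCost(unsigned Factor, unsigned BaseCost,
                         unsigned SubVecBits) {
  return Factor * BaseCost * numInterleavedAccesses(SubVecBits);
}

int main() {
  // A vld2 of <8 x i32>: two <4 x i32> sub-vectors of 128 bits each.
  std::printf("vld2 of v8i32: %u\n", interleavedCost(2, 1, 128));
  return 0;
}
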
1495
1496InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1497 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1498 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1499 using namespace PatternMatch;
1500 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1501 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1502 Alignment, CostKind, I);
1503
1504 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!")(static_cast<void> (0));
1505 auto *VTy = cast<FixedVectorType>(DataTy);
1506
1507 // TODO: Splitting, once we do that.
1508
1509 unsigned NumElems = VTy->getNumElements();
1510 unsigned EltSize = VTy->getScalarSizeInBits();
1511 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1512
1513 // For now, it is assumed that for the MVE gather instructions the loads are
1514 // all effectively serialised. This means the cost is the scalar cost
1515 // multiplied by the number of elements being loaded. This is possibly very
1516 // conservative, but even so we still end up vectorising loops because the
1517 // cost per iteration for many loops is lower than for scalar loops.
1518 InstructionCost VectorCost =
1519 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1520 // The scalarization cost should be a lot higher. We use the number of vector
1521 // elements plus the scalarization overhead.
1522 InstructionCost ScalarCost =
1523 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1524 BaseT::getScalarizationOverhead(VTy, false, true);
1525
1526 if (EltSize < 8 || Alignment < EltSize / 8)
1527 return ScalarCost;
1528
1529 unsigned ExtSize = EltSize;
1530 // Check whether there's a single user that asks for an extended type
1531 if (I != nullptr) {
1532 // Depending on the caller of this function, a gather instruction will
1533 // either have opcode Instruction::Load or be a call to the masked_gather
1534 // intrinsic
1535 if ((I->getOpcode() == Instruction::Load ||
1536 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1537 I->hasOneUse()) {
1538 const User *Us = *I->users().begin();
1539 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1540 // only allow valid type combinations
1541 unsigned TypeSize =
1542 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1543 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1544 (TypeSize == 16 && EltSize == 8)) &&
1545 TypeSize * NumElems == 128) {
1546 ExtSize = TypeSize;
1547 }
1548 }
1549 }
1550 // Check whether the input data needs to be truncated
1551 TruncInst *T;
1552 if ((I->getOpcode() == Instruction::Store ||
1553 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1554 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1555 // Only allow valid type combinations
1556 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1557 if (((EltSize == 16 && TypeSize == 32) ||
1558 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1559 TypeSize * NumElems == 128)
1560 ExtSize = TypeSize;
1561 }
1562 }
1563
1564 if (ExtSize * NumElems != 128 || NumElems < 4)
1565 return ScalarCost;
1566
1567 // Any (aligned) i32 gather will not need to be scalarised.
1568 if (ExtSize == 32)
1569 return VectorCost;
1570 // For smaller types, we need to ensure that the gep's inputs are correctly
1571 // extended from a small enough value. Other sizes (including i64) are
1572 // scalarized for now.
1573 if (ExtSize != 8 && ExtSize != 16)
1574 return ScalarCost;
1575
1576 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1577 Ptr = BC->getOperand(0);
1578 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1579 if (GEP->getNumOperands() != 2)
1580 return ScalarCost;
1581 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1582 // Scale needs to be correct (which is only relevant for i16s).
1583 if (Scale != 1 && Scale * 8 != ExtSize)
1584 return ScalarCost;
1585 // And we need to zext (not sext) the indexes from a small enough type.
1586 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1587 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1588 return VectorCost;
1589 }
1590 return ScalarCost;
1591 }
1592 return ScalarCost;
1593}
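
The tail of the gather/scatter cost above is a small legality predicate: the extended element size times the lane count must be exactly 128 bits, at least four lanes are needed, and only 8/16/32-bit elements avoid scalarization (8- and 16-bit only with suitably zero-extended GEP indices). A sketch of just that predicate (gatherStaysVector is an illustrative helper, not the LLVM API):

#include <cstdio>

// Returns true when an MVE gather/scatter of NumElems elements, each ExtSize
// bits after any extension/truncation, can be emitted directly rather than
// scalarized. Mirrors the "ExtSize * NumElems == 128" checks above.
bool gatherStaysVector(unsigned ExtSize, unsigned NumElems,
                       bool IndexZExtSmallEnough) {
  if (ExtSize * NumElems != 128 || NumElems < 4)
    return false;
  if (ExtSize == 32)
    return true;                 // aligned i32 gathers are always fine
  if (ExtSize != 8 && ExtSize != 16)
    return false;                // i64 etc. are scalarized for now
  return IndexZExtSmallEnough;   // small types need suitably zext'd GEP indices
}

int main() {
  std::printf("v4i32 gather:        %d\n", gatherStaysVector(32, 4, false));
  std::printf("v8i16 gather (zext): %d\n", gatherStaysVector(16, 8, true));
  std::printf("v2i64 gather:        %d\n", gatherStaysVector(64, 2, false));
  return 0;
}
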
1594
1595InstructionCost
1596ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1597 Optional<FastMathFlags> FMF,
1598 TTI::TargetCostKind CostKind) {
1599 if (TTI::requiresOrderedReduction(FMF))
1600 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1601
1602 EVT ValVT = TLI->getValueType(DL, ValTy);
1603 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1604 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1605 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1606
1607 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1608
1609 static const CostTblEntry CostTblAdd[]{
1610 {ISD::ADD, MVT::v16i8, 1},
1611 {ISD::ADD, MVT::v8i16, 1},
1612 {ISD::ADD, MVT::v4i32, 1},
1613 };
1614 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1615 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1616
1617 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1618}
1619
1620InstructionCost
1621ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1622 Type *ResTy, VectorType *ValTy,
1623 TTI::TargetCostKind CostKind) {
1624 EVT ValVT = TLI->getValueType(DL, ValTy);
1625 EVT ResVT = TLI->getValueType(DL, ResTy);
1626
1627 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1628 std::pair<InstructionCost, MVT> LT =
1629 TLI->getTypeLegalizationCost(DL, ValTy);
1630
1631 // The legal cases are:
1632 // VADDV u/s 8/16/32
1633 // VMLAV u/s 8/16/32
1634 // VADDLV u/s 32
1635 // VMLALV u/s 16/32
1636 // Codegen currently cannot always handle larger than legal vectors very
1637 // well, especially for predicated reductions where the mask needs to be
1638 // split, so restrict to 128bit or smaller input types.
1639 unsigned RevVTSize = ResVT.getSizeInBits();
1640 if (ValVT.getSizeInBits() <= 128 &&
1641 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1642 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1643 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1644 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1645 }
1646
1647 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1648 CostKind);
1649}
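
The condition above can be read as a small table keyed on the legalized input type. A standalone sketch with types reduced to element-width/lane-count pairs (extendedAddReductionIsLegal is an illustrative helper, not the LLVM API):

#include <cstdio>

// Mirrors the VADDV/VMLAV/VADDLV/VMLALV legality check: the legalized input
// must be a 128-bit MVE vector and the accumulator must not be wider than the
// instruction can produce.
bool extendedAddReductionIsLegal(unsigned EltBits, unsigned NumElts,
                                 unsigned ResultBits, bool IsMLA) {
  if (EltBits * NumElts > 128)
    return false;
  if (EltBits == 8 && NumElts == 16)
    return ResultBits <= 32;                  // VADDV.u8 / VMLAV.u8
  if (EltBits == 16 && NumElts == 8)
    return ResultBits <= (IsMLA ? 64u : 32u); // VMLALV allows a 64-bit accumulator
  if (EltBits == 32 && NumElts == 4)
    return ResultBits <= 64;                  // VADDLV.u32 / VMLALV.u32
  return false;
}

int main() {
  std::printf("v8i16 -> i64 MLA: %d\n", extendedAddReductionIsLegal(16, 8, 64, true));
  std::printf("v8i16 -> i64 add: %d\n", extendedAddReductionIsLegal(16, 8, 64, false));
  return 0;
}
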
1650
1651InstructionCost
1652ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1653 TTI::TargetCostKind CostKind) {
1654 switch (ICA.getID()) {
1655 case Intrinsic::get_active_lane_mask:
1656 // Currently we make a somewhat optimistic assumption that
1657 // active_lane_masks are always free. In reality it may be freely folded
1658 // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1659 // of add/icmp code. We may need to improve this in the future, but being
1660 // able to detect if it is free or not involves looking at a lot of other
1661 // code. We currently assume that the vectorizer inserted these, and knew
1662 // what it was doing in adding one.
1663 if (ST->hasMVEIntegerOps())
1664 return 0;
1665 break;
1666 case Intrinsic::sadd_sat:
1667 case Intrinsic::ssub_sat:
1668 case Intrinsic::uadd_sat:
1669 case Intrinsic::usub_sat: {
1670 if (!ST->hasMVEIntegerOps())
1671 break;
1672 Type *VT = ICA.getReturnType();
1673
1674 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1675 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1676 LT.second == MVT::v16i8) {
1677 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1678 // need to extend the type, as it uses shr(qadd(shl, shl)).
1679 unsigned Instrs =
1680 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1681 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1682 }
1683 break;
1684 }
1685 case Intrinsic::abs:
1686 case Intrinsic::smin:
1687 case Intrinsic::smax:
1688 case Intrinsic::umin:
1689 case Intrinsic::umax: {
1690 if (!ST->hasMVEIntegerOps())
1691 break;
1692 Type *VT = ICA.getReturnType();
1693
1694 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1695 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1696 LT.second == MVT::v16i8)
1697 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1698 break;
1699 }
1700 case Intrinsic::minnum:
1701 case Intrinsic::maxnum: {
1702 if (!ST->hasMVEFloatOps())
1703 break;
1704 Type *VT = ICA.getReturnType();
1705 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1706 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1707 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1708 break;
1709 }
1710 }
1711
1712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1. 'Default' branch taken. Execution continues on line 1712
2. Calling 'BasicTTIImplBase::getIntrinsicInstrCost'
1713}
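
For the saturating add/sub intrinsics handled above, the instruction count is 1 when the legalized element width matches the original type (a plain VQADD/VQSUB) and 4 when the value must first be shifted into a wider lane (shr(qadd(shl, shl))). A tiny sketch of that choice; the MVE factor of 2 used in the example is an assumption:

#include <cstdio>

// Mirrors the sadd_sat/usub_sat handling: Instrs is 1 for a native-width
// VQADD/VQSUB and 4 when shifts are needed to position a narrower value.
unsigned saturatingAddCost(unsigned LegalEltBits, unsigned OrigEltBits,
                           unsigned LegalizationFactor, unsigned MVEFactor) {
  unsigned Instrs = (LegalEltBits == OrigEltBits) ? 1 : 4;
  return LegalizationFactor * MVEFactor * Instrs;
}

int main() {
  // <8 x i16> sadd.sat maps directly onto VQADD.s16.
  std::printf("v8i16 sadd.sat: %u\n", saturatingAddCost(16, 16, 1, 2));
  // An i8 element promoted to a wider legal lane needs the shl/qadd/shr dance.
  std::printf("promoted i8:    %u\n", saturatingAddCost(16, 8, 1, 2));
  return 0;
}
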
1714
1715bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1716 if (!F->isIntrinsic())
1717 return BaseT::isLoweredToCall(F);
1718
1719 // Assume all Arm-specific intrinsics map to an instruction.
1720 if (F->getName().startswith("llvm.arm"))
1721 return false;
1722
1723 switch (F->getIntrinsicID()) {
1724 default: break;
1725 case Intrinsic::powi:
1726 case Intrinsic::sin:
1727 case Intrinsic::cos:
1728 case Intrinsic::pow:
1729 case Intrinsic::log:
1730 case Intrinsic::log10:
1731 case Intrinsic::log2:
1732 case Intrinsic::exp:
1733 case Intrinsic::exp2:
1734 return true;
1735 case Intrinsic::sqrt:
1736 case Intrinsic::fabs:
1737 case Intrinsic::copysign:
1738 case Intrinsic::floor:
1739 case Intrinsic::ceil:
1740 case Intrinsic::trunc:
1741 case Intrinsic::rint:
1742 case Intrinsic::nearbyint:
1743 case Intrinsic::round:
1744 case Intrinsic::canonicalize:
1745 case Intrinsic::lround:
1746 case Intrinsic::llround:
1747 case Intrinsic::lrint:
1748 case Intrinsic::llrint:
1749 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1750 return true;
1751 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1752 return true;
1753 // Some operations can be handled by vector instructions and assume
1754 // unsupported vectors will be expanded into supported scalar ones.
1755 // TODO Handle scalar operations properly.
1756 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1757 case Intrinsic::masked_store:
1758 case Intrinsic::masked_load:
1759 case Intrinsic::masked_gather:
1760 case Intrinsic::masked_scatter:
1761 return !ST->hasMVEIntegerOps();
1762 case Intrinsic::sadd_with_overflow:
1763 case Intrinsic::uadd_with_overflow:
1764 case Intrinsic::ssub_with_overflow:
1765 case Intrinsic::usub_with_overflow:
1766 case Intrinsic::sadd_sat:
1767 case Intrinsic::uadd_sat:
1768 case Intrinsic::ssub_sat:
1769 case Intrinsic::usub_sat:
1770 return false;
1771 }
1772
1773 return BaseT::isLoweredToCall(F);
1774}
1775
1776bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1777 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1778 EVT VT = TLI->getValueType(DL, I.getType(), true);
1779 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1780 return true;
1781
1782 // Check if an intrinsic will be lowered to a call and assume that any
1783 // other CallInst will generate a bl.
1784 if (auto *Call = dyn_cast<CallInst>(&I)) {
1785 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1786 switch(II->getIntrinsicID()) {
1787 case Intrinsic::memcpy:
1788 case Intrinsic::memset:
1789 case Intrinsic::memmove:
1790 return getNumMemOps(II) == -1;
1791 default:
1792 if (const Function *F = Call->getCalledFunction())
1793 return isLoweredToCall(F);
1794 }
1795 }
1796 return true;
1797 }
1798
1799 // FPv5 provides conversions between integer, double-precision,
1800 // single-precision, and half-precision formats.
1801 switch (I.getOpcode()) {
1802 default:
1803 break;
1804 case Instruction::FPToSI:
1805 case Instruction::FPToUI:
1806 case Instruction::SIToFP:
1807 case Instruction::UIToFP:
1808 case Instruction::FPTrunc:
1809 case Instruction::FPExt:
1810 return !ST->hasFPARMv8Base();
1811 }
1812
1813 // FIXME: Unfortunately the approach of checking the Operation Action does
1814 // not catch all cases of Legalization that use library calls. Our
1815 // Legalization step categorizes some transformations into library calls as
1816 // Custom, Expand or even Legal when doing type legalization. So for now
1817 // we have to special case for instance the SDIV of 64bit integers and the
1818 // use of floating point emulation.
1819 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1820 switch (ISD) {
1821 default:
1822 break;
1823 case ISD::SDIV:
1824 case ISD::UDIV:
1825 case ISD::SREM:
1826 case ISD::UREM:
1827 case ISD::SDIVREM:
1828 case ISD::UDIVREM:
1829 return true;
1830 }
1831 }
1832
1833 // Assume all other non-float operations are supported.
1834 if (!VT.isFloatingPoint())
1835 return false;
1836
1837 // We'll need a library call to handle most floats when using soft.
1838 if (TLI->useSoftFloat()) {
1839 switch (I.getOpcode()) {
1840 default:
1841 return true;
1842 case Instruction::Alloca:
1843 case Instruction::Load:
1844 case Instruction::Store:
1845 case Instruction::Select:
1846 case Instruction::PHI:
1847 return false;
1848 }
1849 }
1850
1851 // We'll need a libcall to perform double precision operations on a single
1852 // precision only FPU.
1853 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1854 return true;
1855
1856 // Likewise for half precision arithmetic.
1857 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1858 return true;
1859
1860 return false;
1861}
1862
1863bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1864 AssumptionCache &AC,
1865 TargetLibraryInfo *LibInfo,
1866 HardwareLoopInfo &HWLoopInfo) {
1867 // Low-overhead branches are only supported in the 'low-overhead branch'
1868 // extension of v8.1-m.
1869 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1870 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n")do { } while (false);
1871 return false;
1872 }
1873
1874 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1875 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n")do { } while (false);
1876 return false;
1877 }
1878
1879 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1880 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1881 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n")do { } while (false);
1882 return false;
1883 }
1884
1885 const SCEV *TripCountSCEV =
1886 SE.getAddExpr(BackedgeTakenCount,
1887 SE.getOne(BackedgeTakenCount->getType()));
1888
1889 // We need to store the trip count in LR, a 32-bit register.
1890 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1891 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n")do { } while (false);
1892 return false;
1893 }
1894
1895 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1896 // point in generating a hardware loop if that's going to happen.
1897
1898 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1899 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1900 switch (Call->getIntrinsicID()) {
1901 default:
1902 break;
1903 case Intrinsic::start_loop_iterations:
1904 case Intrinsic::test_start_loop_iterations:
1905 case Intrinsic::loop_decrement:
1906 case Intrinsic::loop_decrement_reg:
1907 return true;
1908 }
1909 }
1910 return false;
1911 };
1912
1913 // Scan the instructions to see if there are any that we know will turn into a
1914 // call or if this loop is already a low-overhead loop or will become a tail
1915 // predicated loop.
1916 bool IsTailPredLoop = false;
1917 auto ScanLoop = [&](Loop *L) {
1918 for (auto *BB : L->getBlocks()) {
1919 for (auto &I : *BB) {
1920 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1921 isa<InlineAsm>(I)) {
1922 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n")do { } while (false);
1923 return false;
1924 }
1925 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1926 IsTailPredLoop |=
1927 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1928 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1929 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1930 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1931 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1932 }
1933 }
1934 return true;
1935 };
1936
1937 // Visit inner loops.
1938 for (auto Inner : *L)
1939 if (!ScanLoop(Inner))
1940 return false;
1941
1942 if (!ScanLoop(L))
1943 return false;
1944
1945 // TODO: Check whether the trip count calculation is expensive. If L is the
1946 // inner loop but we know it has a low trip count, calculating that trip
1947 // count (in the parent loop) may be detrimental.
1948
1949 LLVMContext &C = L->getHeader()->getContext();
1950 HWLoopInfo.CounterInReg = true;
1951 HWLoopInfo.IsNestingLegal = false;
1952 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1953 HWLoopInfo.CountType = Type::getInt32Ty(C);
1954 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1955 return true;
1956}
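
The only arithmetic in the profitability check above is that the trip count, the backedge-taken count plus one, must fit in the 32-bit LR register. A standalone sketch of that bound (tripCountFitsInLR is an illustrative helper; the real code checks the bit width of the SCEV's unsigned range):

#include <cstdint>
#include <cstdio>

// Mirrors the check on SE.getUnsignedRangeMax(TripCountSCEV): the hardware
// loop counter lives in LR, so trip counts wider than 32 bits are rejected.
bool tripCountFitsInLR(uint64_t MaxBackedgeTakenCount) {
  uint64_t TripCount = MaxBackedgeTakenCount + 1; // BTC + 1 iterations
  return TripCount != 0 && TripCount <= UINT32_MAX; // also guards the +1 wrap
}

int main() {
  std::printf("BTC = 1000:       %d\n", tripCountFitsInLR(1000));
  std::printf("BTC = UINT32_MAX: %d\n", tripCountFitsInLR(UINT32_MAX));
  return 0;
}
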
1957
1958static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1959 // We don't allow icmps, and because we only look at single-block loops,
1960 // we simply count the icmps, i.e. there should only be 1 for the backedge.
1961 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1962 return false;
1963 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
1964 // not currently canonical, but soon will be. Code without them uses icmp, and
1965 // so is not tail predicated as per the condition above. In order to get the
1966 // same performance we treat min and max the same as an icmp for tailpred
1967 // purposes for the moment (we often rely on non-tailpred and higher VF's to
1968 // pick more optimal instructions like VQDMULH. They need to be recognized
1969 // directly by the vectorizer).
1970 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1971 if ((II->getIntrinsicID() == Intrinsic::smin ||
1972 II->getIntrinsicID() == Intrinsic::smax ||
1973 II->getIntrinsicID() == Intrinsic::umin ||
1974 II->getIntrinsicID() == Intrinsic::umax) &&
1975 ++ICmpCount > 1)
1976 return false;
1977
1978 if (isa<FCmpInst>(&I))
1979 return false;
1980
1981 // We could allow extending/narrowing FP loads/stores, but codegen is
1982 // too inefficient so reject this for now.
1983 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1984 return false;
1985
1986 // Extends have to be extending-loads
1987 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1988 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1989 return false;
1990
1991 // Truncs have to be narrowing-stores
1992 if (isa<TruncInst>(&I) )
1993 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1994 return false;
1995
1996 return true;
1997}
1998
1999// To set up a tail-predicated loop, we need to know the total number of
2000// elements processed by that loop. Thus, we need to determine the element
2001// size and:
2002// 1) it should be uniform for all operations in the vector loop, so we
2003// e.g. don't want any widening/narrowing operations.
2004// 2) it should be smaller than i64s because we don't have vector operations
2005// that work on i64s.
2006// 3) we don't want elements to be reversed or shuffled, to make sure the
2007// tail-predication masks/predicates the right lanes.
2008//
2009static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2010 const DataLayout &DL,
2011 const LoopAccessInfo *LAI) {
2012 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n")do { } while (false);
2013
2014 // If there are live-out values, it is probably a reduction. We can predicate
2015 // most reduction operations freely under MVE using a combination of
2016 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2017 // floating point and integer reductions, but don't check for operators
2018 // specifically here. If the value ends up not being a reduction (and so the
2019 // vectorizer cannot tailfold the loop), we should fall back to standard
2020 // vectorization automatically.
2021 SmallVector< Instruction *, 8 > LiveOuts;
2022 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2023 bool ReductionsDisabled =
2024 EnableTailPredication == TailPredication::EnabledNoReductions ||
2025 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2026
2027 for (auto *I : LiveOuts) {
2028 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2029 !I->getType()->isHalfTy()) {
2030 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "do { } while (false)
2031 "live-out value\n")do { } while (false);
2032 return false;
2033 }
2034 if (ReductionsDisabled) {
2035 LLVM_DEBUG(dbgs() << "Reductions not enabled\n")do { } while (false);
2036 return false;
2037 }
2038 }
2039
2040 // Next, check that all instructions can be tail-predicated.
2041 PredicatedScalarEvolution PSE = LAI->getPSE();
2042 SmallVector<Instruction *, 16> LoadStores;
2043 int ICmpCount = 0;
2044
2045 for (BasicBlock *BB : L->blocks()) {
2046 for (Instruction &I : BB->instructionsWithoutDebug()) {
2047 if (isa<PHINode>(&I))
2048 continue;
2049 if (!canTailPredicateInstruction(I, ICmpCount)) {
2050 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump())do { } while (false);
2051 return false;
2052 }
2053
2054 Type *T = I.getType();
2055 if (T->isPointerTy())
2056 T = T->getPointerElementType();
2057
2058 if (T->getScalarSizeInBits() > 32) {
2059 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump())do { } while (false);
2060 return false;
2061 }
2062 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2063 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2064 int64_t NextStride = getPtrStride(PSE, Ptr, L);
2065 if (NextStride == 1) {
2066 // TODO: for now only allow consecutive strides of 1. We could support
2067 // other strides as long as they are uniform, but let's keep it simple
2068 // for now.
2069 continue;
2070 } else if (NextStride == -1 ||
2071 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2072 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2073 LLVM_DEBUG(dbgs()do { } while (false)
2074 << "Consecutive strides of 2 found, vld2/vst2 can't "do { } while (false)
2075 "be tail-predicated.\n")do { } while (false);
2076 return false;
2077 // TODO: don't tail predicate if there is a reversed load?
2078 } else if (EnableMaskedGatherScatters) {
2079 // Gather/scatters do allow loading from arbitrary strides, at
2080 // least if they are loop invariant.
2081 // TODO: Loop variant strides should in theory work, too, but
2082 // this requires further testing.
2083 const SCEV *PtrScev =
2084 replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2085 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2086 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2087 if (PSE.getSE()->isLoopInvariant(Step, L))
2088 continue;
2089 }
2090 }
2091 LLVM_DEBUG(dbgs() << "Bad stride found, can't "do { } while (false)
2092 "tail-predicate.\n")do { } while (false);
2093 return false;
2094 }
2095 }
2096 }
2097
2098 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n")do { } while (false);
2099 return true;
2100}
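
The stride handling above roughly says: a stride of +1 is fine, strides of -1, 2 and 4 would need reversal or vld2/vld4 and so block tail-predication (assuming the default MVE interleave factor of 4), and other strides are only tolerated when masked gathers/scatters are enabled and the stride is loop-invariant. A sketch of just that decision, with those capabilities reduced to booleans (strideAllowsTailPredication is an illustrative helper):

#include <cstdio>

// Simplified stand-in for the per-access stride check in canTailPredicateLoop.
// Returns true if the access does not prevent tail-predication.
bool strideAllowsTailPredication(long Stride, bool GathersEnabled,
                                 bool StrideIsLoopInvariant) {
  if (Stride == 1)
    return true;                      // plain consecutive access
  if (Stride == -1 || Stride == 2 || Stride == 4)
    return false;                     // reversed / interleaved accesses
  // Arbitrary strides can become gathers/scatters if that is enabled and the
  // stride does not vary inside the loop.
  return GathersEnabled && StrideIsLoopInvariant;
}

int main() {
  std::printf("stride 1: %d\n", strideAllowsTailPredication(1, false, false));
  std::printf("stride 2: %d\n", strideAllowsTailPredication(2, true, true));
  std::printf("stride 7: %d\n", strideAllowsTailPredication(7, true, true));
  return 0;
}
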
2101
2102bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2103 ScalarEvolution &SE,
2104 AssumptionCache &AC,
2105 TargetLibraryInfo *TLI,
2106 DominatorTree *DT,
2107 const LoopAccessInfo *LAI) {
2108 if (!EnableTailPredication) {
2109 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n")do { } while (false);
2110 return false;
2111 }
2112
2113 // Creating a predicated vector loop is the first step for generating a
2114 // tail-predicated hardware loop, for which we need the MVE masked
2115 // load/stores instructions:
2116 if (!ST->hasMVEIntegerOps())
2117 return false;
2118
2119 // For now, restrict this to single block loops.
2120 if (L->getNumBlocks() > 1) {
2121 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "do { } while (false)
2122 "loop.\n")do { } while (false);
2123 return false;
2124 }
2125
2126 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected")(static_cast<void> (0));
2127
2128 HardwareLoopInfo HWLoopInfo(L);
2129 if (!HWLoopInfo.canAnalyze(*LI)) {
2130 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2131 "analyzable.\n")do { } while (false);
2132 return false;
2133 }
2134
2135 // This checks if we have the low-overhead branch architecture
2136 // extension, and if we will create a hardware-loop:
2137 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2138 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2139 "profitable.\n")do { } while (false);
2140 return false;
2141 }
2142
2143 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2144 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2145 "a candidate.\n")do { } while (false);
2146 return false;
2147 }
2148
2149 return canTailPredicateLoop(L, LI, SE, DL, LAI);
2150}
2151
2152bool ARMTTIImpl::emitGetActiveLaneMask() const {
2153 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2154 return false;
2155
2156 // Intrinsic @llvm.get.active.lane.mask is supported.
2157 // It is used in the MVETailPredication pass, which requires the number of
2158 // elements processed by this vector loop to setup the tail-predicated
2159 // loop.
2160 return true;
2161}
2162void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2163 TTI::UnrollingPreferences &UP,
2164 OptimizationRemarkEmitter *ORE) {
2165 // Enable upper-bound unrolling universally, not dependent upon the conditions
2166 // below.
2167 UP.UpperBound = true;
2168
2169 // Only currently enable these preferences for M-Class cores.
2170 if (!ST->isMClass())
2171 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2172
2173 // Disable loop unrolling for Oz and Os.
2174 UP.OptSizeThreshold = 0;
2175 UP.PartialOptSizeThreshold = 0;
2176 if (L->getHeader()->getParent()->hasOptSize())
2177 return;
2178
2179 SmallVector<BasicBlock*, 4> ExitingBlocks;
2180 L->getExitingBlocks(ExitingBlocks);
2181 LLVM_DEBUG(dbgs() << "Loop has:\n"do { } while (false)
2182 << "Blocks: " << L->getNumBlocks() << "\n"do { } while (false)
2183 << "Exit blocks: " << ExitingBlocks.size() << "\n")do { } while (false);
2184
2185 // Only allow one exit other than the latch. This acts as an early exit
2186 // as it mirrors the profitability calculation of the runtime unroller.
2187 if (ExitingBlocks.size() > 2)
2188 return;
2189
2190 // Limit the CFG of the loop body for targets with a branch predictor.
2191 // Allowing 4 blocks permits if-then-else diamonds in the body.
2192 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2193 return;
2194
2195 // Don't unroll vectorized loops, including the remainder loop
2196 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2197 return;
2198
2199 // Scan the loop: don't unroll loops with calls as this could prevent
2200 // inlining.
2201 InstructionCost Cost = 0;
2202 for (auto *BB : L->getBlocks()) {
2203 for (auto &I : *BB) {
2204 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2205 // scalar code.
2206 if (I.getType()->isVectorTy())
2207 return;
2208
2209 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2210 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2211 if (!isLoweredToCall(F))
2212 continue;
2213 }
2214 return;
2215 }
2216
2217 SmallVector<const Value*, 4> Operands(I.operand_values());
2218 Cost +=
2219 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2220 }
2221 }
2222
2223 // On v6m cores, there are very few registers available. We can easily end up
2224 // spilling and reloading more registers in an unrolled loop. Look at the
2225 // number of LCSSA phis as a rough measure of how many registers will need to
2226 // be live out of the loop, reducing the default unroll count if more than 1
2227 // value is needed. In the long run, all of this should be being learnt by a
2228 // machine.
2229 unsigned UnrollCount = 4;
2230 if (ST->isThumb1Only()) {
2231 unsigned ExitingValues = 0;
2232 SmallVector<BasicBlock *, 4> ExitBlocks;
2233 L->getExitBlocks(ExitBlocks);
2234 for (auto *Exit : ExitBlocks) {
2235 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2236 // only the last is expected to be needed for address operands.
2237 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2238 return PH.getNumOperands() != 1 ||
2239 !isa<GetElementPtrInst>(PH.getOperand(0));
2240 });
2241 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2242 }
2243 if (ExitingValues)
2244 UnrollCount /= ExitingValues;
2245 if (UnrollCount <= 1)
2246 return;
2247 }
2248
2249 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n")do { } while (false);
2250 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n")do { } while (false);
2251
2252 UP.Partial = true;
2253 UP.Runtime = true;
2254 UP.UnrollRemainder = true;
2255 UP.DefaultUnrollRuntimeCount = UnrollCount;
2256 UP.UnrollAndJam = true;
2257 UP.UnrollAndJamInnerLoopThreshold = 60;
2258
2259 // Force unrolling of small loops can be very useful because of the
2260 // branch-taken cost of the backedge.
2261 if (Cost < 12)
2262 UP.Force = true;
2263}
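
On Thumb1 the unroll count above starts at 4 and is divided by the number of values live out of the loop, giving up once it reaches 1. A compact sketch of that adjustment (thumb1UnrollCount is an illustrative helper; the live-out count stands in for the LCSSA-phi scan):

#include <cstdio>

// Mirrors the Thumb1 adjustment in getUnrollingPreferences: start from an
// unroll count of 4 and divide it by the number of values live out of the
// loop, giving up once the count drops to 1 or below.
unsigned thumb1UnrollCount(unsigned ExitingValues) {
  unsigned UnrollCount = 4;
  if (ExitingValues)
    UnrollCount /= ExitingValues;
  return UnrollCount <= 1 ? 0 : UnrollCount; // 0 here means "do not unroll"
}

int main() {
  std::printf("no live-outs:   %u\n", thumb1UnrollCount(0));
  std::printf("two live-outs:  %u\n", thumb1UnrollCount(2));
  std::printf("four live-outs: %u\n", thumb1UnrollCount(4));
  return 0;
}
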
2264
2265void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2266 TTI::PeelingPreferences &PP) {
2267 BaseT::getPeelingPreferences(L, SE, PP);
2268}
2269
2270bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2271 TTI::ReductionFlags Flags) const {
2272 if (!ST->hasMVEIntegerOps())
2273 return false;
2274
2275 unsigned ScalarBits = Ty->getScalarSizeInBits();
2276 switch (Opcode) {
2277 case Instruction::Add:
2278 return ScalarBits <= 64;
2279 default:
2280 return false;
2281 }
2282}
2283
2284bool ARMTTIImpl::preferPredicatedReductionSelect(
2285 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2286 if (!ST->hasMVEIntegerOps())
2287 return false;
2288 return true;
2289}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/TargetTransformInfo.h"
27#include "llvm/Analysis/TargetTransformInfoImpl.h"
28#include "llvm/CodeGen/ISDOpcodes.h"
29#include "llvm/CodeGen/TargetLowering.h"
30#include "llvm/CodeGen/TargetSubtargetInfo.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DataLayout.h"
36#include "llvm/IR/DerivedTypes.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/IR/Type.h"
43#include "llvm/IR/Value.h"
44#include "llvm/Support/Casting.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/ErrorHandling.h"
47#include "llvm/Support/MachineValueType.h"
48#include "llvm/Support/MathExtras.h"
49#include "llvm/Target/TargetMachine.h"
50#include <algorithm>
51#include <cassert>
52#include <cstdint>
53#include <limits>
54#include <utility>
55
56namespace llvm {
57
58class Function;
59class GlobalValue;
60class LLVMContext;
61class ScalarEvolution;
62class SCEV;
63class TargetMachine;
64
65extern cl::opt<unsigned> PartialUnrollingThreshold;
66
67/// Base class which can be used to help build a TTI implementation.
68///
69/// This class provides as much implementation of the TTI interface as is
70/// possible using the target independent parts of the code generator.
71///
72/// In order to subclass it, your class must implement a getST() method to
73/// return the subtarget, and a getTLI() method to return the target lowering.
74/// We need these methods implemented in the derived class so that this class
75/// doesn't have to duplicate storage for them.
76template <typename T>
77class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78private:
79 using BaseT = TargetTransformInfoImplCRTPBase<T>;
80 using TTI = TargetTransformInfo;
81
82 /// Helper function to access this as a T.
83 T *thisT() { return static_cast<T *>(this); }
84
85 /// Estimate a cost of Broadcast as an extract and sequence of insert
86 /// operations.
87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88 InstructionCost Cost = 0;
89 // Broadcast cost is equal to the cost of extracting the zero'th element
90 // plus the cost of inserting it into every element of the result vector.
91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
92
93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
95 }
96 return Cost;
97 }
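
The estimate above is simply one extract of lane 0 plus one insert per result lane. A trivial sketch of that sum, with unit costs passed in instead of calling getVectorInstrCost:

#include <cstdio>

// Mirrors getBroadcastShuffleOverhead: extract element 0 once, then insert it
// into each of the NumElts lanes of the result.
unsigned broadcastShuffleOverhead(unsigned NumElts, unsigned ExtractCost,
                                  unsigned InsertCost) {
  return ExtractCost + NumElts * InsertCost;
}

int main() {
  std::printf("broadcast to 4 lanes: %u\n", broadcastShuffleOverhead(4, 1, 1));
  return 0;
}
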
98
99 /// Estimate a cost of shuffle as a sequence of extract and insert
100 /// operations.
101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102 InstructionCost Cost = 0;
103 // Shuffle cost is equal to the cost of extracting elements from its arguments
104 // plus the cost of inserting them into the result vector.
105
106 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
107 // index 0 of first vector, index 1 of second vector, index 2 of first
108 // vector and finally index 3 of second vector and insert them at index
109 // <0,1,2,3> of result vector.
110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
113 }
114 return Cost;
115 }
116
117 /// Estimate a cost of subvector extraction as a sequence of extract and
118 /// insert operations.
119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120 FixedVectorType *SubVTy) {
121 assert(VTy && SubVTy &&(static_cast<void> (0))
122 "Can only extract subvectors from vectors")(static_cast<void> (0));
123 int NumSubElts = SubVTy->getNumElements();
124 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
125 (Index + NumSubElts) <=(static_cast<void> (0))
126 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
127 "SK_ExtractSubvector index out of range")(static_cast<void> (0));
128
129 InstructionCost Cost = 0;
130 // Subvector extraction cost is equal to the cost of extracting elements from
131 // the source type plus the cost of inserting them into the result vector
132 // type.
133 for (int i = 0; i != NumSubElts; ++i) {
134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
135 i + Index);
136 Cost +=
137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
138 }
139 return Cost;
140 }
141
142 /// Estimate a cost of subvector insertion as a sequence of extract and
143 /// insert operations.
144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145 FixedVectorType *SubVTy) {
146 assert(VTy && SubVTy &&(static_cast<void> (0))
147 "Can only insert subvectors into vectors")(static_cast<void> (0));
148 int NumSubElts = SubVTy->getNumElements();
149 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
150 (Index + NumSubElts) <=(static_cast<void> (0))
151 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
152 "SK_InsertSubvector index out of range")(static_cast<void> (0));
153
154 InstructionCost Cost = 0;
155 // Subvector insertion cost is equal to the cost of extracting elements from
156 // the subvector type plus the cost of inserting them into the result vector
157 // type.
158 for (int i = 0; i != NumSubElts; ++i) {
159 Cost +=
160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
162 i + Index);
163 }
164 return Cost;
165 }
166
167 /// Local query method delegates up to T which *must* implement this!
168 const TargetSubtargetInfo *getST() const {
169 return static_cast<const T *>(this)->getST();
170 }
171
172 /// Local query method delegates up to T which *must* implement this!
173 const TargetLoweringBase *getTLI() const {
174 return static_cast<const T *>(this)->getTLI();
175 }
176
177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
178 switch (M) {
179 case TTI::MIM_Unindexed:
180 return ISD::UNINDEXED;
181 case TTI::MIM_PreInc:
182 return ISD::PRE_INC;
183 case TTI::MIM_PreDec:
184 return ISD::PRE_DEC;
185 case TTI::MIM_PostInc:
186 return ISD::POST_INC;
187 case TTI::MIM_PostDec:
188 return ISD::POST_DEC;
189 }
190 llvm_unreachable("Unexpected MemIndexedMode")__builtin_unreachable();
191 }
192
193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
194 Align Alignment,
195 bool VariableMask,
196 bool IsGatherScatter,
197 TTI::TargetCostKind CostKind) {
198 auto *VT = cast<FixedVectorType>(DataTy);
199 // Assume the target does not have support for gather/scatter operations
200 // and provide a rough estimate.
201 //
202 // First, compute the cost of the individual memory operations.
203 InstructionCost AddrExtractCost =
204 IsGatherScatter
205 ? getVectorInstrCost(Instruction::ExtractElement,
206 FixedVectorType::get(
207 PointerType::get(VT->getElementType(), 0),
208 VT->getNumElements()),
209 -1)
210 : 0;
211 InstructionCost LoadCost =
212 VT->getNumElements() *
213 (AddrExtractCost +
214 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
215
216 // Next, compute the cost of packing the result in a vector.
217 InstructionCost PackingCost = getScalarizationOverhead(
218 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
219
220 InstructionCost ConditionalCost = 0;
221 if (VariableMask) {
222 // Compute the cost of conditionally executing the memory operations with
223 // variable masks. This includes extracting the individual conditions, the
224 // branches and the PHIs to combine the results.
225 // NOTE: Estimating the cost of conditionally executing the memory
226 // operations accurately is quite difficult and the current solution
227 // provides a very rough estimate only.
228 ConditionalCost =
229 VT->getNumElements() *
230 (getVectorInstrCost(
231 Instruction::ExtractElement,
232 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
233 VT->getNumElements()),
234 -1) +
235 getCFInstrCost(Instruction::Br, CostKind) +
236 getCFInstrCost(Instruction::PHI, CostKind));
237 }
238
239 return LoadCost + PackingCost + ConditionalCost;
240 }
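
The helper above adds three components: the per-element memory operations (plus an address extraction for gathers/scatters), the cost of packing the result vector, and per-element control flow when the mask is variable. A standalone sketch with every hook replaced by a plain number (MaskedMemModel and its fields are illustrative, not LLVM APIs):

#include <cstdio>

struct MaskedMemModel {
  unsigned MemOpCost;      // one scalar load/store
  unsigned ExtractCost;    // extracting one address or mask bit
  unsigned InsertCost;     // inserting one loaded element into the result
  unsigned BranchCost;     // one conditional branch
  unsigned PhiCost;        // one PHI to merge the conditional result
};

// Mirrors LoadCost + PackingCost + ConditionalCost in the helper above
// (packing simplified to per-element inserts, as for a masked load).
unsigned commonMaskedMemoryOpCost(unsigned NumElts, bool IsGatherScatter,
                                  bool VariableMask, const MaskedMemModel &M) {
  unsigned AddrExtract = IsGatherScatter ? M.ExtractCost : 0;
  unsigned LoadCost = NumElts * (AddrExtract + M.MemOpCost);
  unsigned PackingCost = NumElts * M.InsertCost;
  unsigned ConditionalCost =
      VariableMask ? NumElts * (M.ExtractCost + M.BranchCost + M.PhiCost) : 0;
  return LoadCost + PackingCost + ConditionalCost;
}

int main() {
  MaskedMemModel M{1, 1, 1, 1, 1};
  std::printf("v4 masked gather, variable mask: %u\n",
              commonMaskedMemoryOpCost(4, true, true, M));
  return 0;
}
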
241
242protected:
243 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
244 : BaseT(DL) {}
245 virtual ~BasicTTIImplBase() = default;
246
247 using TargetTransformInfoImplBase::DL;
248
249public:
250 /// \name Scalar TTI Implementations
251 /// @{
252 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
253 unsigned AddressSpace, Align Alignment,
254 bool *Fast) const {
255 EVT E = EVT::getIntegerVT(Context, BitWidth);
256 return getTLI()->allowsMisalignedMemoryAccesses(
257 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
258 }
259
260 bool hasBranchDivergence() { return false; }
261
262 bool useGPUDivergenceAnalysis() { return false; }
263
264 bool isSourceOfDivergence(const Value *V) { return false; }
265
266 bool isAlwaysUniform(const Value *V) { return false; }
267
268 unsigned getFlatAddressSpace() {
269 // Return an invalid address space.
270 return -1;
271 }
272
273 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
274 Intrinsic::ID IID) const {
275 return false;
276 }
277
278 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
279 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
280 }
281
282 unsigned getAssumedAddrSpace(const Value *V) const {
283 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
284 }
285
286 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
287 Value *NewV) const {
288 return nullptr;
289 }
290
291 bool isLegalAddImmediate(int64_t imm) {
292 return getTLI()->isLegalAddImmediate(imm);
293 }
294
295 bool isLegalICmpImmediate(int64_t imm) {
296 return getTLI()->isLegalICmpImmediate(imm);
297 }
298
299 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
300 bool HasBaseReg, int64_t Scale,
301 unsigned AddrSpace, Instruction *I = nullptr) {
302 TargetLoweringBase::AddrMode AM;
303 AM.BaseGV = BaseGV;
304 AM.BaseOffs = BaseOffset;
305 AM.HasBaseReg = HasBaseReg;
306 AM.Scale = Scale;
307 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
308 }
309
310 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
311 const DataLayout &DL) const {
312 EVT VT = getTLI()->getValueType(DL, Ty);
313 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
314 }
315
316 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
317 const DataLayout &DL) const {
318 EVT VT = getTLI()->getValueType(DL, Ty);
319 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
320 }
321
322 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
323 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
324 }
325
326 bool isNumRegsMajorCostOfLSR() {
327 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
328 }
329
330 bool isProfitableLSRChainElement(Instruction *I) {
331 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
332 }
333
334 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
335 int64_t BaseOffset, bool HasBaseReg,
336 int64_t Scale, unsigned AddrSpace) {
337 TargetLoweringBase::AddrMode AM;
338 AM.BaseGV = BaseGV;
339 AM.BaseOffs = BaseOffset;
340 AM.HasBaseReg = HasBaseReg;
341 AM.Scale = Scale;
342 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
343 }
344
345 bool isTruncateFree(Type *Ty1, Type *Ty2) {
346 return getTLI()->isTruncateFree(Ty1, Ty2);
347 }
348
349 bool isProfitableToHoist(Instruction *I) {
350 return getTLI()->isProfitableToHoist(I);
351 }
352
353 bool useAA() const { return getST()->useAA(); }
354
355 bool isTypeLegal(Type *Ty) {
356 EVT VT = getTLI()->getValueType(DL, Ty);
357 return getTLI()->isTypeLegal(VT);
358 }
359
360 InstructionCost getRegUsageForType(Type *Ty) {
361 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
362 assert(Val >= 0 && "Negative cost!")(static_cast<void> (0));
363 return Val;
364 }
365
366 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
367 ArrayRef<const Value *> Operands) {
368 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
369 }
370
371 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
372 unsigned &JumpTableSize,
373 ProfileSummaryInfo *PSI,
374 BlockFrequencyInfo *BFI) {
375 /// Try to find the estimated number of clusters. Note that the number of
376 /// clusters identified in this function could be different from the actual
377 /// numbers found in lowering. This function ignores switches that are
378 /// lowered with a mix of jump table / bit test / BTree. This function was
379 /// initially intended to be used when estimating the cost of a switch in
380 /// the inline cost heuristic, but it's a generic cost model to be used in other
381 /// places (e.g., in loop unrolling).
382 unsigned N = SI.getNumCases();
383 const TargetLoweringBase *TLI = getTLI();
384 const DataLayout &DL = this->getDataLayout();
385
386 JumpTableSize = 0;
387 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
388
389 // Early exit if both a jump table and bit test are not allowed.
390 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
391 return N;
392
393 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
394 APInt MinCaseVal = MaxCaseVal;
395 for (auto CI : SI.cases()) {
396 const APInt &CaseVal = CI.getCaseValue()->getValue();
397 if (CaseVal.sgt(MaxCaseVal))
398 MaxCaseVal = CaseVal;
399 if (CaseVal.slt(MinCaseVal))
400 MinCaseVal = CaseVal;
401 }
402
403 // Check if suitable for a bit test
404 if (N <= DL.getIndexSizeInBits(0u)) {
405 SmallPtrSet<const BasicBlock *, 4> Dests;
406 for (auto I : SI.cases())
407 Dests.insert(I.getCaseSuccessor());
408
409 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
410 DL))
411 return 1;
412 }
413
414 // Check if suitable for a jump table.
415 if (IsJTAllowed) {
416 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
417 return N;
418 uint64_t Range =
419 (MaxCaseVal - MinCaseVal)
420 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
421 // Check whether a range of clusters is dense enough for a jump table
422 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
423 JumpTableSize = Range;
424 return 1;
425 }
426 }
427 return N;
428 }
429
430 bool shouldBuildLookupTables() {
431 const TargetLoweringBase *TLI = getTLI();
432 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
433 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
434 }
435
436 bool shouldBuildRelLookupTables() const {
437 const TargetMachine &TM = getTLI()->getTargetMachine();
438 // If non-PIC mode, do not generate a relative lookup table.
439 if (!TM.isPositionIndependent())
440 return false;
441
442 /// Relative lookup table entries consist of 32-bit offsets.
443 /// Do not generate relative lookup tables for large code models
444 /// in 64-bit architectures where 32-bit offsets might not be enough.
445 if (TM.getCodeModel() == CodeModel::Medium ||
446 TM.getCodeModel() == CodeModel::Large)
447 return false;
448
449 Triple TargetTriple = TM.getTargetTriple();
450 if (!TargetTriple.isArch64Bit())
451 return false;
452
453 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
454 // there.
455 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
456 return false;
457
458 return true;
459 }
460
461 bool haveFastSqrt(Type *Ty) {
462 const TargetLoweringBase *TLI = getTLI();
463 EVT VT = TLI->getValueType(DL, Ty);
464 return TLI->isTypeLegal(VT) &&
465 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
466 }
467
468 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
469 return true;
470 }
471
472 InstructionCost getFPOpCost(Type *Ty) {
473 // Check whether FADD is available, as a proxy for floating-point in
474 // general.
475 const TargetLoweringBase *TLI = getTLI();
476 EVT VT = TLI->getValueType(DL, Ty);
477 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
478 return TargetTransformInfo::TCC_Basic;
479 return TargetTransformInfo::TCC_Expensive;
480 }
481
482 unsigned getInliningThresholdMultiplier() { return 1; }
483 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
484
485 int getInlinerVectorBonusPercent() { return 150; }
486
487 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
488 TTI::UnrollingPreferences &UP,
489 OptimizationRemarkEmitter *ORE) {
490 // This unrolling functionality is target independent, but to provide some
491 // motivation for its intended use, for x86:
492
493 // According to the Intel 64 and IA-32 Architectures Optimization Reference
494 // Manual, Intel Core models and later have a loop stream detector (and
495 // associated uop queue) that can benefit from partial unrolling.
496 // The relevant requirements are:
497 // - The loop must have no more than 4 (8 for Nehalem and later) branches
498 // taken, and none of them may be calls.
499 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
500
501 // According to the Software Optimization Guide for AMD Family 15h
502 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
503 // and loop buffer which can benefit from partial unrolling.
504 // The relevant requirements are:
505 // - The loop must have fewer than 16 branches
506 // - The loop must have less than 40 uops in all executed loop branches
507
508 // The number of taken branches in a loop is hard to estimate here, and
509 // benchmarking has revealed that it is better not to be conservative when
510 // estimating the branch count. As a result, we'll ignore the branch limits
511 // until someone finds a case where it matters in practice.
512
513 unsigned MaxOps;
514 const TargetSubtargetInfo *ST = getST();
515 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
516 MaxOps = PartialUnrollingThreshold;
517 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
518 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
519 else
520 return;
521
522 // Scan the loop: don't unroll loops with calls.
523 for (BasicBlock *BB : L->blocks()) {
524 for (Instruction &I : *BB) {
525 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
526 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
527 if (!thisT()->isLoweredToCall(F))
528 continue;
529 }
530
531 if (ORE) {
532 ORE->emit([&]() {
533 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
534 L->getHeader())
535 << "advising against unrolling the loop because it "
536 "contains a "
537 << ore::NV("Call", &I);
538 });
539 }
540 return;
541 }
542 }
543 }
544
545 // Enable runtime and partial unrolling up to the specified size.
546 // Enable using trip count upper bound to unroll loops.
547 UP.Partial = UP.Runtime = UP.UpperBound = true;
548 UP.PartialThreshold = MaxOps;
549
550 // Avoid unrolling when optimizing for size.
551 UP.OptSizeThreshold = 0;
552 UP.PartialOptSizeThreshold = 0;
553
554     // Set the number of instructions optimized away when the "back edge"
555     // becomes a "fall through" to the default value of 2.
556 UP.BEInsns = 2;
557 }
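A standalone sketch of the MaxOps selection above (lines 513-520): an explicit PartialUnrollingThreshold override wins, otherwise the subtarget's LoopMicroOpBufferSize is used, otherwise the preferences are left untouched. Illustration only; the struct and function names are stand-ins, not LLVM API.

    #include <optional>

    struct SchedModelSketch { unsigned LoopMicroOpBufferSize = 0; };

    // Mirrors the threshold selection at lines 513-520.
    std::optional<unsigned> pickMaxOps(std::optional<unsigned> CmdLineThreshold,
                                       const SchedModelSketch &SM) {
      if (CmdLineThreshold)              // an explicit threshold was given
        return *CmdLineThreshold;
      if (SM.LoopMicroOpBufferSize > 0)  // size partial unrolling to the uop buffer
        return SM.LoopMicroOpBufferSize;
      return std::nullopt;               // no buffer info: leave preferences untouched
    }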
558
559 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
560 TTI::PeelingPreferences &PP) {
561 PP.PeelCount = 0;
562 PP.AllowPeeling = true;
563 PP.AllowLoopNestsPeeling = false;
564 PP.PeelProfiledIterations = true;
565 }
566
567 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
568 AssumptionCache &AC,
569 TargetLibraryInfo *LibInfo,
570 HardwareLoopInfo &HWLoopInfo) {
571 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
572 }
573
574 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
575 AssumptionCache &AC, TargetLibraryInfo *TLI,
576 DominatorTree *DT,
577 const LoopAccessInfo *LAI) {
578 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
579 }
580
581 bool emitGetActiveLaneMask() {
582 return BaseT::emitGetActiveLaneMask();
583 }
584
585 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
586 IntrinsicInst &II) {
587 return BaseT::instCombineIntrinsic(IC, II);
588 }
589
590 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
591 IntrinsicInst &II,
592 APInt DemandedMask,
593 KnownBits &Known,
594 bool &KnownBitsComputed) {
595 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
596 KnownBitsComputed);
597 }
598
599 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
600 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
601 APInt &UndefElts2, APInt &UndefElts3,
602 std::function<void(Instruction *, unsigned, APInt, APInt &)>
603 SimplifyAndSetOp) {
604 return BaseT::simplifyDemandedVectorEltsIntrinsic(
605 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
606 SimplifyAndSetOp);
607 }
608
609 InstructionCost getInstructionLatency(const Instruction *I) {
610 if (isa<LoadInst>(I))
611 return getST()->getSchedModel().DefaultLoadLatency;
612
613 return BaseT::getInstructionLatency(I);
614 }
615
616 virtual Optional<unsigned>
617 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
618 return Optional<unsigned>(
619 getST()->getCacheSize(static_cast<unsigned>(Level)));
620 }
621
622 virtual Optional<unsigned>
623 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
624 Optional<unsigned> TargetResult =
625 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
626
627 if (TargetResult)
628 return TargetResult;
629
630 return BaseT::getCacheAssociativity(Level);
631 }
632
633 virtual unsigned getCacheLineSize() const {
634 return getST()->getCacheLineSize();
635 }
636
637 virtual unsigned getPrefetchDistance() const {
638 return getST()->getPrefetchDistance();
639 }
640
641 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
642 unsigned NumStridedMemAccesses,
643 unsigned NumPrefetches,
644 bool HasCall) const {
645 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
646 NumPrefetches, HasCall);
647 }
648
649 virtual unsigned getMaxPrefetchIterationsAhead() const {
650 return getST()->getMaxPrefetchIterationsAhead();
651 }
652
653 virtual bool enableWritePrefetching() const {
654 return getST()->enableWritePrefetching();
655 }
656
657 /// @}
658
659 /// \name Vector TTI Implementations
660 /// @{
661
662 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
663 return TypeSize::getFixed(32);
664 }
665
666 Optional<unsigned> getMaxVScale() const { return None; }
667
668 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
669 /// are set if the demanded result elements need to be inserted and/or
670 /// extracted from vectors.
671 InstructionCost getScalarizationOverhead(VectorType *InTy,
672 const APInt &DemandedElts,
673 bool Insert, bool Extract) {
674 /// FIXME: a bitfield is not a reasonable abstraction for talking about
675 /// which elements are needed from a scalable vector
676 auto *Ty = cast<FixedVectorType>(InTy);
677
678 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&(static_cast<void> (0))
679 "Vector size mismatch")(static_cast<void> (0));
680
681 InstructionCost Cost = 0;
682
683 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
684 if (!DemandedElts[i])
685 continue;
686 if (Insert)
687 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
688 if (Extract)
689 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
690 }
691
692 return Cost;
693 }
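A standalone sketch of the per-lane counting performed by the loop above (lines 683-690), assuming a flat cost of 1 per insertelement/extractelement; the real code queries getVectorInstrCost for each lane. Illustration only, not LLVM API.

    #include <cstdint>

    // Count insert/extract operations for the demanded lanes of a fixed-width
    // vector, with a stand-in cost of 1 per element operation.
    unsigned scalarizationOverheadSketch(uint64_t DemandedElts, unsigned NumElts,
                                         bool Insert, bool Extract) {
      unsigned Cost = 0;
      for (unsigned I = 0; I < NumElts; ++I) {
        if (!(DemandedElts & (uint64_t(1) << I)))
          continue;                      // lane not demanded: free
        if (Insert)
          Cost += 1;                     // one insertelement per demanded lane
        if (Extract)
          Cost += 1;                     // one extractelement per demanded lane
      }
      return Cost;
    }
    // Example: scalarizationOverheadSketch(0b1010, 4, true, false) == 2.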
694
695 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
696 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
697 bool Extract) {
698 auto *Ty = cast<FixedVectorType>(InTy);
699
700 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
701 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
702 }
703
704   /// Estimate the overhead of scalarizing an instruction's unique
705   /// non-constant operands. The (potentially vector) types to use for each
706   /// argument are passed via Tys.
707 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
708 ArrayRef<Type *> Tys) {
709 assert(Args.size() == Tys.size() && "Expected matching Args and Tys")(static_cast<void> (0));
710
711 InstructionCost Cost = 0;
712 SmallPtrSet<const Value*, 4> UniqueOperands;
713 for (int I = 0, E = Args.size(); I != E; I++) {
714 // Disregard things like metadata arguments.
715 const Value *A = Args[I];
716 Type *Ty = Tys[I];
717 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
718 !Ty->isPtrOrPtrVectorTy())
719 continue;
720
721 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
722 if (auto *VecTy = dyn_cast<VectorType>(Ty))
723 Cost += getScalarizationOverhead(VecTy, false, true);
724 }
725 }
726
727 return Cost;
728 }
729
730 /// Estimate the overhead of scalarizing the inputs and outputs of an
731 /// instruction, with return type RetTy and arguments Args of type Tys. If
732 /// Args are unknown (empty), then the cost associated with one argument is
733 /// added as a heuristic.
734 InstructionCost getScalarizationOverhead(VectorType *RetTy,
735 ArrayRef<const Value *> Args,
736 ArrayRef<Type *> Tys) {
737 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
738 if (!Args.empty())
739 Cost += getOperandsScalarizationOverhead(Args, Tys);
740 else
741 // When no information on arguments is provided, we add the cost
742 // associated with one argument as a heuristic.
743 Cost += getScalarizationOverhead(RetTy, false, true);
744
745 return Cost;
746 }
747
748 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
749
750 InstructionCost getArithmeticInstrCost(
751 unsigned Opcode, Type *Ty,
752 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
753 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
754 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
755 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
756 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
757 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
758 const Instruction *CxtI = nullptr) {
759 // Check if any of the operands are vector operands.
760 const TargetLoweringBase *TLI = getTLI();
761 int ISD = TLI->InstructionOpcodeToISD(Opcode);
762 assert(ISD && "Invalid opcode")(static_cast<void> (0));
763
764 // TODO: Handle more cost kinds.
765 if (CostKind != TTI::TCK_RecipThroughput)
766 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
767 Opd1Info, Opd2Info,
768 Opd1PropInfo, Opd2PropInfo,
769 Args, CxtI);
770
771 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
772
773 bool IsFloat = Ty->isFPOrFPVectorTy();
774 // Assume that floating point arithmetic operations cost twice as much as
775 // integer operations.
776 InstructionCost OpCost = (IsFloat ? 2 : 1);
777
778 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
779 // The operation is legal. Assume it costs 1.
780 // TODO: Once we have extract/insert subvector cost we need to use them.
781 return LT.first * OpCost;
782 }
783
784 if (!TLI->isOperationExpand(ISD, LT.second)) {
785 // If the operation is custom lowered, then assume that the code is twice
786 // as expensive.
787 return LT.first * 2 * OpCost;
788 }
789
790 // An 'Expand' of URem and SRem is special because it may default
791 // to expanding the operation into a sequence of sub-operations
792 // i.e. X % Y -> X-(X/Y)*Y.
793 if (ISD == ISD::UREM || ISD == ISD::SREM) {
794 bool IsSigned = ISD == ISD::SREM;
795 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
796 LT.second) ||
797 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
798 LT.second)) {
799 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
800 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
801 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
802 Opd2PropInfo);
803 InstructionCost MulCost =
804 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
805 InstructionCost SubCost =
806 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
807 return DivCost + MulCost + SubCost;
808 }
809 }
810
811 // We cannot scalarize scalable vectors, so return Invalid.
812 if (isa<ScalableVectorType>(Ty))
813 return InstructionCost::getInvalid();
814
815 // Else, assume that we need to scalarize this op.
816     // TODO: If one of the types gets legalized by splitting, handle this
817 // similarly to what getCastInstrCost() does.
818 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
819 InstructionCost Cost = thisT()->getArithmeticInstrCost(
820 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
821 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
822       // Return the cost of multiple scalar invocations plus the cost of
823 // inserting and extracting the values.
824 SmallVector<Type *> Tys(Args.size(), Ty);
825 return getScalarizationOverhead(VTy, Args, Tys) +
826 VTy->getNumElements() * Cost;
827 }
828
829 // We don't know anything about this scalar instruction.
830 return OpCost;
831 }
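The URem/SRem branch above (lines 793-808) prices the remainder via the identity X % Y == X - (X / Y) * Y, i.e. one division, one multiplication and one subtraction, but only when the target has a legal or custom divide. A minimal sketch of that pricing, with each per-operation cost passed in as a plain number standing in for the corresponding getArithmeticInstrCost query.

    // Sketch of the srem/urem expansion pricing at lines 793-808:
    //   X % Y  ->  X - (X / Y) * Y
    unsigned remExpansionCostSketch(bool HasLegalDiv, unsigned DivCost,
                                    unsigned MulCost, unsigned SubCost,
                                    unsigned FallbackCost) {
      if (!HasLegalDiv)
        return FallbackCost;              // e.g. the scalarization path below
      return DivCost + MulCost + SubCost; // one div + one mul + one sub
    }
    // Example: DivCost = 20, MulCost = 3, SubCost = 1 gives a remainder
    // estimate of 24 reciprocal-throughput units.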
832
833 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
834 ArrayRef<int> Mask) const {
835 int Limit = Mask.size() * 2;
836 if (Mask.empty() ||
837 // Extra check required by isSingleSourceMaskImpl function (called by
838 // ShuffleVectorInst::isSingleSourceMask).
839 any_of(Mask, [Limit](int I) { return I >= Limit; }))
840 return Kind;
841 switch (Kind) {
842 case TTI::SK_PermuteSingleSrc:
843 if (ShuffleVectorInst::isReverseMask(Mask))
844 return TTI::SK_Reverse;
845 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
846 return TTI::SK_Broadcast;
847 break;
848 case TTI::SK_PermuteTwoSrc:
849 if (ShuffleVectorInst::isSelectMask(Mask))
850 return TTI::SK_Select;
851 if (ShuffleVectorInst::isTransposeMask(Mask))
852 return TTI::SK_Transpose;
853 break;
854 case TTI::SK_Select:
855 case TTI::SK_Reverse:
856 case TTI::SK_Broadcast:
857 case TTI::SK_Transpose:
858 case TTI::SK_InsertSubvector:
859 case TTI::SK_ExtractSubvector:
860 case TTI::SK_Splice:
861 break;
862 }
863 return Kind;
864 }
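A couple of concrete masks that the refinement above recognises, plus a simplified reverse-mask test. Illustration only; the real checks live in ShuffleVectorInst and also tolerate undef (-1) lanes, which this sketch does not.

    #include <vector>

    // For 4 lanes: <3,2,1,0> turns SK_PermuteSingleSrc into SK_Reverse above,
    // while <0,0,0,0> (a zero-element splat) becomes SK_Broadcast.
    bool isReverseMaskSketch(const std::vector<int> &Mask) {
      for (size_t I = 0, E = Mask.size(); I != E; ++I)
        if (Mask[I] != static_cast<int>(E - 1 - I))
          return false;
      return true;
    }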
865
866 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
867 ArrayRef<int> Mask, int Index,
868 VectorType *SubTp) {
869
870 switch (improveShuffleKindFromMask(Kind, Mask)) {
871 case TTI::SK_Broadcast:
872 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
873 case TTI::SK_Select:
874 case TTI::SK_Splice:
875 case TTI::SK_Reverse:
876 case TTI::SK_Transpose:
877 case TTI::SK_PermuteSingleSrc:
878 case TTI::SK_PermuteTwoSrc:
879 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
880 case TTI::SK_ExtractSubvector:
881 return getExtractSubvectorOverhead(Tp, Index,
882 cast<FixedVectorType>(SubTp));
883 case TTI::SK_InsertSubvector:
884 return getInsertSubvectorOverhead(Tp, Index,
885 cast<FixedVectorType>(SubTp));
886 }
887 llvm_unreachable("Unknown TTI::ShuffleKind")__builtin_unreachable();
888 }
889
890 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
891 TTI::CastContextHint CCH,
892 TTI::TargetCostKind CostKind,
893 const Instruction *I = nullptr) {
894 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
895 return 0;
896
897 const TargetLoweringBase *TLI = getTLI();
898 int ISD = TLI->InstructionOpcodeToISD(Opcode);
899 assert(ISD && "Invalid opcode")(static_cast<void> (0));
900 std::pair<InstructionCost, MVT> SrcLT =
901 TLI->getTypeLegalizationCost(DL, Src);
902 std::pair<InstructionCost, MVT> DstLT =
903 TLI->getTypeLegalizationCost(DL, Dst);
904
905 TypeSize SrcSize = SrcLT.second.getSizeInBits();
906 TypeSize DstSize = DstLT.second.getSizeInBits();
907 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
908 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
909
910 switch (Opcode) {
911 default:
912 break;
913 case Instruction::Trunc:
914 // Check for NOOP conversions.
915 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
916 return 0;
917 LLVM_FALLTHROUGH[[gnu::fallthrough]];
918 case Instruction::BitCast:
919       // Bitcasts between types that are legalized to the same type are free;
920       // assume int to/from ptr of the same size is also free.
921 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
922 SrcSize == DstSize)
923 return 0;
924 break;
925 case Instruction::FPExt:
926 if (I && getTLI()->isExtFree(I))
927 return 0;
928 break;
929 case Instruction::ZExt:
930 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
931 return 0;
932 LLVM_FALLTHROUGH[[gnu::fallthrough]];
933 case Instruction::SExt:
934 if (I && getTLI()->isExtFree(I))
935 return 0;
936
937 // If this is a zext/sext of a load, return 0 if the corresponding
938 // extending load exists on target and the result type is legal.
939 if (CCH == TTI::CastContextHint::Normal) {
940 EVT ExtVT = EVT::getEVT(Dst);
941 EVT LoadVT = EVT::getEVT(Src);
942 unsigned LType =
943 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
944 if (DstLT.first == SrcLT.first &&
945 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
946 return 0;
947 }
948 break;
949 case Instruction::AddrSpaceCast:
950 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
951 Dst->getPointerAddressSpace()))
952 return 0;
953 break;
954 }
955
956 auto *SrcVTy = dyn_cast<VectorType>(Src);
957 auto *DstVTy = dyn_cast<VectorType>(Dst);
958
959 // If the cast is marked as legal (or promote) then assume low cost.
960 if (SrcLT.first == DstLT.first &&
961 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
962 return SrcLT.first;
963
964 // Handle scalar conversions.
965 if (!SrcVTy && !DstVTy) {
966       // Just check the op cost. If the operation is legal then assume it
967       // costs 1.
968 if (!TLI->isOperationExpand(ISD, DstLT.second))
969 return 1;
970
971       // Assume that illegal scalar instructions are expensive.
972 return 4;
973 }
974
975 // Check vector-to-vector casts.
976 if (DstVTy && SrcVTy) {
977 // If the cast is between same-sized registers, then the check is simple.
978 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
979
980 // Assume that Zext is done using AND.
981 if (Opcode == Instruction::ZExt)
982 return SrcLT.first;
983
984 // Assume that sext is done using SHL and SRA.
985 if (Opcode == Instruction::SExt)
986 return SrcLT.first * 2;
987
988         // Just check the op cost. If the operation is legal then assume it
989         // costs 1 and multiply by the
990         // type-legalization overhead.
991 if (!TLI->isOperationExpand(ISD, DstLT.second))
992 return SrcLT.first * 1;
993 }
994
995 // If we are legalizing by splitting, query the concrete TTI for the cost
996 // of casting the original vector twice. We also need to factor in the
997 // cost of the split itself. Count that as 1, to be consistent with
998 // TLI->getTypeLegalizationCost().
999 bool SplitSrc =
1000 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1001 TargetLowering::TypeSplitVector;
1002 bool SplitDst =
1003 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1004 TargetLowering::TypeSplitVector;
1005 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1006 DstVTy->getElementCount().isVector()) {
1007 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1008 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1009 T *TTI = static_cast<T *>(this);
1010 // If both types need to be split then the split is free.
1011 InstructionCost SplitCost =
1012 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1013 return SplitCost +
1014 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1015 CostKind, I));
1016 }
1017
1018      // The scalarization cost is Invalid; we can't assume any number of elements.
1019 if (isa<ScalableVectorType>(DstVTy))
1020 return InstructionCost::getInvalid();
1021
1022 // In other cases where the source or destination are illegal, assume
1023 // the operation will get scalarized.
1024 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1025 InstructionCost Cost = thisT()->getCastInstrCost(
1026 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1027
1028      // Return the cost of multiple scalar invocations plus the cost of
1029 // inserting and extracting the values.
1030 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1031 }
1032
1033    // We already handled vector-to-vector and scalar-to-scalar conversions.
1034    // This is where we handle bitcast between vectors and scalars.
1035    // We need to assume that the conversion is scalarized
1036    // in one way or another.
1037 if (Opcode == Instruction::BitCast) {
1038 // Illegal bitcasts are done by storing and loading from a stack slot.
1039 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1040 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1041 }
1042
1043 llvm_unreachable("Unhandled cast")__builtin_unreachable();
1044 }
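The split-vector branch above (lines 995-1016) charges twice the cost of casting the half-width vectors, plus one split when only one side needs splitting. A standalone sketch of that arithmetic; SplitCost and HalfCastCost stand in for getVectorSplitCost and the recursive getCastInstrCost call.

    // Sketch of the split-vector cast pricing at lines 1010-1015.  The real
    // code only reaches this when SplitSrc || SplitDst holds.
    unsigned splitCastCostSketch(bool SplitSrc, bool SplitDst, unsigned SplitCost,
                                 unsigned HalfCastCost) {
      // If both sides are split, the split itself is considered free.
      unsigned Split = (SplitSrc && SplitDst) ? 0 : SplitCost;
      return Split + 2 * HalfCastCost;   // cast each half once
    }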
1045
1046 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1047 VectorType *VecTy, unsigned Index) {
1048 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1049 Index) +
1050 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1051 TTI::CastContextHint::None,
1052 TTI::TCK_RecipThroughput);
1053 }
1054
1055 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1056 const Instruction *I = nullptr) {
1057 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1058 }
1059
1060 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1061 CmpInst::Predicate VecPred,
1062 TTI::TargetCostKind CostKind,
1063 const Instruction *I = nullptr) {
1064 const TargetLoweringBase *TLI = getTLI();
1065 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1066 assert(ISD && "Invalid opcode")(static_cast<void> (0));
1067
1068 // TODO: Handle other cost kinds.
1069 if (CostKind != TTI::TCK_RecipThroughput)
61: Assuming 'CostKind' is equal to TCK_RecipThroughput
62: Taking false branch
1070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1071 I);
1072
1073 // Selects on vectors are actually vector selects.
1074 if (ISD == ISD::SELECT) {
63: Assuming 'ISD' is equal to SELECT
64: Taking true branch
1075 assert(CondTy && "CondTy must exist")(static_cast<void> (0));
1076 if (CondTy->isVectorTy())
65: Called C++ object pointer is null
1077 ISD = ISD::VSELECT;
1078 }
1079 std::pair<InstructionCost, MVT> LT =
1080 TLI->getTypeLegalizationCost(DL, ValTy);
1081
1082 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1083 !TLI->isOperationExpand(ISD, LT.second)) {
1084 // The operation is legal. Assume it costs 1. Multiply
1085 // by the type-legalization overhead.
1086 return LT.first * 1;
1087 }
1088
1089 // Otherwise, assume that the cast is scalarized.
1090    // TODO: If one of the types gets legalized by splitting, handle this
1091 // similarly to what getCastInstrCost() does.
1092 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1093 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1094 if (CondTy)
1095 CondTy = CondTy->getScalarType();
1096 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1097 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1098
1099      // Return the cost of multiple scalar invocations plus the cost of
1100 // inserting and extracting the values.
1101 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1102 }
1103
1104 // Unknown scalar opcode.
1105 return 1;
1106 }
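With NDEBUG the assert at line 1075 is compiled away, so the path reported above reaches the CondTy->isVectorTy() call at line 1076 with CondTy possibly null (the value initialised at step 37 and passed at step 38). A minimal guarded form of that check, shown only to illustrate the analyzer's complaint, not as an upstream patch:

    // Null-safe variant of lines 1074-1078: only consult CondTy when present.
    if (ISD == ISD::SELECT && CondTy && CondTy->isVectorTy())
      ISD = ISD::VSELECT;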
1107
1108 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1109 unsigned Index) {
1110 std::pair<InstructionCost, MVT> LT =
1111 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1112
1113 return LT.first;
1114 }
1115
1116 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1117 MaybeAlign Alignment, unsigned AddressSpace,
1118 TTI::TargetCostKind CostKind,
1119 const Instruction *I = nullptr) {
1120 assert(!Src->isVoidTy() && "Invalid type")(static_cast<void> (0));
1121 // Assume types, such as structs, are expensive.
1122 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1123 return 4;
1124 std::pair<InstructionCost, MVT> LT =
1125 getTLI()->getTypeLegalizationCost(DL, Src);
1126
1127 // Assuming that all loads of legal types cost 1.
1128 InstructionCost Cost = LT.first;
1129 if (CostKind != TTI::TCK_RecipThroughput)
1130 return Cost;
1131
1132 if (Src->isVectorTy() &&
1133 // In practice it's not currently possible to have a change in lane
1134 // length for extending loads or truncating stores so both types should
1135 // have the same scalable property.
1136 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1137 LT.second.getSizeInBits())) {
1138 // This is a vector load that legalizes to a larger type than the vector
1139 // itself. Unless the corresponding extending load or truncating store is
1140 // legal, then this will scalarize.
1141 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1142 EVT MemVT = getTLI()->getValueType(DL, Src);
1143 if (Opcode == Instruction::Store)
1144 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1145 else
1146 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1147
1148 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1149 // This is a vector load/store for some illegal type that is scalarized.
1150 // We must account for the cost of building or decomposing the vector.
1151 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1152 Opcode != Instruction::Store,
1153 Opcode == Instruction::Store);
1154 }
1155 }
1156
1157 return Cost;
1158 }
1159
1160 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1161 Align Alignment, unsigned AddressSpace,
1162 TTI::TargetCostKind CostKind) {
1163 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1164 CostKind);
1165 }
1166
1167 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1168 const Value *Ptr, bool VariableMask,
1169 Align Alignment,
1170 TTI::TargetCostKind CostKind,
1171 const Instruction *I = nullptr) {
1172 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1173 true, CostKind);
1174 }
1175
1176 InstructionCost getInterleavedMemoryOpCost(
1177 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1178 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1179 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1180 auto *VT = cast<FixedVectorType>(VecTy);
1181
1182 unsigned NumElts = VT->getNumElements();
1183 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")(static_cast<void> (0));
1184
1185 unsigned NumSubElts = NumElts / Factor;
1186 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1187
1188 // Firstly, the cost of load/store operation.
1189 InstructionCost Cost;
1190 if (UseMaskForCond || UseMaskForGaps)
1191 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1192 AddressSpace, CostKind);
1193 else
1194 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1195 CostKind);
1196
1197 // Legalize the vector type, and get the legalized and unlegalized type
1198 // sizes.
1199 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1200 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1201 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1202
1203 // Scale the cost of the memory operation by the fraction of legalized
1204 // instructions that will actually be used. We shouldn't account for the
1205 // cost of dead instructions since they will be removed.
1206 //
1207 // E.g., An interleaved load of factor 8:
1208 // %vec = load <16 x i64>, <16 x i64>* %ptr
1209 // %v0 = shufflevector %vec, undef, <0, 8>
1210 //
1211 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1212 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1213 // type). The other loads are unused.
1214 //
1215 // TODO: Note that legalization can turn masked loads/stores into unmasked
1216 // (legalized) loads/stores. This can be reflected in the cost.
1217 if (VecTySize > VecTyLTSize) {
1218 // The number of loads of a legal type it will take to represent a load
1219 // of the unlegalized vector type.
1220 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1221
1222 // The number of elements of the unlegalized type that correspond to a
1223 // single legal instruction.
1224 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1225
1226 // Determine which legal instructions will be used.
1227 BitVector UsedInsts(NumLegalInsts, false);
1228 for (unsigned Index : Indices)
1229 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1230 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1231
1232 // Scale the cost of the load by the fraction of legal instructions that
1233 // will be used.
1234 Cost *= UsedInsts.count() / NumLegalInsts;
1235 }
1236
1237    // Then add the cost of the interleave operation.
1238 assert(Indices.size() <= Factor &&(static_cast<void> (0))
1239 "Interleaved memory op has too many members")(static_cast<void> (0));
1240 if (Opcode == Instruction::Load) {
1241      // The interleave cost is similar to extracting the sub vectors' elements
1242      // from the wide vector and inserting them into the sub vectors.
1243 //
1244 // E.g. An interleaved load of factor 2 (with one member of index 0):
1245 // %vec = load <8 x i32>, <8 x i32>* %ptr
1246 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1247 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1248 // <8 x i32> vector and insert them into a <4 x i32> vector.
1249 for (unsigned Index : Indices) {
1250 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1251
1252 // Extract elements from loaded vector for each sub vector.
1253 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1254 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1255 Index + Elm * Factor);
1256 }
1257
1258 InstructionCost InsSubCost = 0;
1259 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1260 InsSubCost +=
1261 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);
1262
1263 Cost += Indices.size() * InsSubCost;
1264 } else {
1265      // The interleave cost is to extract elements from the sub vectors and
1266      // insert them into the wide vector.
1267 //
1268 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1269 // (using VF=4):
1270 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1271 // %gaps.mask = <true, true, false, true, true, false,
1272 // true, true, false, true, true, false>
1273 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1274 // i32 Align, <12 x i1> %gaps.mask
1275 // The cost is estimated as extract all elements (of actual members,
1276 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1277 // i32> vector.
1278 InstructionCost ExtSubCost = 0;
1279 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1280 ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
1281 SubVT, Elm);
1282 Cost += ExtSubCost * Indices.size();
1283
1284 for (unsigned Index : Indices) {
1285 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1286
1287        // Insert the elements of each sub vector into the wide vector.
1288 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1289 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
1290 Index + Elm * Factor);
1291 }
1292 }
1293
1294 if (!UseMaskForCond)
1295 return Cost;
1296
1297 Type *I8Type = Type::getInt8Ty(VT->getContext());
1298 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1299 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1300
1301    // The Mask shuffling cost is to extract all the elements of the Mask
1302    // and insert each of them Factor times into the wide vector:
1303 //
1304 // E.g. an interleaved group with factor 3:
1305 // %mask = icmp ult <8 x i32> %vec1, %vec2
1306 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1307 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1308 // The cost is estimated as extract all mask elements from the <8xi1> mask
1309 // vector and insert them factor times into the <24xi1> shuffled mask
1310 // vector.
1311 for (unsigned i = 0; i < NumSubElts; i++)
1312 Cost +=
1313 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1314
1315 for (unsigned i = 0; i < NumElts; i++)
1316 Cost +=
1317 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1318
1319 // The Gaps mask is invariant and created outside the loop, therefore the
1320 // cost of creating it is not accounted for here. However if we have both
1321 // a MaskForGaps and some other mask that guards the execution of the
1322 // memory access, we need to account for the cost of And-ing the two masks
1323 // inside the loop.
1324 if (UseMaskForGaps)
1325 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1326 CostKind);
1327
1328 return Cost;
1329 }
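The factor-8 example in the comments above (a <16 x i64> load legalized to eight v2i64 loads, with one member at index 0) can be checked with a small standalone sketch of the UsedInsts scaling at lines 1217-1235. Illustration only; it uses plain integers in place of InstructionCost and multiplies before dividing so the 2/8 fraction is not truncated.

    #include <vector>

    // Mark the legal loads touched by each requested member and scale the
    // memory-op cost by the fraction of legal loads actually used.
    unsigned scaleByUsedInstsSketch(unsigned Cost, unsigned NumElts,
                                    unsigned Factor, unsigned NumLegalInsts,
                                    const std::vector<unsigned> &Indices) {
      unsigned NumSubElts = NumElts / Factor;
      unsigned NumEltsPerLegalInst = (NumElts + NumLegalInsts - 1) / NumLegalInsts;
      std::vector<bool> Used(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          Used[(Index + Elt * Factor) / NumEltsPerLegalInst] = true;
      unsigned UsedCount = 0;
      for (bool B : Used)
        UsedCount += B;
      return Cost * UsedCount / NumLegalInsts;
    }
    // Example: NumElts=16, Factor=8, NumLegalInsts=8, Indices={0} touches legal
    // loads 0 and 4 (elements [0:1] and [8:9]), so only 2 of the 8 loads count.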
1330
1331 /// Get intrinsic cost based on arguments.
1332 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1333 TTI::TargetCostKind CostKind) {
1334 // Check for generically free intrinsics.
1335 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
3: Calling 'InstructionCost::operator=='
8: Returning from 'InstructionCost::operator=='
9: Taking false branch
1336 return 0;
1337
1338 // Assume that target intrinsics are cheap.
1339 Intrinsic::ID IID = ICA.getID();
1340 if (Function::isTargetIntrinsic(IID))
10: Assuming the condition is false
11: Taking false branch
1341 return TargetTransformInfo::TCC_Basic;
1342
1343 if (ICA.isTypeBasedOnly())
12: Calling 'IntrinsicCostAttributes::isTypeBasedOnly'
18: Returning from 'IntrinsicCostAttributes::isTypeBasedOnly'
19: Taking false branch
1344 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1345
1346 Type *RetTy = ICA.getReturnType();
1347
1348 ElementCount RetVF =
1349 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
20: '?' condition is false
1350 : ElementCount::getFixed(1));
1351 const IntrinsicInst *I = ICA.getInst();
1352 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1353 FastMathFlags FMF = ICA.getFlags();
1354 switch (IID) {
21: Control jumps to 'case fshl:' at line 1445
1355 default:
1356 break;
1357
1358 case Intrinsic::cttz:
1359 // FIXME: If necessary, this should go in target-specific overrides.
1360 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1361 return TargetTransformInfo::TCC_Basic;
1362 break;
1363
1364 case Intrinsic::ctlz:
1365 // FIXME: If necessary, this should go in target-specific overrides.
1366 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1367 return TargetTransformInfo::TCC_Basic;
1368 break;
1369
1370 case Intrinsic::memcpy:
1371 return thisT()->getMemcpyCost(ICA.getInst());
1372
1373 case Intrinsic::masked_scatter: {
1374 const Value *Mask = Args[3];
1375 bool VarMask = !isa<Constant>(Mask);
1376 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1377 return thisT()->getGatherScatterOpCost(Instruction::Store,
1378 ICA.getArgTypes()[0], Args[1],
1379 VarMask, Alignment, CostKind, I);
1380 }
1381 case Intrinsic::masked_gather: {
1382 const Value *Mask = Args[2];
1383 bool VarMask = !isa<Constant>(Mask);
1384 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1385 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1386 VarMask, Alignment, CostKind, I);
1387 }
1388 case Intrinsic::experimental_stepvector: {
1389 if (isa<ScalableVectorType>(RetTy))
1390 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1391 // The cost of materialising a constant integer vector.
1392 return TargetTransformInfo::TCC_Basic;
1393 }
1394 case Intrinsic::experimental_vector_extract: {
1395 // FIXME: Handle case where a scalable vector is extracted from a scalable
1396 // vector
1397 if (isa<ScalableVectorType>(RetTy))
1398 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1399 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1400 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1401 cast<VectorType>(Args[0]->getType()), None,
1402 Index, cast<VectorType>(RetTy));
1403 }
1404 case Intrinsic::experimental_vector_insert: {
1405 // FIXME: Handle case where a scalable vector is inserted into a scalable
1406 // vector
1407 if (isa<ScalableVectorType>(Args[1]->getType()))
1408 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1409 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1410 return thisT()->getShuffleCost(
1411 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1412 Index, cast<VectorType>(Args[1]->getType()));
1413 }
1414 case Intrinsic::experimental_vector_reverse: {
1415 return thisT()->getShuffleCost(TTI::SK_Reverse,
1416 cast<VectorType>(Args[0]->getType()), None,
1417 0, cast<VectorType>(RetTy));
1418 }
1419 case Intrinsic::experimental_vector_splice: {
1420 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1421 return thisT()->getShuffleCost(TTI::SK_Splice,
1422 cast<VectorType>(Args[0]->getType()), None,
1423 Index, cast<VectorType>(RetTy));
1424 }
1425 case Intrinsic::vector_reduce_add:
1426 case Intrinsic::vector_reduce_mul:
1427 case Intrinsic::vector_reduce_and:
1428 case Intrinsic::vector_reduce_or:
1429 case Intrinsic::vector_reduce_xor:
1430 case Intrinsic::vector_reduce_smax:
1431 case Intrinsic::vector_reduce_smin:
1432 case Intrinsic::vector_reduce_fmax:
1433 case Intrinsic::vector_reduce_fmin:
1434 case Intrinsic::vector_reduce_umax:
1435 case Intrinsic::vector_reduce_umin: {
1436 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1437 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1438 }
1439 case Intrinsic::vector_reduce_fadd:
1440 case Intrinsic::vector_reduce_fmul: {
1441 IntrinsicCostAttributes Attrs(
1442 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1443 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1444 }
1445 case Intrinsic::fshl:
1446 case Intrinsic::fshr: {
1447 if (isa<ScalableVectorType>(RetTy))
22: Assuming 'RetTy' is not a 'ScalableVectorType'
23: Taking false branch
1448 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1449 const Value *X = Args[0];
1450 const Value *Y = Args[1];
1451 const Value *Z = Args[2];
1452 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1453 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1454 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1455 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1456 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1457 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
24: '?' condition is false
1458 : TTI::OP_None;
1459 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1460 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1461 InstructionCost Cost = 0;
1462 Cost +=
1463 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1464 Cost +=
1465 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1466 Cost += thisT()->getArithmeticInstrCost(
1467 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1468 Cost += thisT()->getArithmeticInstrCost(
1469 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1470      // Non-constant shift amounts require a modulo.
1471 if (OpKindZ != TTI::OK_UniformConstantValue &&
25: Assuming 'OpKindZ' is equal to OK_UniformConstantValue
1472 OpKindZ != TTI::OK_NonUniformConstantValue)
1473 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1474 CostKind, OpKindZ, OpKindBW,
1475 OpPropsZ, OpPropsBW);
1476 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1477 if (X != Y) {
26: Assuming 'X' is not equal to 'Y'
27: Taking true branch
1478 Type *CondTy = RetTy->getWithNewBitWidth(1);
28: Calling 'Type::getWithNewBitWidth'
36: Returning from 'Type::getWithNewBitWidth'
37: 'CondTy' initialized here
1479 Cost +=
1480 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
38: Passing 'CondTy' via 3rd parameter 'CondTy'
39: Calling 'ARMTTIImpl::getCmpSelInstrCost'
1481 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1482 Cost +=
1483 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1484 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1485 }
1486 return Cost;
1487 }
1488 }
1489
1490 // Assume that we need to scalarize this intrinsic.
1491 // Compute the scalarization overhead based on Args for a vector
1492 // intrinsic.
1493 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1494 if (RetVF.isVector() && !RetVF.isScalable()) {
1495 ScalarizationCost = 0;
1496 if (!RetTy->isVoidTy())
1497 ScalarizationCost +=
1498 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1499 ScalarizationCost +=
1500 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1501 }
1502
1503 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1504 ScalarizationCost);
1505 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1506 }
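The fshl/fshr case above (lines 1459-1486) sums the costs of the or, sub, shl and lshr pieces, adds a urem for variable shift amounts, and adds a cmp/select pair when the two inputs differ (i.e. the funnel shift is not a rotate). A compact sketch of that sum, with every per-operation cost passed in as a stand-in for the corresponding cost query.

    // Sketch of the funnel-shift pricing at lines 1459-1486.
    unsigned funnelShiftCostSketch(unsigned OrCost, unsigned SubCost,
                                   unsigned ShlCost, unsigned LShrCost,
                                   bool ConstantShiftAmount, unsigned URemCost,
                                   bool IsRotate, unsigned CmpCost,
                                   unsigned SelCost) {
      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      unsigned Cost = OrCost + SubCost + ShlCost + LShrCost;
      if (!ConstantShiftAmount)
        Cost += URemCost;              // Z % BW for non-constant shift amounts
      if (!IsRotate)
        Cost += CmpCost + SelCost;     // shift-by-zero handling when X != Y
      return Cost;
    }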
1507
1508 /// Get intrinsic cost based on argument types.
1509 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1510 /// cost of scalarizing the arguments and the return value will be computed
1511 /// based on types.
1512 InstructionCost
1513 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1514 TTI::TargetCostKind CostKind) {
1515 Intrinsic::ID IID = ICA.getID();
1516 Type *RetTy = ICA.getReturnType();
1517 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1518 FastMathFlags FMF = ICA.getFlags();
1519 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1520 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1521
1522 VectorType *VecOpTy = nullptr;
1523 if (!Tys.empty()) {
1524 // The vector reduction operand is operand 0 except for fadd/fmul.
1525 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1526 unsigned VecTyIndex = 0;
1527 if (IID == Intrinsic::vector_reduce_fadd ||
1528 IID == Intrinsic::vector_reduce_fmul)
1529 VecTyIndex = 1;
1530 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")(static_cast<void> (0));
1531 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1532 }
1533
1534 // Library call cost - other than size, make it expensive.
1535 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1536 SmallVector<unsigned, 2> ISDs;
1537 switch (IID) {
1538 default: {
1539 // Scalable vectors cannot be scalarized, so return Invalid.
1540 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1541 return isa<ScalableVectorType>(Ty);
1542 }))
1543 return InstructionCost::getInvalid();
1544
1545 // Assume that we need to scalarize this intrinsic.
1546 InstructionCost ScalarizationCost =
1547 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1548 unsigned ScalarCalls = 1;
1549 Type *ScalarRetTy = RetTy;
1550 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1551 if (!SkipScalarizationCost)
1552 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1553 ScalarCalls = std::max(ScalarCalls,
1554 cast<FixedVectorType>(RetVTy)->getNumElements());
1555 ScalarRetTy = RetTy->getScalarType();
1556 }
1557 SmallVector<Type *, 4> ScalarTys;
1558 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1559 Type *Ty = Tys[i];
1560 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1561 if (!SkipScalarizationCost)
1562 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1563 ScalarCalls = std::max(ScalarCalls,
1564 cast<FixedVectorType>(VTy)->getNumElements());
1565 Ty = Ty->getScalarType();
1566 }
1567 ScalarTys.push_back(Ty);
1568 }
1569 if (ScalarCalls == 1)
1570 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1571
1572 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1573 InstructionCost ScalarCost =
1574 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1575
1576 return ScalarCalls * ScalarCost + ScalarizationCost;
1577 }
1578 // Look for intrinsics that can be lowered directly or turned into a scalar
1579 // intrinsic call.
1580 case Intrinsic::sqrt:
1581 ISDs.push_back(ISD::FSQRT);
1582 break;
1583 case Intrinsic::sin:
1584 ISDs.push_back(ISD::FSIN);
1585 break;
1586 case Intrinsic::cos:
1587 ISDs.push_back(ISD::FCOS);
1588 break;
1589 case Intrinsic::exp:
1590 ISDs.push_back(ISD::FEXP);
1591 break;
1592 case Intrinsic::exp2:
1593 ISDs.push_back(ISD::FEXP2);
1594 break;
1595 case Intrinsic::log:
1596 ISDs.push_back(ISD::FLOG);
1597 break;
1598 case Intrinsic::log10:
1599 ISDs.push_back(ISD::FLOG10);
1600 break;
1601 case Intrinsic::log2:
1602 ISDs.push_back(ISD::FLOG2);
1603 break;
1604 case Intrinsic::fabs:
1605 ISDs.push_back(ISD::FABS);
1606 break;
1607 case Intrinsic::canonicalize:
1608 ISDs.push_back(ISD::FCANONICALIZE);
1609 break;
1610 case Intrinsic::minnum:
1611 ISDs.push_back(ISD::FMINNUM);
1612 break;
1613 case Intrinsic::maxnum:
1614 ISDs.push_back(ISD::FMAXNUM);
1615 break;
1616 case Intrinsic::minimum:
1617 ISDs.push_back(ISD::FMINIMUM);
1618 break;
1619 case Intrinsic::maximum:
1620 ISDs.push_back(ISD::FMAXIMUM);
1621 break;
1622 case Intrinsic::copysign:
1623 ISDs.push_back(ISD::FCOPYSIGN);
1624 break;
1625 case Intrinsic::floor:
1626 ISDs.push_back(ISD::FFLOOR);
1627 break;
1628 case Intrinsic::ceil:
1629 ISDs.push_back(ISD::FCEIL);
1630 break;
1631 case Intrinsic::trunc:
1632 ISDs.push_back(ISD::FTRUNC);
1633 break;
1634 case Intrinsic::nearbyint:
1635 ISDs.push_back(ISD::FNEARBYINT);
1636 break;
1637 case Intrinsic::rint:
1638 ISDs.push_back(ISD::FRINT);
1639 break;
1640 case Intrinsic::round:
1641 ISDs.push_back(ISD::FROUND);
1642 break;
1643 case Intrinsic::roundeven:
1644 ISDs.push_back(ISD::FROUNDEVEN);
1645 break;
1646 case Intrinsic::pow:
1647 ISDs.push_back(ISD::FPOW);
1648 break;
1649 case Intrinsic::fma:
1650 ISDs.push_back(ISD::FMA);
1651 break;
1652 case Intrinsic::fmuladd:
1653 ISDs.push_back(ISD::FMA);
1654 break;
1655 case Intrinsic::experimental_constrained_fmuladd:
1656 ISDs.push_back(ISD::STRICT_FMA);
1657 break;
1658 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1659 case Intrinsic::lifetime_start:
1660 case Intrinsic::lifetime_end:
1661 case Intrinsic::sideeffect:
1662 case Intrinsic::pseudoprobe:
1663 case Intrinsic::arithmetic_fence:
1664 return 0;
1665 case Intrinsic::masked_store: {
1666 Type *Ty = Tys[0];
1667 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1668 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1669 CostKind);
1670 }
1671 case Intrinsic::masked_load: {
1672 Type *Ty = RetTy;
1673 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1674 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1675 CostKind);
1676 }
1677 case Intrinsic::vector_reduce_add:
1678 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1679 None, CostKind);
1680 case Intrinsic::vector_reduce_mul:
1681 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1682 None, CostKind);
1683 case Intrinsic::vector_reduce_and:
1684 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1685 None, CostKind);
1686 case Intrinsic::vector_reduce_or:
1687 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1688 CostKind);
1689 case Intrinsic::vector_reduce_xor:
1690 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1691 None, CostKind);
1692 case Intrinsic::vector_reduce_fadd:
1693 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1694 FMF, CostKind);
1695 case Intrinsic::vector_reduce_fmul:
1696 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1697 FMF, CostKind);
1698 case Intrinsic::vector_reduce_smax:
1699 case Intrinsic::vector_reduce_smin:
1700 case Intrinsic::vector_reduce_fmax:
1701 case Intrinsic::vector_reduce_fmin:
1702 return thisT()->getMinMaxReductionCost(
1703 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1704 /*IsUnsigned=*/false, CostKind);
1705 case Intrinsic::vector_reduce_umax:
1706 case Intrinsic::vector_reduce_umin:
1707 return thisT()->getMinMaxReductionCost(
1708 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1709 /*IsUnsigned=*/true, CostKind);
1710 case Intrinsic::abs:
1711 case Intrinsic::smax:
1712 case Intrinsic::smin:
1713 case Intrinsic::umax:
1714 case Intrinsic::umin: {
1715 // abs(X) = select(icmp(X,0),X,sub(0,X))
1716 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1717 Type *CondTy = RetTy->getWithNewBitWidth(1);
1718 InstructionCost Cost = 0;
1719 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1720 Cost +=
1721 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1722 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1723 Cost +=
1724 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1725 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1726 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1727 if (IID == Intrinsic::abs)
1728 Cost += thisT()->getArithmeticInstrCost(
1729 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1730 return Cost;
1731 }
1732 case Intrinsic::sadd_sat:
1733 case Intrinsic::ssub_sat: {
1734 Type *CondTy = RetTy->getWithNewBitWidth(1);
1735
1736 Type *OpTy = StructType::create({RetTy, CondTy});
1737 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1738 ? Intrinsic::sadd_with_overflow
1739 : Intrinsic::ssub_with_overflow;
1740
1741 // SatMax -> Overflow && SumDiff < 0
1742 // SatMin -> Overflow && SumDiff >= 0
1743 InstructionCost Cost = 0;
1744 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1745 nullptr, ScalarizationCostPassed);
1746 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1747 Cost +=
1748 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1749 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1750 Cost += 2 * thisT()->getCmpSelInstrCost(
1751 BinaryOperator::Select, RetTy, CondTy,
1752 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1753 return Cost;
1754 }
1755 case Intrinsic::uadd_sat:
1756 case Intrinsic::usub_sat: {
1757 Type *CondTy = RetTy->getWithNewBitWidth(1);
1758
1759 Type *OpTy = StructType::create({RetTy, CondTy});
1760 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1761 ? Intrinsic::uadd_with_overflow
1762 : Intrinsic::usub_with_overflow;
1763
1764 InstructionCost Cost = 0;
1765 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1766 nullptr, ScalarizationCostPassed);
1767 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1768 Cost +=
1769 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1770 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1771 return Cost;
1772 }
1773 case Intrinsic::smul_fix:
1774 case Intrinsic::umul_fix: {
1775 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1776 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1777
1778 unsigned ExtOp =
1779 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1780 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1781
1782 InstructionCost Cost = 0;
1783 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1784 Cost +=
1785 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1786 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1787 CCH, CostKind);
1788 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1789 CostKind, TTI::OK_AnyValue,
1790 TTI::OK_UniformConstantValue);
1791 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1792 TTI::OK_AnyValue,
1793 TTI::OK_UniformConstantValue);
1794 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1795 return Cost;
1796 }
1797 case Intrinsic::sadd_with_overflow:
1798 case Intrinsic::ssub_with_overflow: {
1799 Type *SumTy = RetTy->getContainedType(0);
1800 Type *OverflowTy = RetTy->getContainedType(1);
1801 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1802 ? BinaryOperator::Add
1803 : BinaryOperator::Sub;
1804
1805 // LHSSign -> LHS >= 0
1806 // RHSSign -> RHS >= 0
1807 // SumSign -> Sum >= 0
1808 //
1809 // Add:
1810 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1811 // Sub:
1812 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1813 InstructionCost Cost = 0;
1814 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1815 Cost += 3 * thisT()->getCmpSelInstrCost(
1816 Instruction::ICmp, SumTy, OverflowTy,
1817 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1818 Cost += 2 * thisT()->getCmpSelInstrCost(
1819 Instruction::Select, OverflowTy, OverflowTy,
1820 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1821 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1822 CostKind);
1823 return Cost;
1824 }
1825 case Intrinsic::uadd_with_overflow:
1826 case Intrinsic::usub_with_overflow: {
1827 Type *SumTy = RetTy->getContainedType(0);
1828 Type *OverflowTy = RetTy->getContainedType(1);
1829 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1830 ? BinaryOperator::Add
1831 : BinaryOperator::Sub;
1832
1833 InstructionCost Cost = 0;
1834 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1835 Cost +=
1836 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1837 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1838 return Cost;
1839 }
1840 case Intrinsic::smul_with_overflow:
1841 case Intrinsic::umul_with_overflow: {
1842 Type *MulTy = RetTy->getContainedType(0);
1843 Type *OverflowTy = RetTy->getContainedType(1);
1844 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1845 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1846
1847 unsigned ExtOp =
1848 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1849 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1850
1851 InstructionCost Cost = 0;
1852 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1853 Cost +=
1854 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1855 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1856 CCH, CostKind);
1857 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1858 CostKind, TTI::OK_AnyValue,
1859 TTI::OK_UniformConstantValue);
1860
1861 if (IID == Intrinsic::smul_with_overflow)
1862 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1863 CostKind, TTI::OK_AnyValue,
1864 TTI::OK_UniformConstantValue);
1865
1866 Cost +=
1867 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1868 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1869 return Cost;
1870 }
1871 case Intrinsic::ctpop:
1872 ISDs.push_back(ISD::CTPOP);
1873 // In case of legalization use TCC_Expensive. This is cheaper than a
1874 // library call but still not a cheap instruction.
1875 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1876 break;
1877 case Intrinsic::ctlz:
1878 ISDs.push_back(ISD::CTLZ);
1879 break;
1880 case Intrinsic::cttz:
1881 ISDs.push_back(ISD::CTTZ);
1882 break;
1883 case Intrinsic::bswap:
1884 ISDs.push_back(ISD::BSWAP);
1885 break;
1886 case Intrinsic::bitreverse:
1887 ISDs.push_back(ISD::BITREVERSE);
1888 break;
1889 }
1890
1891 const TargetLoweringBase *TLI = getTLI();
1892 std::pair<InstructionCost, MVT> LT =
1893 TLI->getTypeLegalizationCost(DL, RetTy);
1894
1895 SmallVector<InstructionCost, 2> LegalCost;
1896 SmallVector<InstructionCost, 2> CustomCost;
1897 for (unsigned ISD : ISDs) {
1898 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1899 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1900 TLI->isFAbsFree(LT.second)) {
1901 return 0;
1902 }
1903
1904 // The operation is legal. Assume it costs 1.
1905 // If the type is split to multiple registers, assume that there is some
1906 // overhead to this.
1907 // TODO: Once we have extract/insert subvector cost we need to use them.
1908 if (LT.first > 1)
1909 LegalCost.push_back(LT.first * 2);
1910 else
1911 LegalCost.push_back(LT.first * 1);
1912 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1913 // If the operation is custom lowered then assume
1914 // that the code is twice as expensive.
1915 CustomCost.push_back(LT.first * 2);
1916 }
1917 }
1918
1919 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1920 if (MinLegalCostI != LegalCost.end())
1921 return *MinLegalCostI;
1922
1923 auto MinCustomCostI =
1924 std::min_element(CustomCost.begin(), CustomCost.end());
1925 if (MinCustomCostI != CustomCost.end())
1926 return *MinCustomCostI;
1927
1928 // If we can't lower fmuladd into an FMA estimate the cost as a floating
1929 // point mul followed by an add.
1930 if (IID == Intrinsic::fmuladd)
1931 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1932 CostKind) +
1933 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1934 CostKind);
1935 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1936 IntrinsicCostAttributes FMulAttrs(
1937 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1938 IntrinsicCostAttributes FAddAttrs(
1939 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1940 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1941 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1942 }
1943
1944 // Else, assume that we need to scalarize this intrinsic. For math builtins
1945 // this will emit a costly libcall, adding call overhead and spills. Make it
1946 // very expensive.
1947 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1948 // Scalable vectors cannot be scalarized, so return Invalid.
1949 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1950 return isa<ScalableVectorType>(Ty);
1951 }))
1952 return InstructionCost::getInvalid();
1953
1954 InstructionCost ScalarizationCost =
1955 SkipScalarizationCost ? ScalarizationCostPassed
1956 : getScalarizationOverhead(RetVTy, true, false);
1957
1958 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1959 SmallVector<Type *, 4> ScalarTys;
1960 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1961 Type *Ty = Tys[i];
1962 if (Ty->isVectorTy())
1963 Ty = Ty->getScalarType();
1964 ScalarTys.push_back(Ty);
1965 }
1966 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1967 InstructionCost ScalarCost =
1968 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1969 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1970 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1971 if (!ICA.skipScalarizationCost())
1972 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1973 ScalarCalls = std::max(ScalarCalls,
1974 cast<FixedVectorType>(VTy)->getNumElements());
1975 }
1976 }
1977 return ScalarCalls * ScalarCost + ScalarizationCost;
1978 }
1979
1980 // This is going to be turned into a library call, make it expensive.
1981 return SingleCallCost;
1982 }
1983
1984 /// Compute a cost of the given call instruction.
1985 ///
1986 /// Compute the cost of calling function F with return type RetTy and
1987 /// argument types Tys. F might be nullptr, in this case the cost of an
1988 /// arbitrary call with the specified signature will be returned.
1989 /// This is used, for instance, when we estimate call of a vector
1990 /// counterpart of the given function.
1991 /// \param F Called function, might be nullptr.
1992 /// \param RetTy Return value types.
1993 /// \param Tys Argument types.
1994 /// \returns The cost of Call instruction.
1995 InstructionCost
1996 getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1997 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1998 return 10;
1999 }
2000
2001 unsigned getNumberOfParts(Type *Tp) {
2002 std::pair<InstructionCost, MVT> LT =
2003 getTLI()->getTypeLegalizationCost(DL, Tp);
2004 return *LT.first.getValue();
2005 }
2006
2007 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2008 const SCEV *) {
2009 return 0;
2010 }
2011
2012 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2013 /// We're assuming that reduction operations are performed in the following way:
2014 ///
2015 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2016 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2017 /// \----------------v-------------/ \----------v------------/
2018 /// n/2 elements n/2 elements
2019 /// %red1 = op <n x t> %val, <n x t> val1
2020 /// After this operation we have a vector %red1 where only the first n/2
2021 /// elements are meaningful, the second n/2 elements are undefined and can be
2022 /// dropped. All other operations are actually working with the vector of
2023 /// length n/2, not n, though the real vector length is still n.
2024 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2025 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2026 /// \----------------v-------------/ \----------v------------/
2027 /// n/4 elements 3*n/4 elements
2028 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2029 /// length n/2, the resulting vector has length n/4 etc.
2030 ///
2031 /// The cost model should take into account that the actual length of the
2032 /// vector is reduced on each iteration.
2033 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2034 TTI::TargetCostKind CostKind) {
2035 Type *ScalarTy = Ty->getElementType();
2036 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2037 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2038 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2039 NumVecElts >= 2) {
2040 // Or reduction for i1 is represented as:
2041 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2042 // %res = cmp ne iReduxWidth %val, 0
2043 // And reduction for i1 is represented as:
2044 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2045 // %res = cmp eq iReduxWidth %val, 11111
2046 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2047 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2048 TTI::CastContextHint::None, CostKind) +
2049 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2050 CmpInst::makeCmpResultType(ValTy),
2051 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2052 }
2053 unsigned NumReduxLevels = Log2_32(NumVecElts);
2054 InstructionCost ArithCost = 0;
2055 InstructionCost ShuffleCost = 0;
2056 std::pair<InstructionCost, MVT> LT =
2057 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2058 unsigned LongVectorCount = 0;
2059 unsigned MVTLen =
2060 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2061 while (NumVecElts > MVTLen) {
2062 NumVecElts /= 2;
2063 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2064 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2065 NumVecElts, SubTy);
2066 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2067 Ty = SubTy;
2068 ++LongVectorCount;
2069 }
2070
2071 NumReduxLevels -= LongVectorCount;
2072
2073 // The minimal length of the vector is limited by the real length of vector
2074 // operations performed on the current platform. That's why several final
2075 // reduction operations are performed on the vectors with the same
2076 // architecture-dependent length.
2077
2078 // By default reductions need one shuffle per reduction level.
2079 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2080 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2081 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2082 return ShuffleCost + ArithCost +
2083 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2084 }
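
The halving loop above can be traced with a standalone sketch (plain C++, not LLVM code) that reproduces the per-level shuffle and arithmetic accounting for a <16 x i32> add reduction legalized to 4-element registers; the unit costs are assumptions chosen only to make the arithmetic visible.

#include <cstdio>

int main() {
  // <16 x i32> add reduction, legalized to 4-element registers (assumed).
  unsigned NumVecElts = 16;
  unsigned MVTLen = 4;
  unsigned ShuffleUnit = 1, ArithUnit = 1, ExtractUnit = 1; // assumed costs

  unsigned NumReduxLevels = 4; // Log2_32(16)
  unsigned ShuffleCost = 0, ArithCost = 0, LongVectorCount = 0;

  // While the vector is wider than one register, each halving step pays for
  // an extract-subvector shuffle plus an arithmetic op on the narrower type.
  while (NumVecElts > MVTLen) {
    NumVecElts /= 2;
    ShuffleCost += ShuffleUnit;
    ArithCost += ArithUnit;
    ++LongVectorCount;
  }

  // The remaining levels fit in one register: one permute + one op per level.
  NumReduxLevels -= LongVectorCount;
  ShuffleCost += NumReduxLevels * ShuffleUnit;
  ArithCost += NumReduxLevels * ArithUnit;

  // Plus a final extractelement of lane 0.
  unsigned Total = ShuffleCost + ArithCost + ExtractUnit;
  std::printf("tree reduction cost: %u\n", Total); // 4 + 4 + 1 = 9
  return 0;
}
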
2085
2086 /// Try to calculate the cost of performing strict (in-order) reductions,
2087 /// which involves doing a sequence of floating point additions in lane
2088 /// order, starting with an initial value. For example, consider a scalar
2089 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2090 ///
2091 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2092 ///
2093 /// %add1 = %InitVal + %v0
2094 /// %add2 = %add1 + %v1
2095 /// %add3 = %add2 + %v2
2096 /// %add4 = %add3 + %v3
2097 ///
2098 /// As a simple estimate we can say the cost of such a reduction is 4 times
2099 /// the cost of a scalar FP addition. We can only estimate the costs for
2100 /// fixed-width vectors here because for scalable vectors we do not know the
2101 /// runtime number of operations.
2102 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2103 TTI::TargetCostKind CostKind) {
2104 // Targets must implement a default value for the scalable case, since
2105 // we don't know how many lanes the vector has.
2106 if (isa<ScalableVectorType>(Ty))
2107 return InstructionCost::getInvalid();
2108
2109 auto *VTy = cast<FixedVectorType>(Ty);
2110 InstructionCost ExtractCost =
2111 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2112 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2113 Opcode, VTy->getElementType(), CostKind);
2114 ArithCost *= VTy->getNumElements();
2115
2116 return ExtractCost + ArithCost;
2117 }
2118
2119 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2120 Optional<FastMathFlags> FMF,
2121 TTI::TargetCostKind CostKind) {
2122 if (TTI::requiresOrderedReduction(FMF))
2123 return getOrderedReductionCost(Opcode, Ty, CostKind);
2124 return getTreeReductionCost(Opcode, Ty, CostKind);
2125 }
2126
2127 /// Try to calculate op costs for min/max reduction operations.
2128 /// \param CondTy Conditional type for the Select instruction.
2129 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2130 bool IsUnsigned,
2131 TTI::TargetCostKind CostKind) {
2132 Type *ScalarTy = Ty->getElementType();
2133 Type *ScalarCondTy = CondTy->getElementType();
2134 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2135 unsigned NumReduxLevels = Log2_32(NumVecElts);
2136 unsigned CmpOpcode;
2137 if (Ty->isFPOrFPVectorTy()) {
2138 CmpOpcode = Instruction::FCmp;
2139 } else {
2140 assert(Ty->isIntOrIntVectorTy() &&
2141 "expecting floating point or integer type for min/max reduction");
2142 CmpOpcode = Instruction::ICmp;
2143 }
2144 InstructionCost MinMaxCost = 0;
2145 InstructionCost ShuffleCost = 0;
2146 std::pair<InstructionCost, MVT> LT =
2147 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2148 unsigned LongVectorCount = 0;
2149 unsigned MVTLen =
2150 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2151 while (NumVecElts > MVTLen) {
2152 NumVecElts /= 2;
2153 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2154 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2155
2156 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2157 NumVecElts, SubTy);
2158 MinMaxCost +=
2159 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2160 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2161 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2162 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2163 Ty = SubTy;
2164 ++LongVectorCount;
2165 }
2166
2167 NumReduxLevels -= LongVectorCount;
2168
2169 // The minimal length of the vector is limited by the real length of vector
2170 // operations performed on the current platform. That's why several final
2171 // reduction operations are performed on the vectors with the same
2172 // architecture-dependent length.
2173 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2174 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2175 MinMaxCost +=
2176 NumReduxLevels *
2177 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2178 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2179 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2180 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2181 // The last min/max should be in vector registers and we counted it above.
2182 // So just need a single extractelement.
2183 return ShuffleCost + MinMaxCost +
2184 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2185 }
2186
2187 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2188 Type *ResTy, VectorType *Ty,
2189 TTI::TargetCostKind CostKind) {
2190 // Without any native support, this is equivalent to the cost of
2191 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2192 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2193 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2194 Instruction::Add, ExtTy, None, CostKind);
2195 InstructionCost MulCost = 0;
2196 InstructionCost ExtCost = thisT()->getCastInstrCost(
2197 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2198 TTI::CastContextHint::None, CostKind);
2199 if (IsMLA) {
2200 MulCost =
2201 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2202 ExtCost *= 2;
2203 }
2204
2205 return RedCost + MulCost + ExtCost;
2206 }
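
As a small worked example of the decomposition above, the following standalone sketch (plain C++, not LLVM code) evaluates RedCost + MulCost + ExtCost with the MLA doubling of the extension cost; all numbers are assumptions for illustration only.

#include <cstdio>

int main() {
  // vecreduce.add(mul(ext(a), ext(b))) decomposition, MLA form.
  unsigned RedCost = 4;  // assumed vecreduce.add cost on the widened type
  unsigned ExtCost = 1;  // assumed cost of one zext/sext of an input vector
  unsigned MulCost = 1;  // assumed widened multiply cost
  bool IsMLA = true;

  // Mirrors RedCost + MulCost + ExtCost, with ExtCost doubled for MLA
  // because both multiplicands are extended.
  unsigned Total = RedCost + (IsMLA ? MulCost + 2 * ExtCost : ExtCost);
  std::printf("extended add reduction cost: %u\n", Total); // 4 + 1 + 2 = 7
  return 0;
}
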
2207
2208 InstructionCost getVectorSplitCost() { return 1; }
2209
2210 /// @}
2211};
2212
2213/// Concrete BasicTTIImpl that can be used if no further customization
2214/// is needed.
2215class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2216 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2217
2218 friend class BasicTTIImplBase<BasicTTIImpl>;
2219
2220 const TargetSubtargetInfo *ST;
2221 const TargetLoweringBase *TLI;
2222
2223 const TargetSubtargetInfo *getST() const { return ST; }
2224 const TargetLoweringBase *getTLI() const { return TLI; }
2225
2226public:
2227 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2228};
2229
2230} // end namespace llvm
2231
2232#endif // LLVM_CODEGEN_BASICTTIIMPL_H

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/InstructionCost.h

1//===- InstructionCost.h ----------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file defines an InstructionCost class that is used when calculating
10/// the cost of an instruction, or a group of instructions. In addition to a
11/// numeric value representing the cost the class also contains a state that
12/// can be used to encode particular properties, such as a cost being invalid.
13/// Operations on InstructionCost implement saturation arithmetic, so that
14/// accumulating costs on large cost-values don't overflow.
15///
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_SUPPORT_INSTRUCTIONCOST_H
19#define LLVM_SUPPORT_INSTRUCTIONCOST_H
20
21#include "llvm/ADT/Optional.h"
22#include "llvm/Support/MathExtras.h"
23#include <limits>
24
25namespace llvm {
26
27class raw_ostream;
28
29class InstructionCost {
30public:
31 using CostType = int64_t;
32
33 /// CostState describes the state of a cost.
34 enum CostState {
35 Valid, /// < The cost value represents a valid cost, even when the
36 /// cost-value is large.
37 Invalid /// < Invalid indicates there is no way to represent the cost as a
38 /// numeric value. This state exists to represent a possible issue,
39 /// e.g. if the cost-model knows the operation cannot be expanded
40 /// into a valid code-sequence by the code-generator. While some
41 /// passes may assert that the calculated cost must be valid, it is
42 /// up to individual passes how to interpret an Invalid cost. For
43 /// example, a transformation pass could choose not to perform a
44 /// transformation if the resulting cost would end up Invalid.
45 /// Because some passes may assert a cost is Valid, it is not
46 /// recommended to use Invalid costs to model 'Unknown'.
47 /// Note that Invalid is semantically different from a (very) high,
48 /// but valid cost, which intentionally indicates no issue, but
49 /// rather a strong preference not to select a certain operation.
50 };
51
52private:
53 CostType Value = 0;
54 CostState State = Valid;
55
56 void propagateState(const InstructionCost &RHS) {
57 if (RHS.State == Invalid)
58 State = Invalid;
59 }
60
61 static CostType getMaxValue() { return std::numeric_limits<CostType>::max(); }
62 static CostType getMinValue() { return std::numeric_limits<CostType>::min(); }
63
64public:
65 // A default constructed InstructionCost is a valid zero cost
66 InstructionCost() = default;
67
68 InstructionCost(CostState) = delete;
69 InstructionCost(CostType Val) : Value(Val), State(Valid) {}
70
71 static InstructionCost getMax() { return getMaxValue(); }
72 static InstructionCost getMin() { return getMinValue(); }
73 static InstructionCost getInvalid(CostType Val = 0) {
74 InstructionCost Tmp(Val);
75 Tmp.setInvalid();
76 return Tmp;
77 }
78
79 bool isValid() const { return State == Valid; }
80 void setValid() { State = Valid; }
81 void setInvalid() { State = Invalid; }
82 CostState getState() const { return State; }
83
84 /// This function is intended to be used as sparingly as possible, since the
85 /// class provides the full range of operator support required for arithmetic
86 /// and comparisons.
87 Optional<CostType> getValue() const {
88 if (isValid())
89 return Value;
90 return None;
91 }
92
93 /// For all of the arithmetic operators provided here any invalid state is
94 /// perpetuated and cannot be removed. Once a cost becomes invalid it stays
95 /// invalid, and it also inherits any invalid state from the RHS.
96 /// Arithmetic work on the actual values is implemented with saturation,
97 /// to avoid overflow when using more extreme cost values.
98
99 InstructionCost &operator+=(const InstructionCost &RHS) {
100 propagateState(RHS);
101
102 // Saturating addition.
103 InstructionCost::CostType Result;
104 if (AddOverflow(Value, RHS.Value, Result))
105 Result = RHS.Value > 0 ? getMaxValue() : getMinValue();
106
107 Value = Result;
108 return *this;
109 }
110
111 InstructionCost &operator+=(const CostType RHS) {
112 InstructionCost RHS2(RHS);
113 *this += RHS2;
114 return *this;
115 }
116
117 InstructionCost &operator-=(const InstructionCost &RHS) {
118 propagateState(RHS);
119
120 // Saturating subtract.
121 InstructionCost::CostType Result;
122 if (SubOverflow(Value, RHS.Value, Result))
123 Result = RHS.Value > 0 ? getMinValue() : getMaxValue();
124 Value = Result;
125 return *this;
126 }
127
128 InstructionCost &operator-=(const CostType RHS) {
129 InstructionCost RHS2(RHS);
130 *this -= RHS2;
131 return *this;
132 }
133
134 InstructionCost &operator*=(const InstructionCost &RHS) {
135 propagateState(RHS);
136
137 // Saturating multiply.
138 InstructionCost::CostType Result;
139 if (MulOverflow(Value, RHS.Value, Result)) {
140 if ((Value > 0 && RHS.Value > 0) || (Value < 0 && RHS.Value < 0))
141 Result = getMaxValue();
142 else
143 Result = getMinValue();
144 }
145
146 Value = Result;
147 return *this;
148 }
149
150 InstructionCost &operator*=(const CostType RHS) {
151 InstructionCost RHS2(RHS);
152 *this *= RHS2;
153 return *this;
154 }
155
156 InstructionCost &operator/=(const InstructionCost &RHS) {
157 propagateState(RHS);
158 Value /= RHS.Value;
159 return *this;
160 }
161
162 InstructionCost &operator/=(const CostType RHS) {
163 InstructionCost RHS2(RHS);
164 *this /= RHS2;
165 return *this;
166 }
167
168 InstructionCost &operator++() {
169 *this += 1;
170 return *this;
171 }
172
173 InstructionCost operator++(int) {
174 InstructionCost Copy = *this;
175 ++*this;
176 return Copy;
177 }
178
179 InstructionCost &operator--() {
180 *this -= 1;
181 return *this;
182 }
183
184 InstructionCost operator--(int) {
185 InstructionCost Copy = *this;
186 --*this;
187 return Copy;
188 }
189
190 /// For the comparison operators we have chosen to use lexicographical
191 /// ordering where valid costs are always considered to be less than invalid
192 /// costs. This avoids having to add asserts to the comparison operators that
193 /// the states are valid and users can test for validity of the cost
194 /// explicitly.
195 bool operator<(const InstructionCost &RHS) const {
196 if (State != RHS.State)
197 return State < RHS.State;
198 return Value < RHS.Value;
199 }
200
201 // Implement in terms of operator< to ensure that the two comparisons stay in
202 // sync
203 bool operator==(const InstructionCost &RHS) const {
204 return !(*this < RHS) && !(RHS < *this);
5
Returning zero, which participates in a condition later
205 }
206
207 bool operator!=(const InstructionCost &RHS) const { return !(*this == RHS); }
208
209 bool operator==(const CostType RHS) const {
210 InstructionCost RHS2(RHS);
211 return *this == RHS2;
4
Calling 'InstructionCost::operator=='
6
Returning from 'InstructionCost::operator=='
7
Returning zero, which participates in a condition later
212 }
213
214 bool operator!=(const CostType RHS) const { return !(*this == RHS); }
215
216 bool operator>(const InstructionCost &RHS) const { return RHS < *this; }
217
218 bool operator<=(const InstructionCost &RHS) const { return !(RHS < *this); }
219
220 bool operator>=(const InstructionCost &RHS) const { return !(*this < RHS); }
221
222 bool operator<(const CostType RHS) const {
223 InstructionCost RHS2(RHS);
224 return *this < RHS2;
225 }
226
227 bool operator>(const CostType RHS) const {
228 InstructionCost RHS2(RHS);
229 return *this > RHS2;
230 }
231
232 bool operator<=(const CostType RHS) const {
233 InstructionCost RHS2(RHS);
234 return *this <= RHS2;
235 }
236
237 bool operator>=(const CostType RHS) const {
238 InstructionCost RHS2(RHS);
239 return *this >= RHS2;
240 }
241
242 void print(raw_ostream &OS) const;
243
244 template <class Function>
245 auto map(const Function &F) const -> InstructionCost {
246 if (isValid())
247 return F(*getValue());
248 return getInvalid();
249 }
250};
251
252inline InstructionCost operator+(const InstructionCost &LHS,
253 const InstructionCost &RHS) {
254 InstructionCost LHS2(LHS);
255 LHS2 += RHS;
256 return LHS2;
257}
258
259inline InstructionCost operator-(const InstructionCost &LHS,
260 const InstructionCost &RHS) {
261 InstructionCost LHS2(LHS);
262 LHS2 -= RHS;
263 return LHS2;
264}
265
266inline InstructionCost operator*(const InstructionCost &LHS,
267 const InstructionCost &RHS) {
268 InstructionCost LHS2(LHS);
269 LHS2 *= RHS;
270 return LHS2;
271}
272
273inline InstructionCost operator/(const InstructionCost &LHS,
274 const InstructionCost &RHS) {
275 InstructionCost LHS2(LHS);
276 LHS2 /= RHS;
277 return LHS2;
278}
279
280inline raw_ostream &operator<<(raw_ostream &OS, const InstructionCost &V) {
281 V.print(OS);
282 return OS;
283}
284
285} // namespace llvm
286
287#endif
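
The saturation and Invalid-propagation behaviour documented in this header can be exercised with a minimal usage sketch, assuming the LLVM headers and libraries are available when compiling (this is editorial example code, not part of the analyzed file). It also shows the lexicographic comparison rule: any valid cost compares less than an invalid one.

#include "llvm/Support/InstructionCost.h"
#include <cassert>
using llvm::InstructionCost;

void example() {
  InstructionCost A = 5, B = 7;
  InstructionCost Sum = A + B;              // Valid, value 12
  assert(*Sum.getValue() == 12);

  InstructionCost Big = InstructionCost::getMax();
  Big += 1;                                 // saturates instead of overflowing
  assert(Big == InstructionCost::getMax());

  InstructionCost Bad = InstructionCost::getInvalid();
  InstructionCost C = Sum + Bad;            // Invalid state propagates
  assert(!C.isValid() && !C.getValue().hasValue());

  // Lexicographic comparison: any valid cost is less than an invalid one.
  assert(Sum < Bad);
}
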

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Analysis/TargetTransformInfo.h

1//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This pass exposes codegen information to IR-level passes. Every
10/// transformation that uses codegen information is broken into three parts:
11/// 1. The IR-level analysis pass.
12/// 2. The IR-level transformation interface which provides the needed
13/// information.
14/// 3. Codegen-level implementation which uses target-specific hooks.
15///
16/// This file defines #2, which is the interface that IR-level transformations
17/// use for querying the codegen.
18///
19//===----------------------------------------------------------------------===//
20
21#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
22#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23
24#include "llvm/IR/InstrTypes.h"
25#include "llvm/IR/Operator.h"
26#include "llvm/IR/PassManager.h"
27#include "llvm/Pass.h"
28#include "llvm/Support/AtomicOrdering.h"
29#include "llvm/Support/BranchProbability.h"
30#include "llvm/Support/DataTypes.h"
31#include "llvm/Support/InstructionCost.h"
32#include <functional>
33
34namespace llvm {
35
36namespace Intrinsic {
37typedef unsigned ID;
38}
39
40class AssumptionCache;
41class BlockFrequencyInfo;
42class DominatorTree;
43class BranchInst;
44class CallBase;
45class ExtractElementInst;
46class Function;
47class GlobalValue;
48class InstCombiner;
49class OptimizationRemarkEmitter;
50class IntrinsicInst;
51class LoadInst;
52class LoopAccessInfo;
53class Loop;
54class LoopInfo;
55class ProfileSummaryInfo;
56class RecurrenceDescriptor;
57class SCEV;
58class ScalarEvolution;
59class StoreInst;
60class SwitchInst;
61class TargetLibraryInfo;
62class Type;
63class User;
64class Value;
65class VPIntrinsic;
66struct KnownBits;
67template <typename T> class Optional;
68
69/// Information about a load/store intrinsic defined by the target.
70struct MemIntrinsicInfo {
71 /// This is the pointer that the intrinsic is loading from or storing to.
72 /// If this is non-null, then analysis/optimization passes can assume that
73 /// this intrinsic is functionally equivalent to a load/store from this
74 /// pointer.
75 Value *PtrVal = nullptr;
76
77 // Ordering for atomic operations.
78 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
79
80 // Same Id is set by the target for corresponding load/store intrinsics.
81 unsigned short MatchingId = 0;
82
83 bool ReadMem = false;
84 bool WriteMem = false;
85 bool IsVolatile = false;
86
87 bool isUnordered() const {
88 return (Ordering == AtomicOrdering::NotAtomic ||
89 Ordering == AtomicOrdering::Unordered) &&
90 !IsVolatile;
91 }
92};
93
94/// Attributes of a target dependent hardware loop.
95struct HardwareLoopInfo {
96 HardwareLoopInfo() = delete;
97 HardwareLoopInfo(Loop *L) : L(L) {}
98 Loop *L = nullptr;
99 BasicBlock *ExitBlock = nullptr;
100 BranchInst *ExitBranch = nullptr;
101 const SCEV *ExitCount = nullptr;
102 IntegerType *CountType = nullptr;
103 Value *LoopDecrement = nullptr; // Decrement the loop counter by this
104 // value in every iteration.
105 bool IsNestingLegal = false; // Can a hardware loop be a parent to
106 // another hardware loop?
107 bool CounterInReg = false; // Should loop counter be updated in
108 // the loop via a phi?
109 bool PerformEntryTest = false; // Generate the intrinsic which also performs
110 // icmp ne zero on the loop counter value and
111 // produces an i1 to guard the loop entry.
112 bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
113 DominatorTree &DT, bool ForceNestedLoop = false,
114 bool ForceHardwareLoopPHI = false);
115 bool canAnalyze(LoopInfo &LI);
116};
117
118class IntrinsicCostAttributes {
119 const IntrinsicInst *II = nullptr;
120 Type *RetTy = nullptr;
121 Intrinsic::ID IID;
122 SmallVector<Type *, 4> ParamTys;
123 SmallVector<const Value *, 4> Arguments;
124 FastMathFlags FMF;
125 // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
126 // arguments and the return value will be computed based on types.
127 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
128
129public:
130 IntrinsicCostAttributes(
131 Intrinsic::ID Id, const CallBase &CI,
132 InstructionCost ScalarCost = InstructionCost::getInvalid());
133
134 IntrinsicCostAttributes(
135 Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
136 FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
137 InstructionCost ScalarCost = InstructionCost::getInvalid());
138
139 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
140 ArrayRef<const Value *> Args);
141
142 IntrinsicCostAttributes(
143 Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
144 ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
145 const IntrinsicInst *I = nullptr,
146 InstructionCost ScalarCost = InstructionCost::getInvalid());
147
148 Intrinsic::ID getID() const { return IID; }
149 const IntrinsicInst *getInst() const { return II; }
150 Type *getReturnType() const { return RetTy; }
151 FastMathFlags getFlags() const { return FMF; }
152 InstructionCost getScalarizationCost() const { return ScalarizationCost; }
153 const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
154 const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
155
156 bool isTypeBasedOnly() const {
157 return Arguments.empty();
13
Calling 'SmallVectorBase::empty'
16
Returning from 'SmallVectorBase::empty'
17
Returning zero, which participates in a condition later
158 }
159
160 bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
161};
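
A minimal usage sketch of this class, assuming a TargetTransformInfo and an LLVMContext are already in scope and that the usual TargetTransformInfo::getIntrinsicInstrCost entry point is used (illustrative only; it relies solely on the (ID, RetTy, ArgTys) constructor declared above):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Type-only cost query for llvm.fmuladd.f32: no call site is available, so
// the cost is computed purely from the return/argument types.
InstructionCost fmuladdCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *FloatTy = Type::getFloatTy(Ctx);
  SmallVector<Type *, 3> ArgTys = {FloatTy, FloatTy, FloatTy};
  IntrinsicCostAttributes Attrs(Intrinsic::fmuladd, FloatTy, ArgTys);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}
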
162
163class TargetTransformInfo;
164typedef TargetTransformInfo TTI;
165
166/// This pass provides access to the codegen interfaces that are needed
167/// for IR-level transformations.
168class TargetTransformInfo {
169public:
170 /// Construct a TTI object using a type implementing the \c Concept
171 /// API below.
172 ///
173 /// This is used by targets to construct a TTI wrapping their target-specific
174 /// implementation that encodes appropriate costs for their target.
175 template <typename T> TargetTransformInfo(T Impl);
176
177 /// Construct a baseline TTI object using a minimal implementation of
178 /// the \c Concept API below.
179 ///
180 /// The TTI implementation will reflect the information in the DataLayout
181 /// provided if non-null.
182 explicit TargetTransformInfo(const DataLayout &DL);
183
184 // Provide move semantics.
185 TargetTransformInfo(TargetTransformInfo &&Arg);
186 TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
187
188 // We need to define the destructor out-of-line to define our sub-classes
189 // out-of-line.
190 ~TargetTransformInfo();
191
192 /// Handle the invalidation of this information.
193 ///
194 /// When used as a result of \c TargetIRAnalysis this method will be called
195 /// when the function this was computed for changes. When it returns false,
196 /// the information is preserved across those changes.
197 bool invalidate(Function &, const PreservedAnalyses &,
198 FunctionAnalysisManager::Invalidator &) {
199 // FIXME: We should probably in some way ensure that the subtarget
200 // information for a function hasn't changed.
201 return false;
202 }
203
204 /// \name Generic Target Information
205 /// @{
206
207 /// The kind of cost model.
208 ///
209 /// There are several different cost models that can be customized by the
210 /// target. The normalization of each cost model may be target specific.
211 enum TargetCostKind {
212 TCK_RecipThroughput, ///< Reciprocal throughput.
213 TCK_Latency, ///< The latency of instruction.
214 TCK_CodeSize, ///< Instruction code size.
215 TCK_SizeAndLatency ///< The weighted sum of size and latency.
216 };
217
218 /// Query the cost of a specified instruction.
219 ///
220 /// Clients should use this interface to query the cost of an existing
221 /// instruction. The instruction must have a valid parent (basic block).
222 ///
223 /// Note, this method does not cache the cost calculation and it
224 /// can be expensive in some cases.
225 InstructionCost getInstructionCost(const Instruction *I,
226 enum TargetCostKind kind) const {
227 InstructionCost Cost;
228 switch (kind) {
229 case TCK_RecipThroughput:
230 Cost = getInstructionThroughput(I);
231 break;
232 case TCK_Latency:
233 Cost = getInstructionLatency(I);
234 break;
235 case TCK_CodeSize:
236 case TCK_SizeAndLatency:
237 Cost = getUserCost(I, kind);
238 break;
239 }
240 return Cost;
241 }
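
A short sketch of how a client might drive getInstructionCost, assuming the TargetTransformInfo for the function has already been obtained (for example from TargetIRAnalysis); editorial example code, not part of the analyzed file.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sums reciprocal-throughput costs over every instruction in a function.
InstructionCost functionThroughputCost(const Function &F,
                                       const TargetTransformInfo &TTI) {
  InstructionCost Total = 0;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      Total += TTI.getInstructionCost(&I,
                                      TargetTransformInfo::TCK_RecipThroughput);
  return Total;
}
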
242
243 /// Underlying constants for 'cost' values in this interface.
244 ///
245 /// Many APIs in this interface return a cost. This enum defines the
246 /// fundamental values that should be used to interpret (and produce) those
247 /// costs. The costs are returned as an int rather than a member of this
248 /// enumeration because it is expected that the cost of one IR instruction
249 /// may have a multiplicative factor to it or otherwise won't fit directly
250 /// into the enum. Moreover, it is common to sum or average costs which works
251 /// better as simple integral values. Thus this enum only provides constants.
252 /// Also note that the returned costs are signed integers to make it natural
253 /// to add, subtract, and test with zero (a common boundary condition). It is
254 /// not expected that 2^32 is a realistic cost to be modeling at any point.
255 ///
256 /// Note that these costs should usually reflect the intersection of code-size
257 /// cost and execution cost. A free instruction is typically one that folds
258 /// into another instruction. For example, reg-to-reg moves can often be
259 /// skipped by renaming the registers in the CPU, but they still are encoded
260 /// and thus wouldn't be considered 'free' here.
261 enum TargetCostConstants {
262 TCC_Free = 0, ///< Expected to fold away in lowering.
263 TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
264 TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
265 };
266
267 /// Estimate the cost of a GEP operation when lowered.
268 InstructionCost
269 getGEPCost(Type *PointeeType, const Value *Ptr,
270 ArrayRef<const Value *> Operands,
271 TargetCostKind CostKind = TCK_SizeAndLatency) const;
272
273 /// \returns A value by which our inlining threshold should be multiplied.
274 /// This is primarily used to bump up the inlining threshold wholesale on
275 /// targets where calls are unusually expensive.
276 ///
277 /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
278 /// individual classes of instructions would be better.
279 unsigned getInliningThresholdMultiplier() const;
280
281 /// \returns A value to be added to the inlining threshold.
282 unsigned adjustInliningThreshold(const CallBase *CB) const;
283
284 /// \returns Vector bonus in percent.
285 ///
286 /// Vector bonuses: We want to more aggressively inline vector-dense kernels
287 /// and apply this bonus based on the percentage of vector instructions. A
288 /// bonus is applied if the vector instructions exceed 50% and half that
289 /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
290 /// arbitrary and evolved over time by accident as much as because they are
291 /// principled bonuses.
292 /// FIXME: It would be nice to base the bonus values on something more
293 /// scientific. A target may have no bonus on vector instructions.
294 int getInlinerVectorBonusPercent() const;
295
296 /// \return the expected cost of a memcpy, which could e.g. depend on the
297 /// source/destination type and alignment and the number of bytes copied.
298 InstructionCost getMemcpyCost(const Instruction *I) const;
299
300 /// \return The estimated number of case clusters when lowering \p 'SI'.
301 /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
302 /// table.
303 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
304 unsigned &JTSize,
305 ProfileSummaryInfo *PSI,
306 BlockFrequencyInfo *BFI) const;
307
308 /// Estimate the cost of a given IR user when lowered.
309 ///
310 /// This can estimate the cost of either a ConstantExpr or Instruction when
311 /// lowered.
312 ///
313 /// \p Operands is a list of operands which can be a result of transformations
314 /// of the current operands. The number of operands on the list must equal
315 /// the number of the current operands the IR user has. Their order on the
316 /// list must be the same as the order of the current operands the IR user
317 /// has.
318 ///
319 /// The returned cost is defined in terms of \c TargetCostConstants, see its
320 /// comments for a detailed explanation of the cost values.
321 InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
322 TargetCostKind CostKind) const;
323
324 /// This is a helper function which calls the two-argument getUserCost
325 /// with \p Operands which are the current operands U has.
326 InstructionCost getUserCost(const User *U, TargetCostKind CostKind) const {
327 SmallVector<const Value *, 4> Operands(U->operand_values());
328 return getUserCost(U, Operands, CostKind);
329 }
330
331 /// If a branch or a select condition is skewed in one direction by more than
332 /// this factor, it is very likely to be predicted correctly.
333 BranchProbability getPredictableBranchThreshold() const;
334
335 /// Return true if branch divergence exists.
336 ///
337 /// Branch divergence has a significantly negative impact on GPU performance
338 /// when threads in the same wavefront take different paths due to conditional
339 /// branches.
340 bool hasBranchDivergence() const;
341
342 /// Return true if the target prefers to use GPU divergence analysis to
343 /// replace the legacy version.
344 bool useGPUDivergenceAnalysis() const;
345
346 /// Returns whether V is a source of divergence.
347 ///
348 /// This function provides the target-dependent information for
349 /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
350 /// first builds the dependency graph, and then runs the reachability
351 /// algorithm starting with the sources of divergence.
352 bool isSourceOfDivergence(const Value *V) const;
353
354 // Returns true for the target specific
355 // set of operations which produce uniform result
356 // even taking non-uniform arguments
357 bool isAlwaysUniform(const Value *V) const;
358
359 /// Returns the address space ID for a target's 'flat' address space. Note
360 /// this is not necessarily the same as addrspace(0), which LLVM sometimes
361 /// refers to as the generic address space. The flat address space is a
362 /// generic address space that can be used to access multiple segments of memory
363 /// with different address spaces. Access of a memory location through a
364 /// pointer with this address space is expected to be legal but slower
365 /// compared to the same memory location accessed through a pointer with a
366 /// different address space.
367 //
368 /// This is for targets with different pointer representations which can
369 /// be converted with the addrspacecast instruction. If a pointer is converted
370 /// to this address space, optimizations should attempt to replace the access
371 /// with the source address space.
372 ///
373 /// \returns ~0u if the target does not have such a flat address space to
374 /// optimize away.
375 unsigned getFlatAddressSpace() const;
376
377 /// Return any intrinsic address operand indexes which may be rewritten if
378 /// they use a flat address space pointer.
379 ///
380 /// \returns true if the intrinsic was handled.
381 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
382 Intrinsic::ID IID) const;
383
384 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
385
386 unsigned getAssumedAddrSpace(const Value *V) const;
387
388 /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
389 /// NewV, which has a different address space. This should happen for every
390 /// operand index that collectFlatAddressOperands returned for the intrinsic.
391 /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
392 /// new value (which may be the original \p II with modified operands).
393 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
394 Value *NewV) const;
395
396 /// Test whether calls to a function lower to actual program function
397 /// calls.
398 ///
399 /// The idea is to test whether the program is likely to require a 'call'
400 /// instruction or equivalent in order to call the given function.
401 ///
402 /// FIXME: It's not clear that this is a good or useful query API. Client's
403 /// should probably move to simpler cost metrics using the above.
404 /// Alternatively, we could split the cost interface into distinct code-size
405 /// and execution-speed costs. This would allow modelling the core of this
406 /// query more accurately as a call is a single small instruction, but
407 /// incurs significant execution cost.
408 bool isLoweredToCall(const Function *F) const;
409
410 struct LSRCost {
411 /// TODO: Some of these could be merged. Also, a lexical ordering
412 /// isn't always optimal.
413 unsigned Insns;
414 unsigned NumRegs;
415 unsigned AddRecCost;
416 unsigned NumIVMuls;
417 unsigned NumBaseAdds;
418 unsigned ImmCost;
419 unsigned SetupCost;
420 unsigned ScaleCost;
421 };
422
423 /// Parameters that control the generic loop unrolling transformation.
424 struct UnrollingPreferences {
425 /// The cost threshold for the unrolled loop. Should be relative to the
426 /// getUserCost values returned by this API, and the expectation is that
427 /// the unrolled loop's instructions when run through that interface should
428 /// not exceed this cost. However, this is only an estimate. Also, specific
429 /// loops may be unrolled even with a cost above this threshold if deemed
430 /// profitable. Set this to UINT_MAX to disable the loop body cost
431 /// restriction.
432 unsigned Threshold;
433 /// If complete unrolling will reduce the cost of the loop, we will boost
434 /// the Threshold by a certain percent to allow more aggressive complete
435 /// unrolling. This value provides the maximum boost percentage that we
436 /// can apply to Threshold (The value should be no less than 100).
437 /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
438 /// MaxPercentThresholdBoost / 100)
439 /// E.g. if complete unrolling reduces the loop execution time by 50%
440 /// then we boost the threshold by the factor of 2x. If unrolling is not
441 /// expected to reduce the running time, then we do not increase the
442 /// threshold.
443 unsigned MaxPercentThresholdBoost;
444 /// The cost threshold for the unrolled loop when optimizing for size (set
445 /// to UINT_MAX to disable).
446 unsigned OptSizeThreshold;
447 /// The cost threshold for the unrolled loop, like Threshold, but used
448 /// for partial/runtime unrolling (set to UINT_MAX to disable).
449 unsigned PartialThreshold;
450 /// The cost threshold for the unrolled loop when optimizing for size, like
451 /// OptSizeThreshold, but used for partial/runtime unrolling (set to
452 /// UINT_MAX to disable).
453 unsigned PartialOptSizeThreshold;
454 /// A forced unrolling factor (the number of concatenated bodies of the
455 /// original loop in the unrolled loop body). When set to 0, the unrolling
456 /// transformation will select an unrolling factor based on the current cost
457 /// threshold and other factors.
458 unsigned Count;
459 /// Default unroll count for loops with run-time trip count.
460 unsigned DefaultUnrollRuntimeCount;
461 // Set the maximum unrolling factor. The unrolling factor may be selected
462 // using the appropriate cost threshold, but may not exceed this number
463 // (set to UINT_MAX to disable). This does not apply in cases where the
464 // loop is being fully unrolled.
465 unsigned MaxCount;
466 /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
467 /// applies even if full unrolling is selected. This allows a target to fall
468 /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
469 unsigned FullUnrollMaxCount;
470 // Represents number of instructions optimized when "back edge"
471 // becomes "fall through" in unrolled loop.
472 // For now we count a conditional branch on a backedge and a comparison
473 // feeding it.
474 unsigned BEInsns;
475 /// Allow partial unrolling (unrolling of loops to expand the size of the
476 /// loop body, not only to eliminate small constant-trip-count loops).
477 bool Partial;
478 /// Allow runtime unrolling (unrolling of loops to expand the size of the
479 /// loop body even when the number of loop iterations is not known at
480 /// compile time).
481 bool Runtime;
482 /// Allow generation of a loop remainder (extra iterations after unroll).
483 bool AllowRemainder;
484 /// Allow emitting expensive instructions (such as divisions) when computing
485 /// the trip count of a loop for runtime unrolling.
486 bool AllowExpensiveTripCount;
487 /// Apply loop unroll on any kind of loop
488 /// (mainly to loops that fail runtime unrolling).
489 bool Force;
490 /// Allow using trip count upper bound to unroll loops.
491 bool UpperBound;
492 /// Allow unrolling of all the iterations of the runtime loop remainder.
493 bool UnrollRemainder;
494 /// Allow unroll and jam. Used to enable unroll and jam for the target.
495 bool UnrollAndJam;
496 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
497 /// value above is used during unroll and jam for the outer loop size.
498 /// This value is used in the same manner to limit the size of the inner
499 /// loop.
500 unsigned UnrollAndJamInnerLoopThreshold;
501 /// Don't allow loop unrolling to simulate more than this number of
502 /// iterations when checking full unroll profitability
503 unsigned MaxIterationsCountToAnalyze;
504 };
505
506 /// Get target-customized preferences for the generic loop unrolling
507 /// transformation. The caller will initialize UP with the current
508 /// target-independent defaults.
509 void getUnrollingPreferences(Loop *L, ScalarEvolution &,
510 UnrollingPreferences &UP,
511 OptimizationRemarkEmitter *ORE) const;
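
A sketch of how a hypothetical target hook might adjust the UnrollingPreferences fields declared above; the hook receives UP pre-populated with the generic defaults, and every concrete value below is an assumption for illustration, not any real target's policy.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Adjusts a few of the UnrollingPreferences knobs (illustrative values only).
void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;          // allow partial unrolling
  UP.Runtime = true;          // allow unrolling of runtime-trip-count loops
  UP.UpperBound = true;       // may use the trip-count upper bound
  UP.Threshold = 300;         // assumed: larger body budget than the default
  UP.MaxCount = 4;            // assumed: never unroll by more than 4x
  UP.UnrollRemainder = false; // keep the remainder loop rolled
}
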
512
513 /// Query the target whether it would be profitable to convert the given loop
514 /// into a hardware loop.
515 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
516 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
517 HardwareLoopInfo &HWLoopInfo) const;
518
519 /// Query the target whether it would be preferred to create a predicated
520 /// vector loop, which can avoid the need to emit a scalar epilogue loop.
521 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
522 AssumptionCache &AC, TargetLibraryInfo *TLI,
523 DominatorTree *DT,
524 const LoopAccessInfo *LAI) const;
525
526 /// Query the target whether lowering of the llvm.get.active.lane.mask
527 /// intrinsic is supported.
528 bool emitGetActiveLaneMask() const;
529
530 // Parameters that control the loop peeling transformation
531 struct PeelingPreferences {
532 /// A forced peeling factor (the number of bodies of the original loop
533 /// that should be peeled off before the loop body). When set to 0,
534 /// a peeling factor is selected based on profile information and other factors.
535 unsigned PeelCount;
536 /// Allow peeling off loop iterations.
537 bool AllowPeeling;
538 /// Allow peeling off loop iterations for loop nests.
539 bool AllowLoopNestsPeeling;
540 /// Allow peeling based on profile. Used to enable peeling off all
541 /// iterations based on the provided profile.
542 /// If the value is true the peeling cost model can decide to peel only
543 /// some iterations and in this case it will set this to false.
544 bool PeelProfiledIterations;
545 };
546
547 /// Get target-customized preferences for the generic loop peeling
548 /// transformation. The caller will initialize \p PP with the current
549 /// target-independent defaults with information from \p L and \p SE.
550 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
551 PeelingPreferences &PP) const;
552
553 /// Targets can implement their own combinations for target-specific
554 /// intrinsics. This function will be called from the InstCombine pass every
555 /// time a target-specific intrinsic is encountered.
556 ///
557 /// \returns None to not do anything target specific or a value that will be
558 /// returned from the InstCombiner. It is possible to return null and stop
559 /// further processing of the intrinsic by returning nullptr.
560 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
561 IntrinsicInst &II) const;
562 /// Can be used to implement target-specific instruction combining.
563 /// \see instCombineIntrinsic
564 Optional<Value *>
565 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
566 APInt DemandedMask, KnownBits &Known,
567 bool &KnownBitsComputed) const;
568 /// Can be used to implement target-specific instruction combining.
569 /// \see instCombineIntrinsic
570 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
571 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
572 APInt &UndefElts2, APInt &UndefElts3,
573 std::function<void(Instruction *, unsigned, APInt, APInt &)>
574 SimplifyAndSetOp) const;
575 /// @}
576
577 /// \name Scalar Target Information
578 /// @{
579
580 /// Flags indicating the kind of support for population count.
581 ///
582 /// Compared to the SW implementation, HW support is supposed to
583 /// significantly boost the performance when the population is dense, and it
584 /// may or may not degrade performance if the population is sparse. A HW
585 /// support is considered as "Fast" if it can outperform, or is on a par
586 /// with, SW implementation when the population is sparse; otherwise, it is
587 /// considered as "Slow".
588 enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
589
590 /// Return true if the specified immediate is legal add immediate, that
591 /// is the target has add instructions which can add a register with the
592 /// immediate without having to materialize the immediate into a register.
593 bool isLegalAddImmediate(int64_t Imm) const;
594
595 /// Return true if the specified immediate is legal icmp immediate,
596 /// that is the target has icmp instructions which can compare a register
597 /// against the immediate without having to materialize the immediate into a
598 /// register.
599 bool isLegalICmpImmediate(int64_t Imm) const;
600
601 /// Return true if the addressing mode represented by AM is legal for
602 /// this target, for a load/store of the specified type.
603 /// The type may be VoidTy, in which case only return true if the addressing
604 /// mode is legal for a load/store of any legal type.
605 /// If target returns true in LSRWithInstrQueries(), I may be valid.
606 /// TODO: Handle pre/postinc as well.
607 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
608 bool HasBaseReg, int64_t Scale,
609 unsigned AddrSpace = 0,
610 Instruction *I = nullptr) const;
611
612 /// Return true if the LSR cost of C1 is lower than that of C2.
613 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
614 TargetTransformInfo::LSRCost &C2) const;
615
616 /// Return true if LSR major cost is number of registers. Targets which
617 /// implement their own isLSRCostLess and unset number of registers as major
618 /// cost should return false, otherwise return true.
619 bool isNumRegsMajorCostOfLSR() const;
620
621 /// \returns true if LSR should not optimize a chain that includes \p I.
622 bool isProfitableLSRChainElement(Instruction *I) const;
623
624 /// Return true if the target can fuse a compare and branch.
625 /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
626 /// calculation for the instructions in a loop.
627 bool canMacroFuseCmp() const;
628
629 /// Return true if the target can save a compare for loop count, for example
630 /// hardware loop saves a compare.
631 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
632 DominatorTree *DT, AssumptionCache *AC,
633 TargetLibraryInfo *LibInfo) const;
634
635 enum AddressingModeKind {
636 AMK_PreIndexed,
637 AMK_PostIndexed,
638 AMK_None
639 };
640
641 /// Return the preferred addressing mode LSR should make efforts to generate.
642 AddressingModeKind getPreferredAddressingMode(const Loop *L,
643 ScalarEvolution *SE) const;
644
645 /// Return true if the target supports masked store.
646 bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
647 /// Return true if the target supports masked load.
648 bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
649
650 /// Return true if the target supports nontemporal store.
651 bool isLegalNTStore(Type *DataType, Align Alignment) const;
652 /// Return true if the target supports nontemporal load.
653 bool isLegalNTLoad(Type *DataType, Align Alignment) const;
654
655 /// Return true if the target supports masked scatter.
656 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
657 /// Return true if the target supports masked gather.
658 bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
659
660 /// Return true if the target supports masked compress store.
661 bool isLegalMaskedCompressStore(Type *DataType) const;
662 /// Return true if the target supports masked expand load.
663 bool isLegalMaskedExpandLoad(Type *DataType) const;
664
665 /// Return true if we should be enabling ordered reductions for the target.
666 bool enableOrderedReductions() const;
667
668 /// Return true if the target has a unified operation to calculate division
669 /// and remainder. If so, the additional implicit multiplication and
670 /// subtraction required to calculate a remainder from division are free. This
671 /// can enable more aggressive transformations for division and remainder than
672 /// would typically be allowed using throughput or size cost models.
673 bool hasDivRemOp(Type *DataType, bool IsSigned) const;
674
675 /// Return true if the given instruction (assumed to be a memory access
676 /// instruction) has a volatile variant. If that's the case then we can avoid
677 /// addrspacecast to generic AS for volatile loads/stores. Default
678 /// implementation returns false, which prevents address space inference for
679 /// volatile loads/stores.
680 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
681
682 /// Return true if target doesn't mind addresses in vectors.
683 bool prefersVectorizedAddressing() const;
684
685 /// Return the cost of the scaling factor used in the addressing
686 /// mode represented by AM for this target, for a load/store
687 /// of the specified type.
688 /// If the AM is supported, the return value must be >= 0.
689 /// If the AM is not supported, it returns a negative value.
690 /// TODO: Handle pre/postinc as well.
691 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
692 int64_t BaseOffset, bool HasBaseReg,
693 int64_t Scale,
694 unsigned AddrSpace = 0) const;
695
696 /// Return true if the loop strength reduce pass should make
697 /// Instruction* based TTI queries to isLegalAddressingMode(). This is
698 /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
699 /// immediate offset and no index register.
700 bool LSRWithInstrQueries() const;
701
702 /// Return true if it's free to truncate a value of type Ty1 to type
703 /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
704 /// by referencing its sub-register AX.
705 bool isTruncateFree(Type *Ty1, Type *Ty2) const;
706
707 /// Return true if it is profitable to hoist instruction in the
708 /// then/else to before if.
709 bool isProfitableToHoist(Instruction *I) const;
710
711 bool useAA() const;
712
713 /// Return true if this type is legal.
714 bool isTypeLegal(Type *Ty) const;
715
716 /// Returns the estimated number of registers required to represent \p Ty.
717 InstructionCost getRegUsageForType(Type *Ty) const;
718
719 /// Return true if switches should be turned into lookup tables for the
720 /// target.
721 bool shouldBuildLookupTables() const;
722
723 /// Return true if switches should be turned into lookup tables
724 /// containing this constant value for the target.
725 bool shouldBuildLookupTablesForConstant(Constant *C) const;
726
727 /// Return true if lookup tables should be turned into relative lookup tables.
728 bool shouldBuildRelLookupTables() const;
729
731 /// Return true if the input function, which is cold at all call sites,
732 /// should use the coldcc calling convention.
732 bool useColdCCForColdCall(Function &F) const;
733
734 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
735 /// are set if the demanded result elements need to be inserted and/or
736 /// extracted from vectors.
737 InstructionCost getScalarizationOverhead(VectorType *Ty,
738 const APInt &DemandedElts,
739 bool Insert, bool Extract) const;
740
741 /// Estimate the overhead of scalarizing an instructions unique
742 /// non-constant operands. The (potentially vector) types to use for each
743 /// argument are passed via Tys.
744 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
745 ArrayRef<Type *> Tys) const;
746
747 /// If target has efficient vector element load/store instructions, it can
748 /// return true here so that insertion/extraction costs are not added to
749 /// the scalarization cost of a load/store.
750 bool supportsEfficientVectorElementLoadStore() const;
751
752 /// Don't restrict interleaved unrolling to small loops.
753 bool enableAggressiveInterleaving(bool LoopHasReductions) const;
754
755 /// Returns options for expansion of memcmp. IsZeroCmp is
756 // true if this is the expansion of memcmp(p1, p2, s) == 0.
757 struct MemCmpExpansionOptions {
758 // Return true if memcmp expansion is enabled.
759 operator bool() const { return MaxNumLoads > 0; }
760
761 // Maximum number of load operations.
762 unsigned MaxNumLoads = 0;
763
764 // The list of available load sizes (in bytes), sorted in decreasing order.
765 SmallVector<unsigned, 8> LoadSizes;
766
767 // For memcmp expansion when the memcmp result is only compared equal or
768 // not-equal to 0, allow up to this number of load pairs per block. As an
769 // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
770 // a0 = load2bytes &a[0]
771 // b0 = load2bytes &b[0]
772 // a2 = load1byte &a[2]
773 // b2 = load1byte &b[2]
774 // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
775 unsigned NumLoadsPerBlock = 1;
776
777 // Set to true to allow overlapping loads. For example, 7-byte compares can
778 // be done with two 4-byte compares instead of 4+2+1-byte compares. This
779 // requires all loads in LoadSizes to be doable in an unaligned way.
780 bool AllowOverlappingLoads = false;
781 };
782 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
783 bool IsZeroCmp) const;
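
The options struct above is typically filled in by a target's enableMemCmpExpansion hook; the sketch below shows one plausible way to populate it, where every limit and load size is an assumption for illustration rather than a real target policy.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Builds a MemCmpExpansionOptions value with assumed, illustrative limits.
TargetTransformInfo::MemCmpExpansionOptions
exampleMemCmpOptions(bool OptSize, bool IsZeroCmp) {
  TargetTransformInfo::MemCmpExpansionOptions Options;
  Options.MaxNumLoads = OptSize ? 2 : 8; // assumed per-call load budget
  Options.LoadSizes = {8, 4, 2, 1};      // decreasing order, as documented
  if (IsZeroCmp)
    Options.NumLoadsPerBlock = 2;        // allow load pairs for ==0 compares
  Options.AllowOverlappingLoads = true;  // assumes cheap unaligned loads
  return Options;
}
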
784
785 /// Enable matching of interleaved access groups.
786 bool enableInterleavedAccessVectorization() const;
787
788 /// Enable matching of interleaved access groups that contain predicated
789 /// accesses or gaps and therefore vectorized using masked
790 /// vector loads/stores.
791 bool enableMaskedInterleavedAccessVectorization() const;
792
793 /// Indicate that it is potentially unsafe to automatically vectorize
794 /// floating-point operations because vector and scalar floating-point
795 /// semantics may differ. For example, ARM NEON v7 SIMD math
796 /// does not support IEEE-754 denormal numbers, while depending on the
797 /// platform, scalar floating-point math does.
798 /// This applies to floating-point math operations and calls, not memory
799 /// operations, shuffles, or casts.
800 bool isFPVectorizationPotentiallyUnsafe() const;
801
802 /// Determine if the target supports unaligned memory accesses.
803 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
804 unsigned AddressSpace = 0,
805 Align Alignment = Align(1),
806 bool *Fast = nullptr) const;
807
808 /// Return hardware support for population count.
809 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
810
811 /// Return true if the hardware has a fast square-root instruction.
812 bool haveFastSqrt(Type *Ty) const;
813
814 /// Return true if it is faster to check if a floating-point value is NaN
815 /// (or not-NaN) versus a comparison against a constant FP zero value.
816 /// Targets should override this if materializing a 0.0 for comparison is
817 /// generally as cheap as checking for ordered/unordered.
818 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
819
820 /// Return the expected cost of supporting the floating point operation
821 /// of the specified type.
822 InstructionCost getFPOpCost(Type *Ty) const;
823
824 /// Return the expected cost of materializing for the given integer
825 /// immediate of the specified type.
826 InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
827 TargetCostKind CostKind) const;
828
829 /// Return the expected cost of materialization for the given integer
830 /// immediate of the specified type for a given instruction. The cost can be
831 /// zero if the immediate can be folded into the specified instruction.
832 InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
833 const APInt &Imm, Type *Ty,
834 TargetCostKind CostKind,
835 Instruction *Inst = nullptr) const;
836 InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
837 const APInt &Imm, Type *Ty,
838 TargetCostKind CostKind) const;
839
840 /// Return the expected cost for the given integer when optimising
841 /// for size. This is different than the other integer immediate cost
842 /// functions in that it is subtarget agnostic. This is useful when you e.g.
843 /// target one ISA such as Aarch32 but smaller encodings could be possible
844 /// with another such as Thumb. This return value is used as a penalty when
845 /// the total costs for a constant is calculated (the bigger the cost, the
846 /// more beneficial constant hoisting is).
847 InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
848 const APInt &Imm, Type *Ty) const;
849 /// @}
850
851 /// \name Vector Target Information
852 /// @{
853
854 /// The various kinds of shuffle patterns for vector queries.
855 enum ShuffleKind {
856 SK_Broadcast, ///< Broadcast element 0 to all other elements.
857 SK_Reverse, ///< Reverse the order of the vector.
858 SK_Select, ///< Selects elements from the corresponding lane of
859 ///< either source operand. This is equivalent to a
860 ///< vector select with a constant condition operand.
861 SK_Transpose, ///< Transpose two vectors.
862 SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
863 SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset.
864 SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
865 ///< with any shuffle mask.
866 SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any
867 ///< shuffle mask.
868 SK_Splice ///< Concatenates elements from the first input vector
869 ///< with elements of the second input vector. Returning
870 ///< a vector of the same type as the input vectors.
871 };
872