1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
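/// For example, a vld1 whose pointer operand points into a constant global
/// becomes an ordinary aligned load that constant folding can then see through.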
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
103 // To inline a callee, all features not in the allowed list must match exactly.
104 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
105 (CalleeBits & ~InlineFeaturesAllowed);
106 // For features in the allowed list, the callee's features must be a subset of
107 // the caller's.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
161
162TTI::AddressingModeKind
163ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
164 ScalarEvolution *SE) const {
165 if (ST->hasMVEIntegerOps())
166 return TTI::AMK_PostIndexed;
167
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
171 if (ST->isMClass() && ST->isThumb2() &&
172 L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
177
178std::optional<Instruction *>
179ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
188 &IC.getAssumptionCache(), &IC.getDominatorTree());
189 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
190 return IC.replaceInstUsesWith(II, V);
191 }
192 break;
193 }
194
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
210 &IC.getAssumptionCache(), &IC.getDominatorTree());
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 II, AlignArg,
217 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
218 false));
219 }
220 break;
221 }
222
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
231 &IC.getAssumptionCache(), &IC.getDominatorTree());
232 Align OldAlign = II.getParamAlign(0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(0,
235 Attribute::getWithAlignment(II.getContext(), NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
240 Value *Arg = II.getArgOperand(0);
241 Value *ArgArg;
242 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
243 PatternMatch::m_Value(ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(II, ArgArg);
246 }
247 Constant *XorMask;
248 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
249 PatternMatch::m_Value(ArgArg)),
250 PatternMatch::m_Constant(XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
253 if (CI->getValue().trunc(16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 cast<FixedVectorType>(II.getType())->getNumElements(),
256 IC.Builder.getTrue());
257 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
258 }
259 }
260 }
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
263 ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(0);
270 Value *ArgArg;
271 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
272 PatternMatch::m_Value(ArgArg)))) {
273 return IC.replaceInstUsesWith(II, ArgArg);
274 }
275
276 if (II.getMetadata(LLVMContext::MD_range))
277 break;
278
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(*CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(Range);
288 II.addRetAttr(Attribute::NoUndef);
289 return &II;
290 }
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
300 CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
306 Instruction *I = cast<Instruction>(&II);
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(*I->user_begin());
309 Value *OpZ;
310 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
311 match(I->getOperand(3), m_Zero())) {
312 Value *OpX = I->getOperand(4);
313 Value *OpY = I->getOperand(5);
314 Type *OpTy = OpX->getType();
315
316 IC.Builder.SetInsertPoint(User);
317 Value *V =
318 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
319 {I->getOperand(0), I->getOperand(1),
320 I->getOperand(2), OpZ, OpX, OpY});
321
322 IC.replaceInstUsesWith(*User, V);
323 return IC.eraseInstFromFunction(*User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
332std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
337
338 // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
339 // index of the operand that selects between a Top and a Bottom instruction,
340 // which can change between intrinsics.
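// For example, a "top" narrowing instruction (e.g. vmovnt) writes the narrowed
// values to the odd lanes, so only the even lanes of operand 0 survive and are
// demanded; for a "bottom" instruction it is the other way around.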
341 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
344
345 // Only the odd or even lanes of operand 0 are demanded, depending on
346 // whether this is a top or a bottom instruction.
347 APInt DemandedElts =
348 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
349 : APInt::getHighBitsSet(2, 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
353 : APInt::getHighBitsSet(2, 1));
354 return std::nullopt;
355 };
356
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
374InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
375 TTI::TargetCostKind CostKind) const {
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
398 // Thumb1: any i8 immediate costs 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
406
407 // Constants smaller than 256 fit in the immediate field of
408 // Thumb1 instructions, so they cost 0; everything else costs 1.
409InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
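// For example, with Imm == -128 the pattern smax(smin(X, 127), -128) clamps X
// to the signed 8-bit range, which a single SSAT can perform.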
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
424 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
425
426 if (InstSPF == SPF_SMAX &&
427 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
437 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
444 if (isSSatMin(Inst->getOperand(1)))
445 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
446 if (Inst->hasNUses(2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(1);
449 }
450 return nullptr;
451}
452
453 // Look for an FP saturation pattern, where the instruction can be simplified
454 // to a fptosi.sat of the form max(min(fptosi)). The constant is always free.
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
460 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
461 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(FP);
465}
466
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
480 // Leave any gep offsets for CodeGenPrepare, which will do a better job at
481 // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
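// e.g. "and r0, r1, #0xffffff00" can be emitted as "bic r0, r1, #0xff", so the
// cheaper of Imm and ~Imm is what matters here.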
490 return std::min(getIntImmCost(Imm, Ty, CostKind),
491 getIntImmCost(~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
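// e.g. "add r0, r1, #-8" can be emitted as "sub r0, r1, #8", so the cheaper of
// Imm and -Imm is what matters here.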
496 return std::min(getIntImmCost(Imm, Ty, CostKind),
497 getIntImmCost(-Imm, Ty, CostKind));
498
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
514 // Ensure that negative constants in min(max()) or max(min()) patterns that
515 // match SSAT instructions don't get hoisted.
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
520 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
521 return 0;
522 }
523
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
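// e.g. "icmp sle X, -1" is equivalent to "icmp slt X, 0", so the cost of
// materialising Imm + 1 (here 0) is considered as well.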
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(getIntImmCost(Imm, Ty, CostKind),
532 getIntImmCost(Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
538InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
539 TTI::TargetCostKind CostKind,
540 const Instruction *I) const {
541 if (CostKind == TTI::TCK_RecipThroughput &&
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
543 // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
552InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
553 Type *Src,
554 TTI::CastContextHint CCH,
555 TTI::TargetCostKind CostKind,
556 const Instruction *I) const {
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
562 if (CostKind != TTI::TCK_RecipThroughput)
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Src);
574 EVT DstTy = TLI->getValueType(DL, Dst);
575
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
580 // Extending masked loads and truncating masked stores are expensive because we
581 // currently don't split them. This means that we'll likely end up
582 // loading/storing each element individually (hence the high cost).
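// e.g. a zero-extending masked load from <16 x i8> to <16 x i16> gives a
// 256-bit result, modelled below at roughly two operations per element.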
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
593 // The extension of other kinds of loads is free.
594 if (CCH == TTI::CastContextHint::Normal ||
595 CCH == TTI::CastContextHint::Masked) {
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
598 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
599 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
600 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
601 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
602 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
603 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
604 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
605 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
606 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
607 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
608 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
609 };
610 if (const auto *Entry = ConvertCostTableLookup(
611 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
616 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
617 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
618 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
619 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
620 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
621 // The following extend from a legal type to an illegal type, so the load
622 // needs to be split. This introduces an extra load operation, but the
623 // extend is still "free".
624 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
625 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
626 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
627 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
628 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
629 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
634 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
641 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
646 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
653 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
654 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
655 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
656 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
657 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
658 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
663 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
669 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
674 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
685 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
686 // vsubl
687 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
688 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
689 // vmull
690 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
691 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
692 // vshll
693 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
694 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
695 };
696
697 auto *User = cast<Instruction>(*I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
700 DstTy.getSimpleVT(),
701 SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {ISD::FP_ROUND, MVT::v2f64, 2},
715 {ISD::FP_EXTEND, MVT::v2f32, 2},
716 {ISD::FP_EXTEND, MVT::v4f32, 4}};
717
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
719 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
728 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
729 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
730 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
731 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
732 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
733
734 // The number of vmovl instructions for the extension.
735 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
736 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
737 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
738 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
739 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
740 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
741 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
742 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
747 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
748 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
749 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
750 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
751 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
752 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
753
754 // Operations that we legalize using splitting.
755 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
756 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
757
758 // Vector float <-> i32 conversions.
759 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
760 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
761
762 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
763 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
764 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
765 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
766 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
767 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
768 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
770 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
771 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
772 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
773 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
774 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
775 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
776 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
777 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
778 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
779 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
780 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
781 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
782
783 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
784 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
785 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
786 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
787 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
788 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
789
790 // Vector double <-> i32 conversions.
791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
792 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
793
794 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
796 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
797 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
798 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
799 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
800
801 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
802 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
803 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
805 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
811 DstTy.getSimpleVT(),
812 SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
819 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
820 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
821 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
822 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
823 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
824 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
825 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
826 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
827 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
828 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
829 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
830 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
831 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
832 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
833 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
834 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
835 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
836 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
837 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
841 DstTy.getSimpleVT(),
842 SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
849 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
850 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
851 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
852 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
853 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
854 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
855 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
856 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
857 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
858 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
859 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
860 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
861 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
862 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
863 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
864 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
865 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
866 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
867 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
872 ISD, DstTy.getSimpleVT(),
873 SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
877 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
878 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
879 // are linearised so take more.
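// e.g. a v2i64 zext can clear the top half of each lane with a single VAND
// against 0x00000000ffffffff, while a v2i64 sext builds each lane up in turn.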
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
882 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
883 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
884 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
885 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
886 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
887 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
888 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
889 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
890 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
891 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
892 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
897 ISD, DstTy.getSimpleVT(),
898 SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
903 // As a general rule, fp converts that were not matched above are scalarized
904 // and cost 1 vcvt per lane, so long as the instruction is available. If not,
905 // it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
920 // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
921 // expensive, 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
937 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
938 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
939 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
944 DstTy.getSimpleVT(),
945 SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
956InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
957 TTI::TargetCostKind CostKind,
958 unsigned Index, const Value *Op0,
959 const Value *Op1) const {
960 // Penalize inserting into a D-subregister. We end up with a three times
961 // lower estimated throughput on Swift.
962 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
963 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
964 return 3;
965
966 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
967 Opcode == Instruction::ExtractElement)) {
968 // Cross-class copies are expensive on many microarchitectures,
969 // so assume they are expensive by default.
970 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
971 return 3;
972
973 // Even if it's not a cross class copy, this likely leads to mixing
974 // of NEON and VFP code and should therefore be penalized.
975 if (ValTy->isVectorTy() &&
976 ValTy->getScalarSizeInBits() <= 32)
977 return std::max<InstructionCost>(
978 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
979 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
984 // Integer cross-lane moves are more expensive than float, which can
985 // sometimes just be vmovs. Integers involve being passed to GPR registers,
986 // causing more of a delay.
987 std::pair<InstructionCost, MVT> LT =
988 getTypeLegalizationCost(ValTy->getScalarType());
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
993}
994
995InstructionCost ARMTTIImpl::getCmpSelInstrCost(
996 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
997 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
998 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
999 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1000
1001 // Thumb scalar code size cost for select.
1002 if (CostKind == TTI::TCK_CodeSize && Opcode == Instruction::Select &&
1003 ST->isThumb() && !ValTy->isVectorTy()) {
1004 // Assume expensive structs.
1005 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
1006 return TTI::TCC_Expensive;
1007
1008 // Select costs can vary because they:
1009 // - may require one or more conditional mov (including an IT),
1010 // - can't operate directly on immediates,
1011 // - require live flags, which we can't copy around easily.
1012 InstructionCost Cost = 0;
1013
1014 // Possible IT instruction for Thumb2, or more for Thumb1.
1015 ++Cost;
1016
1017 // i1 values may need rematerialising by using mov immediates and/or
1018 // flag setting instructions.
1019 if (ValTy->isIntegerTy(1))
1020 ++Cost;
1021
1022 return Cost;
1023 }
1024
1025 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1026 // instead. Hopefully when min/max intrinsics are more prevalent this code
1027 // will not be needed.
1028 const Instruction *Sel = I;
1029 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1030 Sel->hasOneUse())
1031 Sel = cast<Instruction>(Sel->user_back());
1032 if (Sel && ValTy->isVectorTy() &&
1033 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1034 const Value *LHS, *RHS;
1035 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
1036 unsigned IID = 0;
1037 switch (SPF) {
1038 case SPF_ABS:
1039 IID = Intrinsic::abs;
1040 break;
1041 case SPF_SMIN:
1042 IID = Intrinsic::smin;
1043 break;
1044 case SPF_SMAX:
1045 IID = Intrinsic::smax;
1046 break;
1047 case SPF_UMIN:
1048 IID = Intrinsic::umin;
1049 break;
1050 case SPF_UMAX:
1051 IID = Intrinsic::umax;
1052 break;
1053 case SPF_FMINNUM:
1054 IID = Intrinsic::minnum;
1055 break;
1056 case SPF_FMAXNUM:
1057 IID = Intrinsic::maxnum;
1058 break;
1059 default:
1060 break;
1061 }
1062 if (IID) {
1063 // The ICmp is free, the select gets the cost of the min/max/etc
1064 if (Sel != I)
1065 return 0;
1066 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1067 return getIntrinsicInstrCost(CostAttrs, CostKind);
1068 }
1069 }
1070
1071 // On NEON a vector select gets lowered to vbsl.
1072 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1073 // Lowering of some vector selects is currently far from perfect.
1074 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1075 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1076 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1077 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1078 };
1079
1080 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1081 EVT SelValTy = TLI->getValueType(DL, ValTy);
1082 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1083 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1084 SelCondTy.getSimpleVT(),
1085 SelValTy.getSimpleVT()))
1086 return Entry->Cost;
1087 }
1088
1089 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1090 return LT.first;
1091 }
1092
1093 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1094 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1095 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1096 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1097 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1098 if (!VecCondTy)
1099 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1100
1101 // If we don't have mve.fp, any fp operations will need to be scalarized.
1102 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1103 // One scalarization insert, one scalarization extract and the cost of the
1104 // fcmps.
1105 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1106 /*Extract*/ true, CostKind) +
1107 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1108 /*Extract*/ false, CostKind) +
1109 VecValTy->getNumElements() *
1110 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1111 VecCondTy->getScalarType(), VecPred,
1112 CostKind, Op1Info, Op2Info, I);
1113 }
1114
1115 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1116 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1117 // There are two types - the input that specifies the type of the compare
1118 // and the output vXi1 type. Because we don't know how the output will be
1119 // split, we may need an expensive shuffle to get two in sync. This has the
1120 // effect of making larger than legal compares (v8i32 for example)
1121 // expensive.
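// e.g. a v8i32 compare legalizes to two v4i32 compares but yields a single
// v8i1 predicate, so the two predicate halves have to be recombined.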
1122 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1123 if (LT.first > 1)
1124 return LT.first * BaseCost +
1125 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1126 /*Extract*/ false, CostKind);
1127 return BaseCost;
1128 }
1129 }
1130
1131 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1132 // for "multiple beats" potentially needed by MVE instructions.
1133 int BaseCost = 1;
1134 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1135 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1136
1137 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1138 CostKind, Op1Info, Op2Info, I);
1139}
1140
1143 const SCEV *Ptr,
1145 // Address computations in vectorized code with non-consecutive addresses will
1146 // likely result in more instructions compared to scalar code where the
1147 // computation can more often be merged into the index mode. The resulting
1148 // extra micro-ops can significantly decrease throughput.
1149 unsigned NumVectorInstToHideOverhead = 10;
1150 int MaxMergeDistance = 64;
1151
1152 if (ST->hasNEON()) {
1153 if (PtrTy->isVectorTy() && SE &&
1154 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1155 return NumVectorInstToHideOverhead;
1156
1157 // In many cases the address computation is not merged into the instruction
1158 // addressing mode.
1159 return 1;
1160 }
1161 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1162}
1163
1164bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1165 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
1166 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1167 // optimized, else LSR may block tail-predication.
1168 switch (II->getIntrinsicID()) {
1169 case Intrinsic::arm_mve_vctp8:
1170 case Intrinsic::arm_mve_vctp16:
1171 case Intrinsic::arm_mve_vctp32:
1172 case Intrinsic::arm_mve_vctp64:
1173 return true;
1174 default:
1175 break;
1176 }
1177 }
1178 return false;
1179}
1180
1181bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1182 unsigned /*AddressSpace*/,
1183 TTI::MaskKind /*MaskKind*/) const {
1184 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1185 return false;
1186
1187 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1188 // Don't support v2i1 yet.
1189 if (VecTy->getNumElements() == 2)
1190 return false;
1191
1192 // We don't support extending fp types.
1193 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1194 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1195 return false;
1196 }
1197
1198 unsigned EltWidth = DataTy->getScalarSizeInBits();
1199 return (EltWidth == 32 && Alignment >= 4) ||
1200 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1201}
1202
1203bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1204 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1205 return false;
1206
1207 unsigned EltWidth = Ty->getScalarSizeInBits();
1208 return ((EltWidth == 32 && Alignment >= 4) ||
1209 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1210}
1211
1212/// Given a memcpy/memset/memmove instruction, return the number of memory
1213/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1214/// call is used.
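/// For example, a 16-byte memcpy that legalizes to four i32 operations is
/// reported as 4 * 2 = 8, since each element is both loaded and stored.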
1215int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1216 MemOp MOp;
1217 unsigned DstAddrSpace = ~0u;
1218 unsigned SrcAddrSpace = ~0u;
1219 const Function *F = I->getParent()->getParent();
1220
1221 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1222 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1223 // If 'size' is not a constant, a library call will be generated.
1224 if (!C)
1225 return -1;
1226
1227 const unsigned Size = C->getValue().getZExtValue();
1228 const Align DstAlign = MC->getDestAlign().valueOrOne();
1229 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1230
1231 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1232 /*IsVolatile*/ false);
1233 DstAddrSpace = MC->getDestAddressSpace();
1234 SrcAddrSpace = MC->getSourceAddressSpace();
1235 }
1236 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1237 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1238 // If 'size' is not a constant, a library call will be generated.
1239 if (!C)
1240 return -1;
1241
1242 const unsigned Size = C->getValue().getZExtValue();
1243 const Align DstAlign = MS->getDestAlign().valueOrOne();
1244
1245 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1246 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1247 DstAddrSpace = MS->getDestAddressSpace();
1248 }
1249 else
1250 llvm_unreachable("Expected a memcpy/move or memset!");
1251
1252 unsigned Limit, Factor = 2;
1253 switch(I->getIntrinsicID()) {
1254 case Intrinsic::memcpy:
1255 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1256 break;
1257 case Intrinsic::memmove:
1258 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1259 break;
1260 case Intrinsic::memset:
1261 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1262 Factor = 1;
1263 break;
1264 default:
1265 llvm_unreachable("Expected a memcpy/move or memset!");
1266 }
1267
1268 // MemOps will be populated with a list of data types that need to be
1269 // loaded and stored. That's why we multiply the number of elements by 2 to
1270 // get the cost for this memcpy.
1271 std::vector<EVT> MemOps;
1272 LLVMContext &C = F->getContext();
1273 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1274 SrcAddrSpace, F->getAttributes()))
1275 return MemOps.size() * Factor;
1276
1277 // If we can't find an optimal memop lowering, return the default cost
1278 return -1;
1279}
1280
1281InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1282 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1283
1284 // To model the cost of a library call, we assume 1 for the call, and
1285 // 3 for the argument setup.
1286 if (NumOps == -1)
1287 return 4;
1288 return NumOps;
1289}
1290
1291InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1292 VectorType *DstTy, VectorType *SrcTy,
1293 ArrayRef<int> Mask,
1294 TTI::TargetCostKind CostKind,
1295 int Index, VectorType *SubTp,
1296 ArrayRef<const Value *> Args,
1297 const Instruction *CxtI) const {
1298 assert((Mask.empty() || DstTy->isScalableTy() ||
1299 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1300 "Expected the Mask to match the return size if given");
1301 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1302 "Expected the same scalar types");
1303
1304 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1305 // Treat extractsubvector as single op permutation.
1306 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1307 if (IsExtractSubvector)
1308 Kind = TTI::SK_PermuteSingleSrc;
1309 if (ST->hasNEON()) {
1310 if (Kind == TTI::SK_Broadcast) {
1311 static const CostTblEntry NEONDupTbl[] = {
1312 // VDUP handles these cases.
1313 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1314 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1315 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1319
1320 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1321 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1322 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1323 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1324
1325 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1326 if (const auto *Entry =
1327 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1328 return LT.first * Entry->Cost;
1329 }
1330 if (Kind == TTI::SK_Reverse) {
1331 static const CostTblEntry NEONShuffleTbl[] = {
1332 // Reverse shuffles cost one instruction if we are shuffling within a
1333 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1334 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1335 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1336 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1337 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1338 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1339 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1340
1341 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1342 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1343 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1344 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1345
1346 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1347 if (const auto *Entry =
1348 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1349 return LT.first * Entry->Cost;
1350 }
1351 if (Kind == TTI::SK_Select) {
1352 static const CostTblEntry NEONSelShuffleTbl[] = {
1353 // Select shuffle cost table for ARM. Cost is the number of instructions
1354 // required to create the shuffled vector.
1356
1357 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1358 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1359 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1360 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1361
1362 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1363 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1364 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1365
1366 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1367
1368 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1369
1370 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1371 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1372 ISD::VECTOR_SHUFFLE, LT.second))
1373 return LT.first * Entry->Cost;
1374 }
1375 }
1376 if (ST->hasMVEIntegerOps()) {
1377 if (Kind == TTI::SK_Broadcast) {
1378 static const CostTblEntry MVEDupTbl[] = {
1379 // VDUP handles these cases.
1380 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1381 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1382 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1383 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1384 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1385
1386 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1387 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1388 LT.second))
1389 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1390 }
1391
1392 if (!Mask.empty()) {
1393 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1394 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1395 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1396 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1397 // higher cost than just the load.
1398 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1399 (LT.second.getScalarSizeInBits() == 8 ||
1400 LT.second.getScalarSizeInBits() == 16 ||
1401 LT.second.getScalarSizeInBits() == 32) &&
1402 LT.second.getSizeInBits() == 128 &&
1403 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1405 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1407 return ST->getMVEVectorCostFactor(CostKind) *
1408 std::max<InstructionCost>(1, LT.first / 4);
1409
1410 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1411 // store(interleaving-shuffle). The shuffle cost could potentially be
1412 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1413 // higher cost than just the store.
1414 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1415 (LT.second.getScalarSizeInBits() == 8 ||
1416 LT.second.getScalarSizeInBits() == 16 ||
1417 LT.second.getScalarSizeInBits() == 32) &&
1418 LT.second.getSizeInBits() == 128 &&
1419 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1421 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1422 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1424 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1425 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1426
1427 if (LT.second.isVector() &&
1428 Mask.size() <= LT.second.getVectorNumElements() &&
1429 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1430 isVREVMask(Mask, LT.second, 64)))
1431 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1432 }
1433 }
1434
1435 // Restore optimal kind.
1436 if (IsExtractSubvector)
1437 Kind = TTI::SK_ExtractSubvector;
1438 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1439 ? ST->getMVEVectorCostFactor(CostKind)
1440 : 1;
1441 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1442 Index, SubTp);
1443}
1444
1445InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1446 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1447 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1448 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1449 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1450 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1451 // Make operations on i1 relatively expensive as this often involves
1452 // combining predicates. AND and XOR should be easier to handle with IT
1453 // blocks.
1454 switch (ISDOpcode) {
1455 default:
1456 break;
1457 case ISD::AND:
1458 case ISD::XOR:
1459 return 2;
1460 case ISD::OR:
1461 return 3;
1462 }
1463 }
1464
1465 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1466
1467 if (ST->hasNEON()) {
1468 const unsigned FunctionCallDivCost = 20;
1469 const unsigned ReciprocalDivCost = 10;
1470 static const CostTblEntry CostTbl[] = {
1471 // Division.
1472 // These costs are somewhat random. Choose a cost of 20 to indicate that
1473 // vectorizing division (added function call) is going to be very expensive.
1474 // Double registers types.
1475 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1476 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1477 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1478 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1479 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1480 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1481 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1482 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1483 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1484 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1485 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1486 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1487 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1488 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1489 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1490 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1491 // Quad register types.
1492 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1493 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1494 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1495 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1496 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1497 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1498 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1499 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1500 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1501 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1502 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1503 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1504 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1505 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1506 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1507 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1508 // Multiplication.
1509 };
1510
1511 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1512 return LT.first * Entry->Cost;
1513
1514 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1515 Opcode, Ty, CostKind, Op1Info, Op2Info);
1516
1517 // This is somewhat of a hack. The problem that we are facing is that SROA
1518 // creates a sequence of shift, and, or instructions to construct values.
1519 // These sequences are recognized by ISel and have zero cost. Not so for
1520 // the vectorized code. Because we have support for v2i64 but not i64 those
1521 // sequences look particularly beneficial to vectorize.
1522 // To work around this we increase the cost of v2i64 operations to make them
1523 // seem less beneficial.
1524 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1525 Cost += 4;
1526
1527 return Cost;
1528 }
1529
1530 // If this operation is a shift on arm/thumb2, it might well be folded into
1531 // the following instruction, hence having a cost of 0.
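// e.g. "add r0, r1, r2, lsl #2" performs the shift as part of the add's
// flexible second operand, so the shift itself costs nothing.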
1532 auto LooksLikeAFreeShift = [&]() {
1533 if (ST->isThumb1Only() || Ty->isVectorTy())
1534 return false;
1535
1536 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1537 return false;
1538 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1539 return false;
1540
1541 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1542 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1543 case Instruction::Add:
1544 case Instruction::Sub:
1545 case Instruction::And:
1546 case Instruction::Xor:
1547 case Instruction::Or:
1548 case Instruction::ICmp:
1549 return true;
1550 default:
1551 return false;
1552 }
1553 };
1554 if (LooksLikeAFreeShift())
1555 return 0;
1556
1557 // When targets have both DSP and MVE we find that the compiler will attempt
1558 // to vectorize as well as using scalar (S/U)MLAL operations. For the pattern
1559 // ext(mul(ext(i16), ext(i16))) codegen performs better when only using
1560 // (S/U)MLAL scalar ops instead of trying to mix vector ops with (S/U)MLAL
1561 // ops. We therefore check if a mul instruction is used in a (U/S)MLAL
1562 // pattern.
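// e.g. "acc64 += sext(i16 a) * sext(i16 b)" can map onto a single scalar
// SMLALBB-style multiply-accumulate on DSP-enabled cores.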
1564 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1565 Type *Ty) -> bool {
1566 if (!ST->hasDSP())
1567 return false;
1568
1569 if (!I)
1570 return false;
1571
1572 if (Opcode != Instruction::Mul)
1573 return false;
1574
1575 if (Ty->isVectorTy())
1576 return false;
1577
1578 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1579 return cast<Instruction>(LHS)->getOpcode() ==
1580 cast<Instruction>(RHS)->getOpcode();
1581 };
1582 auto IsExtInst = [](const Value *V) -> bool {
1583 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1584 };
1585 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1586 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1587 };
1588
1589 // We check the arguments of the instruction to see if they're extends
1590 auto *BinOp = dyn_cast<BinaryOperator>(I);
1591 if (!BinOp)
1592 return false;
1593 Value *Op0 = BinOp->getOperand(0);
1594 Value *Op1 = BinOp->getOperand(1);
1595 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1596 // We're interested in an ext of an i16
1597 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1598 !IsExtensionFromHalf(Op1))
1599 return false;
1600 // We need to check if this result will be further extended to i64
1601 // and that all these uses are SExt
1602 for (auto *U : I->users())
1603 if (!IsExtInst(U))
1604 return false;
1605 return true;
1606 }
1607
1608 return false;
1609 };
1610
1611 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1612 return 0;
1613
1614 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1615 // for "multiple beats" potentially needed by MVE instructions.
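// Illustrative annotation (not part of the original source): MVE executes a
// 128-bit vector operation over several "beats" on smaller cores, so
// getMVEVectorCostFactor typically reports a throughput factor greater than
// one (for example 2) instead of the scalar default of 1 used here.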
1616 int BaseCost = 1;
1617 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1618 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1619
1620 // The rest of this mostly follows what is done in
1621 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1622 // than scalars or increasing the costs for custom operations. The result
1623 // is also multiplied by the MVEVectorCostFactor where appropriate.
1624 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1625 return LT.first * BaseCost;
1626
1627 // Else this is expand, assume that we need to scalarize this op.
1628 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1629 unsigned Num = VTy->getNumElements();
1630 InstructionCost Cost =
1631 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1632 // Return the cost of multiple scalar invocation plus the cost of
1633 // inserting and extracting the values.
1634 SmallVector<Type *> Tys(Args.size(), Ty);
1635 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1636 Num * Cost;
1637 }
1638
1639 return BaseCost;
1640}
1641
1642InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1643 Align Alignment,
1644 unsigned AddressSpace,
1645 TTI::TargetCostKind CostKind,
1646 TTI::OperandValueInfo OpInfo,
1647 const Instruction *I) const {
1648 // TODO: Handle other cost kinds.
1649 if (CostKind != TTI::TCK_RecipThroughput)
1650 return 1;
1651
1652 // Type legalization can't handle structs
1653 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1654 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1655 CostKind);
1656
1657 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1658 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1659 // Unaligned loads/stores are extremely inefficient.
1660 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1662 return LT.first * 4;
1663 }
1664
1665 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1666 // Same for stores.
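// Illustrative annotation (not part of the original source): the shape being
// matched, assuming a <4 x half> load whose single user widens to
// <4 x float> (and the mirrored fptrunc-before-store case):
//   %v = load <4 x half>, ptr %p, align 2
//   %w = fpext <4 x half> %v to <4 x float>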
1667 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1668 ((Opcode == Instruction::Load && I->hasOneUse() &&
1669 isa<FPExtInst>(*I->user_begin())) ||
1670 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1671 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1672 Type *DstTy =
1673 Opcode == Instruction::Load
1674 ? (*I->user_begin())->getType()
1675 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1676 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1677 DstTy->getScalarType()->isFloatTy())
1678 return ST->getMVEVectorCostFactor(CostKind);
1679 }
1680
1681 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1682 ? ST->getMVEVectorCostFactor(CostKind)
1683 : 1;
1684 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1685 CostKind, OpInfo, I);
1686}
1687
1691 switch (MICA.getID()) {
1692 case Intrinsic::masked_scatter:
1693 case Intrinsic::masked_gather:
1694 return getGatherScatterOpCost(MICA, CostKind);
1695 case Intrinsic::masked_load:
1696 case Intrinsic::masked_store:
1697 return getMaskedMemoryOpCost(MICA, CostKind);
1698 }
1700}
1701
1702 InstructionCost
1703ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1704 TTI::TargetCostKind CostKind) const {
1705 unsigned IID = MICA.getID();
1706 Type *Src = MICA.getDataType();
1707 Align Alignment = MICA.getAlignment();
1708 unsigned AddressSpace = MICA.getAddressSpace();
1709 if (ST->hasMVEIntegerOps()) {
1710 if (IID == Intrinsic::masked_load &&
1711 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1712 return ST->getMVEVectorCostFactor(CostKind);
1713 if (IID == Intrinsic::masked_store &&
1714 isLegalMaskedStore(Src, Alignment, AddressSpace))
1715 return ST->getMVEVectorCostFactor(CostKind);
1716 }
1717 if (!isa<FixedVectorType>(Src))
1718 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
1719 // Scalar cost, which is currently very high due to the inefficiency of
1720 // the generated code.
1721 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1722}
1723
1724InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1725 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1726 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1727 bool UseMaskForCond, bool UseMaskForGaps) const {
1728 assert(Factor >= 2 && "Invalid interleave factor");
1729 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1730
1731 // vldN/vstN doesn't support vector types of i64/f64 element.
1732 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1733
1734 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1735 !UseMaskForCond && !UseMaskForGaps) {
1736 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1737 auto *SubVecTy =
1738 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1739
1740 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1741 // Accesses having vector types that are a multiple of 128 bits can be
1742 // matched to more than one vldN/vstN instruction.
1743 int BaseCost =
1744 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1745 if (NumElts % Factor == 0 &&
1746 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1747 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1748
1749 // Some smaller than legal interleaved patterns are cheap as we can make
1750 // use of the vmovn or vrev patterns to interleave a standard load. This is
1751 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1752 // promoted differently). The cost of 2 here is then a load and vrev or
1753 // vmovn.
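// Illustrative annotation (not part of the original source): e.g. a factor-2
// deinterleave of a <8 x i8> access fits in 64 bits, so it can be done as one
// ordinary load plus a vrev/vmovn style rearrangement rather than a full
// VLD2, which is what the flat "2 * BaseCost" below models.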
1754 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1755 VecTy->isIntOrIntVectorTy() &&
1756 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1757 return 2 * BaseCost;
1758 }
1759
1760 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1761 Alignment, AddressSpace, CostKind,
1762 UseMaskForCond, UseMaskForGaps);
1763}
1764
1765 InstructionCost
1766ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1767 TTI::TargetCostKind CostKind) const {
1768
1769 Type *DataTy = MICA.getDataType();
1770 const Value *Ptr = MICA.getPointer();
1771 bool VariableMask = MICA.getVariableMask();
1772 Align Alignment = MICA.getAlignment();
1773 const Instruction *I = MICA.getInst();
1774
1775 using namespace PatternMatch;
1776 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1777 return BaseT::getGatherScatterOpCost(MICA, CostKind);
1778
1779 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1780 auto *VTy = cast<FixedVectorType>(DataTy);
1781
1782 // TODO: Splitting, once we do that.
1783
1784 unsigned NumElems = VTy->getNumElements();
1785 unsigned EltSize = VTy->getScalarSizeInBits();
1786 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1787
1788 // For now, it is assumed that for the MVE gather instructions the loads are
1789 // all effectively serialised. This means the cost is the scalar cost
1790 // multiplied by the number of elements being loaded. This is possibly very
1791 // conservative, but even so we still end up vectorising loops because the
1792 // cost per iteration for many loops is lower than for scalar loops.
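// Illustrative annotation (not part of the original source): e.g. for a
// v4i32 gather with LT.first == 1 and an assumed MVE cost factor of 2, the
// formula below gives VectorCost = 4 * 1 * 2 = 8.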
1793 InstructionCost VectorCost =
1794 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1795 // The scalarization cost should be a lot higher. We use the number of vector
1796 // elements plus the scalarization overhead. If masking is required then a lot
1797 // of little blocks will be needed and potentially a scalarized p0 mask,
1798 // greatly increasing the cost.
1799 InstructionCost ScalarCost =
1800 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1801 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1802 CostKind) +
1803 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1804 CostKind);
1805
1806 if (EltSize < 8 || Alignment < EltSize / 8)
1807 return ScalarCost;
1808
1809 unsigned ExtSize = EltSize;
1810 // Check whether there's a single user that asks for an extended type
1811 if (I != nullptr) {
1812 // Depending on the caller of this function, a gather instruction will
1813 // either have opcode Instruction::Load or be a call to the masked_gather
1814 // intrinsic
1815 if ((I->getOpcode() == Instruction::Load ||
1816 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1817 I->hasOneUse()) {
1818 const User *Us = *I->users().begin();
1819 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1820 // only allow valid type combinations
1821 unsigned TypeSize =
1822 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1823 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1824 (TypeSize == 16 && EltSize == 8)) &&
1825 TypeSize * NumElems == 128) {
1826 ExtSize = TypeSize;
1827 }
1828 }
1829 }
1830 // Check whether the input data needs to be truncated
1831 TruncInst *T;
1832 if ((I->getOpcode() == Instruction::Store ||
1833 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1834 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1835 // Only allow valid type combinations
1836 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1837 if (((EltSize == 16 && TypeSize == 32) ||
1838 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1839 TypeSize * NumElems == 128)
1840 ExtSize = TypeSize;
1841 }
1842 }
1843
1844 if (ExtSize * NumElems != 128 || NumElems < 4)
1845 return ScalarCost;
1846
1847 // Any (aligned) i32 gather will not need to be scalarised.
1848 if (ExtSize == 32)
1849 return VectorCost;
1850 // For smaller types, we need to ensure that the gep's inputs are correctly
1851 // extended from a small enough value. Other sizes (including i64) are
1852 // scalarized for now.
1853 if (ExtSize != 8 && ExtSize != 16)
1854 return ScalarCost;
1855
1856 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1857 Ptr = BC->getOperand(0);
1858 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1859 if (GEP->getNumOperands() != 2)
1860 return ScalarCost;
1861 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1862 // Scale needs to be correct (which is only relevant for i16s).
1863 if (Scale != 1 && Scale * 8 != ExtSize)
1864 return ScalarCost;
1865 // And we need to zext (not sext) the indexes from a small enough type.
1866 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1867 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1868 return VectorCost;
1869 }
1870 return ScalarCost;
1871 }
1872 return ScalarCost;
1873}
1874
1875 InstructionCost
1876ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1877 std::optional<FastMathFlags> FMF,
1878 TTI::TargetCostKind CostKind) const {
1879
1880 EVT ValVT = TLI->getValueType(DL, ValTy);
1881 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1882 unsigned EltSize = ValVT.getScalarSizeInBits();
1883
1884 // In general floating point reductions are a series of elementwise
1885 // operations, with free extracts on each step. These are either in-order or
1886 // treewise depending on whether that is allowed by the fast math flags.
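// Illustrative annotation (not part of the original source): e.g. an
// unordered v8f32 fadd reduction with MVE (VecLimit = 128) takes one v4f32
// halving step in the loop below, then the remaining 4 lanes are costed as
// scalar fadds: VecCost(v4f32 fadd) + 4 * Cost(scalar fadd).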
1887 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1888 ((EltSize == 32 && ST->hasVFP2Base()) ||
1889 (EltSize == 64 && ST->hasFP64()) ||
1890 (EltSize == 16 && ST->hasFullFP16()))) {
1891 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1892 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1893 InstructionCost VecCost = 0;
1894 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1895 NumElts * EltSize > VecLimit) {
1896 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1897 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1898 NumElts /= 2;
1899 }
1900
1901 // For fp16 we need to extract the upper lane elements. MVE can add a
1902 // VREV+FMIN/MAX to perform another vector step instead.
1903 InstructionCost ExtractCost = 0;
1904 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1905 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1906 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1907 NumElts /= 2;
1908 } else if (ValVT.getVectorElementType() == MVT::f16)
1909 ExtractCost = NumElts / 2;
1910
1911 return VecCost + ExtractCost +
1912 NumElts *
1913 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1914 }
1915
1916 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1917 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1918 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1919 unsigned VecLimit =
1920 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1921 InstructionCost VecCost = 0;
1922 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1923 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1924 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1925 NumElts /= 2;
1926 }
1927 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1928 // step.
1929 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1930 NumElts * EltSize == 64) {
1931 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1932 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1933 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1934 NumElts /= 2;
1935 }
1936
1937 // From here we extract the elements and perform the and/or/xor.
1938 InstructionCost ExtractCost = NumElts;
1939 return VecCost + ExtractCost +
1940 (NumElts - 1) * getArithmeticInstrCost(
1941 Opcode, ValTy->getElementType(), CostKind);
1942 }
1943
1944 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1945 TTI::requiresOrderedReduction(FMF))
1946 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1947
1948 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1949
1950 static const CostTblEntry CostTblAdd[]{
1951 {ISD::ADD, MVT::v16i8, 1},
1952 {ISD::ADD, MVT::v8i16, 1},
1953 {ISD::ADD, MVT::v4i32, 1},
1954 };
1955 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1956 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1957
1958 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1959}
1960
1961InstructionCost ARMTTIImpl::getExtendedReductionCost(
1962 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1963 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1964 EVT ValVT = TLI->getValueType(DL, ValTy);
1965 EVT ResVT = TLI->getValueType(DL, ResTy);
1966
1967 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1968
1969 switch (ISD) {
1970 case ISD::ADD:
1971 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1972 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1973
1974 // The legal cases are:
1975 // VADDV u/s 8/16/32
1976 // VADDLV u/s 32
1977 // Codegen currently cannot always handle larger than legal vectors very
1978 // well, especially for predicated reductions where the mask needs to be
1979 // split, so restrict to 128bit or smaller input types.
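// Illustrative annotation (not part of the original source): e.g. an i64 add
// reduction of zero-extended <4 x i32> elements corresponds to a single
// VADDLV.U32, which is why v4i32 with a result of up to 64 bits is accepted.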
1980 unsigned RevVTSize = ResVT.getSizeInBits();
1981 if (ValVT.getSizeInBits() <= 128 &&
1982 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1983 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1984 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1985 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1986 }
1987 break;
1988 default:
1989 break;
1990 }
1991 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1992 CostKind);
1993}
1994
1995 InstructionCost
1996ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1997 Type *ResTy, VectorType *ValTy,
1998 TTI::TargetCostKind CostKind) const {
1999 if (RedOpcode != Instruction::Add)
2001 EVT ValVT = TLI->getValueType(DL, ValTy);
2002 EVT ResVT = TLI->getValueType(DL, ResTy);
2003
2004 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2005 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2006
2007 // The legal cases are:
2008 // VMLAV u/s 8/16/32
2009 // VMLALV u/s 16/32
2010 // Codegen currently cannot always handle larger than legal vectors very
2011 // well, especially for predicated reductions where the mask needs to be
2012 // split, so restrict to 128bit or smaller input types.
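// Illustrative annotation (not part of the original source): e.g. reducing
// mul(zext <8 x i16>, zext <8 x i16>) into a 32- or 64-bit accumulator
// corresponds to VMLAV.U16/VMLALV.U16, matching the v8i16 entry below.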
2013 unsigned RevVTSize = ResVT.getSizeInBits();
2014 if (ValVT.getSizeInBits() <= 128 &&
2015 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2016 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2017 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2018 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2019 }
2020
2021 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
2022 CostKind);
2023}
2024
2025 InstructionCost
2026ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2027 FastMathFlags FMF,
2028 TTI::TargetCostKind CostKind) const {
2029 EVT ValVT = TLI->getValueType(DL, Ty);
2030
2031 // In general floating point reductions are a series of elementwise
2032 // operations, with free extracts on each step. These are either in-order or
2033 // treewise depending on whether that is allowed by the fast math flags.
2034 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2035 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2036 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2037 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2038 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
2039 unsigned EltSize = ValVT.getScalarSizeInBits();
2040 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2041 InstructionCost VecCost;
2042 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
2043 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
2044 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2045 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2046 NumElts /= 2;
2047 }
2048
2049 // For fp16 we need to extract the upper lane elements. MVE can add a
2050 // VREV+FMIN/MAX to perform another vector step instead.
2051 InstructionCost ExtractCost = 0;
2052 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2053 NumElts == 8) {
2054 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2055 NumElts /= 2;
2056 } else if (ValVT.getVectorElementType() == MVT::f16)
2057 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
2058
2059 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2060 {Ty->getElementType(), Ty->getElementType()},
2061 FMF);
2062 return VecCost + ExtractCost +
2063 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2064 }
2065
2066 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2067 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2068 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2069
2070 // All costs are the same for u/s min/max. These lower to vminv, which are
2071 // given a slightly higher cost as they tend to take multiple cycles for
2072 // smaller type sizes.
2073 static const CostTblEntry CostTblAdd[]{
2074 {ISD::SMIN, MVT::v16i8, 4},
2075 {ISD::SMIN, MVT::v8i16, 3},
2076 {ISD::SMIN, MVT::v4i32, 2},
2077 };
2078 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2079 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2080 }
2081
2082 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2083}
2084
2085 InstructionCost
2086ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2087 TTI::TargetCostKind CostKind) const {
2088 unsigned Opc = ICA.getID();
2089 switch (Opc) {
2090 case Intrinsic::get_active_lane_mask:
2091 // Currently we make a somewhat optimistic assumption that
2092 // active_lane_mask's are always free. In reality it may be freely folded
2093 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2094 // of add/icmp code. We may need to improve this in the future, but being
2095 // able to detect if it is free or not involves looking at a lot of other
2096 // code. We currently assume that the vectorizer inserted these, and knew
2097 // what it was doing in adding one.
2098 if (ST->hasMVEIntegerOps())
2099 return 0;
2100 break;
2101 case Intrinsic::sadd_sat:
2102 case Intrinsic::ssub_sat:
2103 case Intrinsic::uadd_sat:
2104 case Intrinsic::usub_sat: {
2105 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2106 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2107 Type *RetTy = ICA.getReturnType();
2108
2109 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2110 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2111 return 1; // qadd / qsub
2112 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2113 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2114 // Otherwise return the cost of expanding the node. Generally an add +
2115 // icmp + sel.
2116 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2117 Type *CondTy = RetTy->getWithNewBitWidth(1);
2118 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2119 RetTy, CostKind) +
2120 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2121 CostKind) +
2122 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2123 CostKind);
2124 }
2125
2126 if (!ST->hasMVEIntegerOps())
2127 break;
2128
2129 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2130 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2131 LT.second == MVT::v16i8) {
2132 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2133 // need to extend the type, as it uses shr(qadd(shl, shl)).
2134 unsigned Instrs =
2135 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2136 : 4;
2137 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2138 }
2139 break;
2140 }
2141 case Intrinsic::abs:
2142 case Intrinsic::smin:
2143 case Intrinsic::smax:
2144 case Intrinsic::umin:
2145 case Intrinsic::umax: {
2146 if (!ST->hasMVEIntegerOps())
2147 break;
2148 Type *VT = ICA.getReturnType();
2149
2150 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2151 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2152 LT.second == MVT::v16i8)
2153 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2154 break;
2155 }
2156 case Intrinsic::minnum:
2157 case Intrinsic::maxnum: {
2158 if (!ST->hasMVEFloatOps())
2159 break;
2160 Type *VT = ICA.getReturnType();
2161 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2162 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2163 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2164 break;
2165 }
2166 case Intrinsic::fptosi_sat:
2167 case Intrinsic::fptoui_sat: {
2168 if (ICA.getArgTypes().empty())
2169 break;
2170 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2171 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2172 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2173 // Check for the legal types, with the correct subtarget features.
2174 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2175 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2176 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2177 return LT.first;
2178
2179 // Equally for MVE vector types
2180 if (ST->hasMVEFloatOps() &&
2181 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2182 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2183 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2184
2185 // If we can, use a legal convert followed by a min+max.
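// Illustrative annotation (not part of the original source): e.g.
// llvm.fptosi.sat.i16.f32 can be modelled as one legal f32->i32 convert plus
// an smin/smax clamp to the i16 range, which is what the block below sums.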
2186 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2187 (ST->hasFP64() && LT.second == MVT::f64) ||
2188 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2189 (ST->hasMVEFloatOps() &&
2190 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2191 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2192 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2193 LT.second.getScalarSizeInBits());
2194 InstructionCost Cost =
2195 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2196 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2197 : Intrinsic::umin,
2198 LegalTy, {LegalTy, LegalTy});
2199 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2200 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2201 : Intrinsic::umax,
2202 LegalTy, {LegalTy, LegalTy});
2203 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2204 return LT.first * Cost;
2205 }
2206 // Otherwise we need to follow the default expansion that clamps the value
2207 // using a float min/max with a fcmp+sel for nan handling when signed.
2208 Type *FPTy = ICA.getArgTypes()[0];
2209 Type *RetTy = ICA.getReturnType();
2210 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2211 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2212 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2213 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2214 Cost +=
2215 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2216 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2217 if (IsSigned) {
2218 Type *CondTy = RetTy->getWithNewBitWidth(1);
2219 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2220 CmpInst::FCMP_UNO, CostKind);
2221 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2222 CmpInst::FCMP_UNO, CostKind);
2223 }
2224 return Cost;
2225 }
2226 }
2227
2228 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2229}
2230
2231bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2232 if (!F->isIntrinsic())
2233 return BaseT::isLoweredToCall(F);
2234
2235 // Assume all Arm-specific intrinsics map to an instruction.
2236 if (F->getName().starts_with("llvm.arm"))
2237 return false;
2238
2239 switch (F->getIntrinsicID()) {
2240 default: break;
2241 case Intrinsic::powi:
2242 case Intrinsic::sin:
2243 case Intrinsic::cos:
2244 case Intrinsic::sincos:
2245 case Intrinsic::pow:
2246 case Intrinsic::log:
2247 case Intrinsic::log10:
2248 case Intrinsic::log2:
2249 case Intrinsic::exp:
2250 case Intrinsic::exp2:
2251 return true;
2252 case Intrinsic::sqrt:
2253 case Intrinsic::fabs:
2254 case Intrinsic::copysign:
2255 case Intrinsic::floor:
2256 case Intrinsic::ceil:
2257 case Intrinsic::trunc:
2258 case Intrinsic::rint:
2259 case Intrinsic::nearbyint:
2260 case Intrinsic::round:
2261 case Intrinsic::canonicalize:
2262 case Intrinsic::lround:
2263 case Intrinsic::llround:
2264 case Intrinsic::lrint:
2265 case Intrinsic::llrint:
2266 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2267 return true;
2268 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2269 return true;
2270 // Some operations can be handled by vector instructions and assume
2271 // unsupported vectors will be expanded into supported scalar ones.
2272 // TODO Handle scalar operations properly.
2273 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2274 case Intrinsic::masked_store:
2275 case Intrinsic::masked_load:
2276 case Intrinsic::masked_gather:
2277 case Intrinsic::masked_scatter:
2278 return !ST->hasMVEIntegerOps();
2279 case Intrinsic::sadd_with_overflow:
2280 case Intrinsic::uadd_with_overflow:
2281 case Intrinsic::ssub_with_overflow:
2282 case Intrinsic::usub_with_overflow:
2283 case Intrinsic::sadd_sat:
2284 case Intrinsic::uadd_sat:
2285 case Intrinsic::ssub_sat:
2286 case Intrinsic::usub_sat:
2287 return false;
2288 }
2289
2290 return BaseT::isLoweredToCall(F);
2291}
2292
2293bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2294 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2295 EVT VT = TLI->getValueType(DL, I.getType(), true);
2296 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2297 return true;
2298
2299 // Check if an intrinsic will be lowered to a call and assume that any
2300 // other CallInst will generate a bl.
2301 if (auto *Call = dyn_cast<CallInst>(&I)) {
2302 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2303 switch(II->getIntrinsicID()) {
2304 case Intrinsic::memcpy:
2305 case Intrinsic::memset:
2306 case Intrinsic::memmove:
2307 return getNumMemOps(II) == -1;
2308 default:
2309 if (const Function *F = Call->getCalledFunction())
2310 return isLoweredToCall(F);
2311 }
2312 }
2313 return true;
2314 }
2315
2316 // FPv5 provides conversions between integer, double-precision,
2317 // single-precision, and half-precision formats.
2318 switch (I.getOpcode()) {
2319 default:
2320 break;
2321 case Instruction::FPToSI:
2322 case Instruction::FPToUI:
2323 case Instruction::SIToFP:
2324 case Instruction::UIToFP:
2325 case Instruction::FPTrunc:
2326 case Instruction::FPExt:
2327 return !ST->hasFPARMv8Base();
2328 }
2329
2330 // FIXME: Unfortunately the approach of checking the Operation Action does
2331 // not catch all cases of Legalization that use library calls. Our
2332 // Legalization step categorizes some transformations into library calls as
2333 // Custom, Expand or even Legal when doing type legalization. So for now
2334 // we have to special case for instance the SDIV of 64bit integers and the
2335 // use of floating point emulation.
2336 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2337 switch (ISD) {
2338 default:
2339 break;
2340 case ISD::SDIV:
2341 case ISD::UDIV:
2342 case ISD::SREM:
2343 case ISD::UREM:
2344 case ISD::SDIVREM:
2345 case ISD::UDIVREM:
2346 return true;
2347 }
2348 }
2349
2350 // Assume all other non-float operations are supported.
2351 if (!VT.isFloatingPoint())
2352 return false;
2353
2354 // We'll need a library call to handle most floats when using soft.
2355 if (TLI->useSoftFloat()) {
2356 switch (I.getOpcode()) {
2357 default:
2358 return true;
2359 case Instruction::Alloca:
2360 case Instruction::Load:
2361 case Instruction::Store:
2362 case Instruction::Select:
2363 case Instruction::PHI:
2364 return false;
2365 }
2366 }
2367
2368 // We'll need a libcall to perform double precision operations on a single
2369 // precision only FPU.
2370 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2371 return true;
2372
2373 // Likewise for half precision arithmetic.
2374 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2375 return true;
2376
2377 return false;
2378}
2379
2380bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2381 AssumptionCache &AC,
2382 TargetLibraryInfo *LibInfo,
2383 HardwareLoopInfo &HWLoopInfo) const {
2384 // Low-overhead branches are only supported in the 'low-overhead branch'
2385 // extension of v8.1-m.
2386 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2387 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2388 return false;
2389 }
2390
2391 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2392 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2393 return false;
2394 }
2395
2396 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2397 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2398 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2399 return false;
2400 }
2401
2402 const SCEV *TripCountSCEV =
2403 SE.getAddExpr(BackedgeTakenCount,
2404 SE.getOne(BackedgeTakenCount->getType()));
2405
2406 // We need to store the trip count in LR, a 32-bit register.
2407 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2408 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2409 return false;
2410 }
2411
2412 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2413 // point in generating a hardware loop if that's going to happen.
2414
2415 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2416 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2417 switch (Call->getIntrinsicID()) {
2418 default:
2419 break;
2420 case Intrinsic::start_loop_iterations:
2421 case Intrinsic::test_start_loop_iterations:
2422 case Intrinsic::loop_decrement:
2423 case Intrinsic::loop_decrement_reg:
2424 return true;
2425 }
2426 }
2427 return false;
2428 };
2429
2430 // Scan the instructions to see if there's any that we know will turn into a
2431 // call or if this loop is already a low-overhead loop or will become a tail
2432 // predicated loop.
2433 bool IsTailPredLoop = false;
2434 auto ScanLoop = [&](Loop *L) {
2435 for (auto *BB : L->getBlocks()) {
2436 for (auto &I : *BB) {
2437 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2438 isa<InlineAsm>(I)) {
2439 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2440 return false;
2441 }
2442 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2443 IsTailPredLoop |=
2444 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2445 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2446 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2447 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2448 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2449 }
2450 }
2451 return true;
2452 };
2453
2454 // Visit inner loops.
2455 for (auto *Inner : *L)
2456 if (!ScanLoop(Inner))
2457 return false;
2458
2459 if (!ScanLoop(L))
2460 return false;
2461
2462 // TODO: Check whether the trip count calculation is expensive. If L is the
2463 // inner loop but we know it has a low trip count, calculating that trip
2464 // count (in the parent loop) may be detrimental.
2465
2466 LLVMContext &C = L->getHeader()->getContext();
2467 HWLoopInfo.CounterInReg = true;
2468 HWLoopInfo.IsNestingLegal = false;
2469 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2470 HWLoopInfo.CountType = Type::getInt32Ty(C);
2471 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2472 return true;
2473}
2474
2475static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2476 // We don't allow icmp's, and because we only look at single block loops,
2477 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2478 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2479 return false;
2480 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2481 // not currently canonical, but soon will be. Code without them uses icmp, and
2482 // so is not tail predicated as per the condition above. In order to get the
2483 // same performance we treat min and max the same as an icmp for tailpred
2484 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2485 // pick more optimal instructions like VQDMULH. They need to be recognized
2486 // directly by the vectorizer).
2487 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2488 if ((II->getIntrinsicID() == Intrinsic::smin ||
2489 II->getIntrinsicID() == Intrinsic::smax ||
2490 II->getIntrinsicID() == Intrinsic::umin ||
2491 II->getIntrinsicID() == Intrinsic::umax) &&
2492 ++ICmpCount > 1)
2493 return false;
2494
2495 if (isa<FCmpInst>(&I))
2496 return false;
2497
2498 // We could allow extending/narrowing FP loads/stores, but codegen is
2499 // too inefficient so reject this for now.
2500 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2501 return false;
2502
2503 // Extends have to be extending-loads
2504 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2505 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2506 return false;
2507
2508 // Truncs have to be narrowing-stores
2509 if (isa<TruncInst>(&I) )
2510 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2511 return false;
2512
2513 return true;
2514}
2515
2516// To set up a tail-predicated loop, we need to know the total number of
2517// elements processed by that loop. Thus, we need to determine the element
2518// size and:
2519// 1) it should be uniform for all operations in the vector loop, so we
2520// e.g. don't want any widening/narrowing operations.
2521// 2) it should be smaller than i64s because we don't have vector operations
2522// that work on i64s.
2523// 3) we don't want elements to be reversed or shuffled, to make sure the
2524// tail-predication masks/predicates the right lanes.
2525//
2526static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2527 const DataLayout &DL,
2528 const LoopAccessInfo *LAI,
2529 const DominatorTree &DT) {
2530 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2531
2532 // If there are live-out values, it is probably a reduction. We can predicate
2533 // most reduction operations freely under MVE using a combination of
2534 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2535 // floating point and integer reductions, but don't check for operators
2536 // specifically here. If the value ends up not being a reduction (and so the
2537 // vectorizer cannot tailfold the loop), we should fall back to standard
2538 // vectorization automatically.
2539 SmallVector<Instruction *, 8> LiveOuts;
2540 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2541 bool ReductionsDisabled =
2542 EnableTailPredication == TailPredication::EnabledNoReductions ||
2543 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2544
2545 for (auto *I : LiveOuts) {
2546 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2547 !I->getType()->isHalfTy()) {
2548 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2549 "live-out value\n");
2550 return false;
2551 }
2552 if (ReductionsDisabled) {
2553 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2554 return false;
2555 }
2556 }
2557
2558 // Next, check that all instructions can be tail-predicated.
2559 PredicatedScalarEvolution PSE = LAI->getPSE();
2560 int ICmpCount = 0;
2561
2562 for (BasicBlock *BB : L->blocks()) {
2563 for (Instruction &I : BB->instructionsWithoutDebug()) {
2564 if (isa<PHINode>(&I))
2565 continue;
2566 if (!canTailPredicateInstruction(I, ICmpCount)) {
2567 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2568 return false;
2569 }
2570
2571 Type *T = I.getType();
2572 if (T->getScalarSizeInBits() > 32) {
2573 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2574 return false;
2575 }
2576 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2577 Value *Ptr = getLoadStorePointerOperand(&I);
2578 Type *AccessTy = getLoadStoreType(&I);
2579 int64_t NextStride =
2580 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2581 if (NextStride == 1) {
2582 // TODO: for now only allow consecutive strides of 1. We could support
2583 // other strides as long as it is uniform, but let's keep it simple
2584 // for now.
2585 continue;
2586 } else if (NextStride == -1 ||
2587 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2588 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2589 LLVM_DEBUG(dbgs()
2590 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2591 "be tail-predicated.\n");
2592 return false;
2593 // TODO: don't tail predicate if there is a reversed load?
2594 } else if (EnableMaskedGatherScatters) {
2595 // Gather/scatters do allow loading from arbitrary strides, at
2596 // least if they are loop invariant.
2597 // TODO: Loop variant strides should in theory work, too, but
2598 // this requires further testing.
2599 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2600 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2601 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2602 if (PSE.getSE()->isLoopInvariant(Step, L))
2603 continue;
2604 }
2605 }
2606 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2607 "tail-predicate\n.");
2608 return false;
2609 }
2610 }
2611 }
2612
2613 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2614 return true;
2615}
2616
2617bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2618 if (!EnableTailPredication) {
2619 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2620 return false;
2621 }
2622
2623 // Creating a predicated vector loop is the first step for generating a
2624 // tail-predicated hardware loop, for which we need the MVE masked
2625 // load/stores instructions:
2626 if (!ST->hasMVEIntegerOps())
2627 return false;
2628
2629 LoopVectorizationLegality *LVL = TFI->LVL;
2630 Loop *L = LVL->getLoop();
2631
2632 // For now, restrict this to single block loops.
2633 if (L->getNumBlocks() > 1) {
2634 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2635 "loop.\n");
2636 return false;
2637 }
2638
2639 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2640
2641 LoopInfo *LI = LVL->getLoopInfo();
2642 HardwareLoopInfo HWLoopInfo(L);
2643 if (!HWLoopInfo.canAnalyze(*LI)) {
2644 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2645 "analyzable.\n");
2646 return false;
2647 }
2648
2649 AssumptionCache *AC = LVL->getAssumptionCache();
2650 ScalarEvolution *SE = LVL->getScalarEvolution();
2651
2652 // This checks if we have the low-overhead branch architecture
2653 // extension, and if we will create a hardware-loop:
2654 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2655 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2656 "profitable.\n");
2657 return false;
2658 }
2659
2660 DominatorTree *DT = LVL->getDominatorTree();
2661 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2662 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2663 "a candidate.\n");
2664 return false;
2665 }
2666
2667 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2668 *LVL->getDominatorTree());
2669}
2670
2671TailFoldingStyle
2672ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2673 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2674 return TailFoldingStyle::DataWithoutLaneMask;
2675
2676 // Intrinsic @llvm.get.active.lane.mask is supported.
2677 // It is used in the MVETailPredication pass, which requires the number of
2678 // elements processed by this vector loop to setup the tail-predicated
2679 // loop.
2680 return TailFoldingStyle::DataAndControlFlow;
2681}
2682void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2683 TTI::UnrollingPreferences &UP,
2684 OptimizationRemarkEmitter *ORE) const {
2685 // Enable Upper bound unrolling universally, providing that we do not see an
2686 // active lane mask, which will be better kept as a loop to become tail
2687 // predicated than to be conditionally unrolled.
2688 UP.UpperBound =
2689 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2690 return isa<IntrinsicInst>(I) &&
2691 cast<IntrinsicInst>(I).getIntrinsicID() ==
2692 Intrinsic::get_active_lane_mask;
2693 });
2694
2695 // Only currently enable these preferences for M-Class cores.
2696 if (!ST->isMClass())
2697 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2698
2699 // Disable loop unrolling for Oz and Os.
2700 UP.OptSizeThreshold = 0;
2701 UP.PartialOptSizeThreshold = 0;
2702 if (L->getHeader()->getParent()->hasOptSize())
2703 return;
2704
2705 SmallVector<BasicBlock*, 4> ExitingBlocks;
2706 L->getExitingBlocks(ExitingBlocks);
2707 LLVM_DEBUG(dbgs() << "Loop has:\n"
2708 << "Blocks: " << L->getNumBlocks() << "\n"
2709 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2710
2711 // Only allow another exit other than the latch. This acts as an early exit
2712 // as it mirrors the profitability calculation of the runtime unroller.
2713 if (ExitingBlocks.size() > 2)
2714 return;
2715
2716 // Limit the CFG of the loop body for targets with a branch predictor.
2717 // Allowing 4 blocks permits if-then-else diamonds in the body.
2718 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2719 return;
2720
2721 // Don't unroll vectorized loops, including the remainder loop
2722 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2723 return;
2724
2725 // Scan the loop: don't unroll loops with calls as this could prevent
2726 // inlining.
2727 InstructionCost Cost = 0;
2728 for (auto *BB : L->getBlocks()) {
2729 for (auto &I : *BB) {
2730 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2731 // scalar code.
2732 if (I.getType()->isVectorTy())
2733 return;
2734
2735 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2736 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2737 if (!isLoweredToCall(F))
2738 continue;
2739 }
2740 return;
2741 }
2742
2743 SmallVector<const Value*, 4> Operands(I.operand_values());
2744 Cost += getInstructionCost(&I, Operands,
2745 TargetTransformInfo::TCK_SizeAndLatency);
2746 }
2747 }
2748
2749 // On v6m cores, there are very few registers available. We can easily end up
2750 // spilling and reloading more registers in an unrolled loop. Look at the
2751 // number of LCSSA phis as a rough measure of how many registers will need to
2752 // be live out of the loop, reducing the default unroll count if more than 1
2753 // value is needed. In the long run, all of this should be learnt by a
2754 // machine.
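// Illustrative annotation (not part of the original source): with the default
// count of 4, two live-out LCSSA values reduce it to 2, and three or more
// make UnrollCount <= 1 below, which disables the partial/runtime unrolling
// this function would otherwise enable.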
2755 unsigned UnrollCount = 4;
2756 if (ST->isThumb1Only()) {
2757 unsigned ExitingValues = 0;
2758 SmallVector<BasicBlock *, 4> ExitBlocks;
2759 L->getExitBlocks(ExitBlocks);
2760 for (auto *Exit : ExitBlocks) {
2761 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2762 // only the last is expected to be needed for address operands.
2763 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2764 return PH.getNumOperands() != 1 ||
2765 !isa<GetElementPtrInst>(PH.getOperand(0));
2766 });
2767 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2768 }
2769 if (ExitingValues)
2770 UnrollCount /= ExitingValues;
2771 if (UnrollCount <= 1)
2772 return;
2773 }
2774
2775 // For processors with low overhead branching (LOB), runtime unrolling the
2776 // innermost loop is often detrimental to performance. In these cases the loop
2777 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2778 // deeply nested loops get executed multiple times, negating the benefits of
2779 // LOB. This is particularly noticeable when the loop trip count of the
2780 // innermost loop varies within the outer loop, such as in the case of
2781 // triangular matrix decompositions. In these cases we will prefer to not
2782 // unroll the innermost loop, with the intention for it to be executed as a
2783 // low overhead loop.
2784 bool Runtime = true;
2785 if (ST->hasLOB()) {
2786 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2787 const auto *BETC = SE.getBackedgeTakenCount(L);
2788 auto *Outer = L->getOutermostLoop();
2789 if ((L != Outer && Outer != L->getParentLoop()) ||
2790 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2791 Runtime = false;
2792 }
2793 }
2794 }
2795
2796 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2797 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2798
2799 UP.Partial = true;
2800 UP.Runtime = Runtime;
2801 UP.UnrollRemainder = true;
2802 UP.DefaultUnrollRuntimeCount = UnrollCount;
2803 UP.UnrollAndJam = true;
2804 UP.UnrollAndJamInnerLoopThreshold = 60;
2805
2806 // Force unrolling small loops can be very useful because of the branch
2807 // taken cost of the backedge.
2808 if (Cost < ArmForceUnrollThreshold)
2809 UP.Force = true;
2810}
2811
2816
2817bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2818 if (!ST->hasMVEIntegerOps())
2819 return false;
2820
2821 unsigned ScalarBits = Ty->getScalarSizeInBits();
2822 switch (Kind) {
2823 case RecurKind::Add:
2824 return ScalarBits <= 64;
2825 default:
2826 return false;
2827 }
2828}
2829
2830bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2831 if (!ST->hasMVEIntegerOps())
2832 return false;
2833 return true;
2834}
2835
2836InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2837 StackOffset BaseOffset,
2838 bool HasBaseReg, int64_t Scale,
2839 unsigned AddrSpace) const {
2840 TargetLoweringBase::AddrMode AM;
2841 AM.BaseGV = BaseGV;
2842 AM.BaseOffs = BaseOffset.getFixed();
2843 AM.HasBaseReg = HasBaseReg;
2844 AM.Scale = Scale;
2845 AM.ScalableOffset = BaseOffset.getScalable();
2846 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2847 if (ST->hasFPAO())
2848 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2849 return 0;
2850 }
2851 return InstructionCost::getInvalid();
2852}
2853
2854bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2855 if (Thumb) {
2856 // B.W is available in any Thumb2-supporting target, and also in every
2857 // version of Armv8-M, even Baseline which does not include the rest of
2858 // Thumb2.
2859 return ST->isThumb2() || ST->hasV8MBaselineOps();
2860 } else {
2861 // B is available in all versions of the Arm ISA, so the only question is
2862 // whether that ISA is available at all.
2863 return ST->hasARMOps();
2864 }
2865}
2866
2867/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2868/// of the vector elements.
2869static bool areExtractExts(Value *Ext1, Value *Ext2) {
2870 using namespace PatternMatch;
2871
2872 auto areExtDoubled = [](Instruction *Ext) {
2873 return Ext->getType()->getScalarSizeInBits() ==
2874 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2875 };
2876
2877 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2878 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2879 !areExtDoubled(cast<Instruction>(Ext1)) ||
2880 !areExtDoubled(cast<Instruction>(Ext2)))
2881 return false;
2882
2883 return true;
2884}
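// Illustrative annotation (not part of the original source): the check above
// accepts operand pairs such as
//   %e1 = sext <8 x i8> %a to <8 x i16>
//   %e2 = sext <8 x i8> %b to <8 x i16>
// where each result element is exactly twice the width of its source, the
// shape NEON vaddl/vsubl expect.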
2885
2886/// Check if sinking \p I's operands to I's basic block is profitable, because
2887/// the operands can be folded into a target instruction, e.g.
2888/// sext/zext can be folded into vsubl.
2889bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2890 SmallVectorImpl<Use *> &Ops) const {
2891 using namespace PatternMatch;
2892
2893 if (!I->getType()->isVectorTy())
2894 return false;
2895
2896 if (ST->hasNEON()) {
2897 switch (I->getOpcode()) {
2898 case Instruction::Sub:
2899 case Instruction::Add: {
2900 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2901 return false;
2902 Ops.push_back(&I->getOperandUse(0));
2903 Ops.push_back(&I->getOperandUse(1));
2904 return true;
2905 }
2906 default:
2907 return false;
2908 }
2909 }
2910
2911 if (!ST->hasMVEIntegerOps())
2912 return false;
2913
2914 auto IsFMSMul = [&](Instruction *I) {
2915 if (!I->hasOneUse())
2916 return false;
2917 auto *Sub = cast<Instruction>(*I->users().begin());
2918 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2919 };
2920 auto IsFMS = [&](Instruction *I) {
2921 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2922 match(I->getOperand(1), m_FNeg(m_Value())))
2923 return true;
2924 return false;
2925 };
2926
2927 auto IsSinker = [&](Instruction *I, int Operand) {
2928 switch (I->getOpcode()) {
2929 case Instruction::Add:
2930 case Instruction::Mul:
2931 case Instruction::FAdd:
2932 case Instruction::ICmp:
2933 case Instruction::FCmp:
2934 return true;
2935 case Instruction::FMul:
2936 return !IsFMSMul(I);
2937 case Instruction::Sub:
2938 case Instruction::FSub:
2939 case Instruction::Shl:
2940 case Instruction::LShr:
2941 case Instruction::AShr:
2942 return Operand == 1;
2943 case Instruction::Call:
2944 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2945 switch (II->getIntrinsicID()) {
2946 case Intrinsic::fma:
2947 return !IsFMS(I);
2948 case Intrinsic::sadd_sat:
2949 case Intrinsic::uadd_sat:
2950 case Intrinsic::arm_mve_add_predicated:
2951 case Intrinsic::arm_mve_mul_predicated:
2952 case Intrinsic::arm_mve_qadd_predicated:
2953 case Intrinsic::arm_mve_vhadd:
2954 case Intrinsic::arm_mve_hadd_predicated:
2955 case Intrinsic::arm_mve_vqdmull:
2956 case Intrinsic::arm_mve_vqdmull_predicated:
2957 case Intrinsic::arm_mve_vqdmulh:
2958 case Intrinsic::arm_mve_qdmulh_predicated:
2959 case Intrinsic::arm_mve_vqrdmulh:
2960 case Intrinsic::arm_mve_qrdmulh_predicated:
2961 case Intrinsic::arm_mve_fma_predicated:
2962 return true;
2963 case Intrinsic::ssub_sat:
2964 case Intrinsic::usub_sat:
2965 case Intrinsic::arm_mve_sub_predicated:
2966 case Intrinsic::arm_mve_qsub_predicated:
2967 case Intrinsic::arm_mve_hsub_predicated:
2968 case Intrinsic::arm_mve_vhsub:
2969 return Operand == 1;
2970 default:
2971 return false;
2972 }
2973 }
2974 return false;
2975 default:
2976 return false;
2977 }
2978 };
2979
2980 for (auto OpIdx : enumerate(I->operands())) {
2981 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2982 // Make sure we are not already sinking this operand
2983 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2984 continue;
2985
2986 Instruction *Shuffle = Op;
2987 if (Shuffle->getOpcode() == Instruction::BitCast)
2988 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2989 // We are looking for a splat that can be sunk.
2990 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2991 m_ZeroInt()),
2992 m_Undef(), m_ZeroMask())))
2993 continue;
2994 if (!IsSinker(I, OpIdx.index()))
2995 continue;
2996
2997 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2998 // and vector registers
2999 for (Use &U : Op->uses()) {
3000 Instruction *Insn = cast<Instruction>(U.getUser());
3001 if (!IsSinker(Insn, U.getOperandNo()))
3002 return false;
3003 }
3004
3005 Ops.push_back(&Shuffle->getOperandUse(0));
3006 if (Shuffle != Op)
3007 Ops.push_back(&Op->getOperandUse(0));
3008 Ops.push_back(&OpIdx.value());
3009 }
3010 return true;
3011}
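// Illustrative annotation (not part of the original source): the sinking
// above targets splats such as
//   %ins = insertelement <4 x i32> undef, i32 %s, i32 0
//   %spl = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
//   %r   = add <4 x i32> %v, %spl
// so that ISel can select the scalar-operand forms (e.g. vadd.i32 Qd, Qn, Rm)
// instead of keeping %s live in both general-purpose and vector registers.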
3012
3013unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
3014 Type *ArrayType) const {
3015 if (!UseWidenGlobalArrays) {
3016 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3017 return false;
3018 }
3019
3020 // Don't modify non-integer array types
3021 if (!ArrayType || !ArrayType->isArrayTy() ||
3022 !ArrayType->getArrayElementType()->isIntegerTy())
3023 return 0;
3024
3025 // We pad to 4 byte boundaries
3026 if (Size % 4 == 0)
3027 return 0;
3028
3029 unsigned NumBytesToPad = 4 - (Size % 4);
3030 unsigned NewSize = Size + NumBytesToPad;
3031
3032 // Max number of bytes that memcpy allows for lowering to load/stores before
3033 // it uses library function (__aeabi_memcpy).
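// Illustrative annotation (not part of the original source): e.g. a 13-byte
// string gets NumBytesToPad = 3 and NewSize = 16, and the padding is only
// applied if 16 still fits under the inline-memcpy threshold checked below.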
3034 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3035
3036 if (NewSize > MaxMemIntrinsicSize)
3037 return 0;
3038
3039 return NumBytesToPad;
3040}
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
constexpr bool test(unsigned I) const
constexpr size_t size() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
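A minimal sketch tying the IRBuilder helpers above together; BB and V are assumed to be an existing basic block and a scalar i32 value supplied by the surrounding pass, and the intrinsic chosen is purely illustrative.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

static Value *emitSplatCtlz(BasicBlock *BB, Value *V) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB);                     // append to the end of BB
  Value *Splat = Builder.CreateVectorSplat(4, V); // <4 x i32> splat of V
  // llvm.ctlz is mangled on the value type; the i1 'true' sets is_zero_poison.
  return Builder.CreateIntrinsic(Intrinsic::ctlz, {Splat->getType()},
                                 {Splat, Builder.getTrue()});
}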
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
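A sketch of a common use of the ScalarEvolution queries above: turning a loop's backedge-taken count into a trip count. L and SE are assumed to come from the surrounding pass, and the 32-bit check is only an example of what a consumer might do with the result.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool tripCountFitsIn32Bits(const Loop *L, ScalarEvolution &SE) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  SmallVector<const SCEV *, 2> Ops = {BTC, SE.getOne(BTC->getType())};
  const SCEV *TripCount = SE.getAddExpr(Ops); // backedge-taken count + 1
  return SE.getUnsignedRangeMax(TripCount).getActiveBits() <= 32;
}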
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
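A small sketch of the de-interleave check above (assuming these are the ShuffleVectorInst static helpers of the same name); the expected result follows from the mask shape described.

#include "llvm/IR/Instructions.h"
using namespace llvm;

void deinterleaveMaskExample() {
  int Mask[] = {0, 2, 4, 6}; // even lanes of a factor-2 interleaving
  unsigned Index = 0;
  bool IsDeinterleave =
      ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2, Index);
  // Expected: IsDeinterleave == true and Index == 0.
  (void)IsDeinterleave;
}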
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:264
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
Type * getArrayElementType() const
Definition Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
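A short sketch of the PatternMatch combinators listed above; V is an assumed value being inspected, and the pattern chosen here is purely illustrative.

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isAddOfExtend(Value *V) {
  Value *X = nullptr, *Y = nullptr;
  // Matches "add X, (zext|sext Y)" with the operands in either order.
  return match(V, m_c_Add(m_Value(X), m_ZExtOrSExt(m_Value(Y))));
}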
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
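A sketch of the usual cost-table idiom built on CostTableLookup; the entries below are illustrative, not the ARM backend's real costs.

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

// Illustrative (opcode, type, cost) triples.
static const CostTblEntry ExampleCostTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::SHL, MVT::v8i16, 2},
};

unsigned lookupExampleCost(int ISDOpcode, MVT VT) {
  if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISDOpcode, VT))
    return Entry->Cost;
  return 1; // default when the table has no entry for this (opcode, type) pair
}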
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
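A tiny sketch of enumerate(), which pairs each element with its index and avoids a manual counter.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

void printMask(const SmallVectorImpl<int> &Mask) {
  for (const auto &E : enumerate(Mask))
    dbgs() << E.index() << " -> " << E.value() << "\n";
}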
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
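A sketch of using matchSelectPattern together with the SelectPatternFlavor values above; V is an assumed select-like value.

#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

static bool isSignedMaxIdiom(Value *V) {
  Value *LHS, *RHS;
  SelectPatternResult SPR = matchSelectPattern(V, LHS, RHS);
  return SPR.Flavor == SPF_SMAX; // e.g. "select (icmp sgt a, b), a, b"
}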
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
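A sketch of querying getPtrStride with the signature shown above; all of the parameters are assumed to be supplied by the surrounding loop analysis.

#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

static bool isUnitStrideAccess(PredicatedScalarEvolution &PSE, Type *AccessTy,
                               Value *Ptr, const Loop *Lp,
                               const DominatorTree &DT) {
  std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp, DT);
  return Stride && *Stride == 1; // consecutive forward access
}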
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
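A sketch of the usual candidacy checks made with HardwareLoopInfo before a target commits to a hardware loop; L, SE, LI and DT are assumed to come from the caller, and the HardwareLoopInfo(Loop *) constructor is an assumption of this sketch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

static bool mayUseHardwareLoop(Loop *L, ScalarEvolution &SE, LoopInfo &LI,
                               DominatorTree &DT) {
  HardwareLoopInfo HWLoopInfo(L); // assumed Loop* constructor
  if (!HWLoopInfo.canAnalyze(LI))
    return false;
  return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}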
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
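A sketch of how a target's getUnrollingPreferences hook might populate a few of the fields above; the values are illustrative defaults, not the ARM backend's actual tuning.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

void setExampleUnrollPrefs(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may unroll using the trip-count upper bound
  UP.UnrollRemainder = true;        // unroll the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4; // default count for runtime-unrolled loops
}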