Bug Summary

File: lib/Target/ARM/ARMTargetTransformInfo.cpp
Warning: line 221, column 16
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
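
The diagnostic refers to the C++ rule that shifting a value by an amount greater than or equal to its bit width is undefined behaviour. As a minimal, self-contained sketch of this problem class (hypothetical values, not taken from the analyzed build), a 32-bit 'unsigned int' may only be shifted by amounts 0..31:

    #include <cstdio>

    // Returns v << amt, but only when the shift amount is defined for a
    // 32-bit unsigned value; otherwise falls back to 0.
    unsigned shiftedOrZero(unsigned v, unsigned amt) {
      return amt < 32 ? (v << amt) : 0u;   // guard avoids the undefined shift
    }

    int main() {
      std::printf("%u\n", shiftedOrZero(~255U, 8));   // defined: amount < width
      std::printf("%u\n", shiftedOrZero(~255U, 32));  // would be UB without the guard
    }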

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-7~svn329677/lib/Target/ARM -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn329677/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0/backward -internal-isystem /usr/include/clang/7.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/lib/Target/ARM -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-04-11-031539-24776-1 -x c++ /build/llvm-toolchain-snapshot-7~svn329677/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "ARMTargetTransformInfo.h"
11#include "ARMSubtarget.h"
12#include "MCTargetDesc/ARMAddressingModes.h"
13#include "llvm/ADT/APInt.h"
14#include "llvm/ADT/SmallVector.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/CodeGen/CostTable.h"
17#include "llvm/CodeGen/ISDOpcodes.h"
18#include "llvm/CodeGen/ValueTypes.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/CallSite.h"
21#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/DerivedTypes.h"
23#include "llvm/IR/Instruction.h"
24#include "llvm/IR/Instructions.h"
25#include "llvm/IR/Type.h"
26#include "llvm/MC/SubtargetFeature.h"
27#include "llvm/Support/Casting.h"
28#include "llvm/Support/MachineValueType.h"
29#include "llvm/Target/TargetMachine.h"
30#include <algorithm>
31#include <cassert>
32#include <cstdint>
33#include <utility>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "armtti"
38
39bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
40 const Function *Callee) const {
41 const TargetMachine &TM = getTLI()->getTargetMachine();
42 const FeatureBitset &CallerBits =
43 TM.getSubtargetImpl(*Caller)->getFeatureBits();
44 const FeatureBitset &CalleeBits =
45 TM.getSubtargetImpl(*Callee)->getFeatureBits();
46
47 // To inline a callee, all features not in the whitelist must match exactly.
48 bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
49 (CalleeBits & ~InlineFeatureWhitelist);
50 // For features in the whitelist, the callee's features must be a subset of
51 // the callers'.
52 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
53 (CalleeBits & InlineFeatureWhitelist);
54 return MatchExact && MatchSubset;
55}
56
57int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
58 assert(Ty->isIntegerTy());
59
60 unsigned Bits = Ty->getPrimitiveSizeInBits();
61 if (Bits == 0 || Imm.getActiveBits() >= 64)
7) Assuming 'Bits' is not equal to 0
8) Taking false branch
62 return 4;
63
64 int64_t SImmVal = Imm.getSExtValue();
65 uint64_t ZImmVal = Imm.getZExtValue();
66 if (!ST->isThumb()) {
9) Assuming the condition is false
10) Taking false branch
67 if ((SImmVal >= 0 && SImmVal < 65536) ||
68 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
69 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
70 return 1;
71 return ST->hasV6T2Ops() ? 2 : 3;
72 }
73 if (ST->isThumb2()) {
11) Assuming the condition is false
12) Taking false branch
74 if ((SImmVal >= 0 && SImmVal < 65536) ||
75 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
76 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
77 return 1;
78 return ST->hasV6T2Ops() ? 2 : 3;
79 }
80 // Thumb1.
81 if (SImmVal >= 0 && SImmVal < 256)
13) Assuming 'SImmVal' is < 0
82 return 1;
83 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
14) Calling 'isThumbImmShiftedVal'
84 return 2;
85 // Load from constantpool.
86 return 3;
87}
88
89// Constants smaller than 256 fit in the immediate field of
90// Thumb1 instructions so we return a zero cost and 1 otherwise.
91int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
92 const APInt &Imm, Type *Ty) {
93 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
94 return 0;
95
96 return 1;
97}
98
99int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
100 Type *Ty) {
101 // Division by a constant can be turned into multiplication, but only if we
102 // know it's constant. So it's not so much that the immediate is cheap (it's
103 // not), but that the alternative is worse.
104 // FIXME: this is probably unneeded with GlobalISel.
105 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
1) Assuming 'Opcode' is equal to SDiv
3) Taking false branch
106 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
107 Idx == 1)
2) Assuming 'Idx' is not equal to 1
108 return 0;
109
110 if (Opcode == Instruction::And)
4) Taking false branch
111 // Conversion to BIC is free, and means we can use ~Imm instead.
112 return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
113
114 if (Opcode == Instruction::Add)
5) Taking false branch
115 // Conversion to SUB is free, and means we can use -Imm instead.
116 return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
117
118 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
119 Ty->getIntegerBitWidth() == 32) {
120 int64_t NegImm = -Imm.getSExtValue();
121 if (ST->isThumb2() && NegImm < 1<<12)
122 // icmp X, #-C -> cmn X, #C
123 return 0;
124 if (ST->isThumb() && NegImm < 1<<8)
125 // icmp X, #-C -> adds X, #C
126 return 0;
127 }
128
129 // xor a, -1 can always be folded to MVN
130 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
131 return 0;
132
133 return getIntImmCost(Imm, Ty);
6) Calling 'ARMTTIImpl::getIntImmCost'
134}
135
136int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
137 const Instruction *I) {
138 int ISD = TLI->InstructionOpcodeToISD(Opcode);
139 assert(ISD && "Invalid opcode");
140
141 // Single to/from double precision conversions.
142 static const CostTblEntry NEONFltDblTbl[] = {
143 // Vector fptrunc/fpext conversions.
144 { ISD::FP_ROUND, MVT::v2f64, 2 },
145 { ISD::FP_EXTEND, MVT::v2f32, 2 },
146 { ISD::FP_EXTEND, MVT::v4f32, 4 }
147 };
148
149 if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
150 ISD == ISD::FP_EXTEND)) {
151 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
152 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
153 return LT.first * Entry->Cost;
154 }
155
156 EVT SrcTy = TLI->getValueType(DL, Src);
157 EVT DstTy = TLI->getValueType(DL, Dst);
158
159 if (!SrcTy.isSimple() || !DstTy.isSimple())
160 return BaseT::getCastInstrCost(Opcode, Dst, Src);
161
162 // Some arithmetic, load and store operations have specific instructions
163 // to cast up/down their types automatically at no extra cost.
164 // TODO: Get these tables to know at least what the related operations are.
165 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
166 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
167 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
168 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
169 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
170 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
171 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
172
173 // The number of vmovl instructions for the extension.
174 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
175 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
176 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
177 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
178 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
179 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
180 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
181 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
182 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
183 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
184
185 // Operations that we legalize using splitting.
186 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
187 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
188
189 // Vector float <-> i32 conversions.
190 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
191 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
192
193 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
194 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
195 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
196 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
197 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
198 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
199 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
200 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
201 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
202 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
203 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
204 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
205 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
206 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
207 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
208 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
209 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
210 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
211 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
212 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
213
214 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
215 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
216 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
217 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
218 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
219 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
220
221 // Vector double <-> i32 conversions.
222 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
223 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
224
225 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
226 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
227 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
228 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
229 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
230 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
231
232 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
233 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
234 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
235 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
236 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
237 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
238 };
239
240 if (SrcTy.isVector() && ST->hasNEON()) {
241 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
242 DstTy.getSimpleVT(),
243 SrcTy.getSimpleVT()))
244 return Entry->Cost;
245 }
246
247 // Scalar float to integer conversions.
248 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
249 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
250 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
251 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
252 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
253 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
254 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
255 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
256 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
257 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
258 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
259 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
260 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
261 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
262 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
263 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
264 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
265 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
266 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
267 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
268 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
269 };
270 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
271 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
272 DstTy.getSimpleVT(),
273 SrcTy.getSimpleVT()))
274 return Entry->Cost;
275 }
276
277 // Scalar integer to float conversions.
278 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
279 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
280 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
281 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
282 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
283 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
284 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
285 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
286 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
287 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
288 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
289 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
290 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
291 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
292 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
293 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
294 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
295 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
296 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
297 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
298 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
299 };
300
301 if (SrcTy.isInteger() && ST->hasNEON()) {
302 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
303 ISD, DstTy.getSimpleVT(),
304 SrcTy.getSimpleVT()))
305 return Entry->Cost;
306 }
307
308 // Scalar integer conversion costs.
309 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
310 // i16 -> i64 requires two dependent operations.
311 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
312
313 // Truncates on i64 are assumed to be free.
314 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
315 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
316 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
317 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
318 };
319
320 if (SrcTy.isInteger()) {
321 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
322 DstTy.getSimpleVT(),
323 SrcTy.getSimpleVT()))
324 return Entry->Cost;
325 }
326
327 return BaseT::getCastInstrCost(Opcode, Dst, Src);
328}
329
330int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
331 unsigned Index) {
332 // Penalize inserting into an D-subregister. We end up with a three times
333 // lower estimated throughput on swift.
334 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
335 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
336 return 3;
337
338 if ((Opcode == Instruction::InsertElement ||
339 Opcode == Instruction::ExtractElement)) {
340 // Cross-class copies are expensive on many microarchitectures,
341 // so assume they are expensive by default.
342 if (ValTy->getVectorElementType()->isIntegerTy())
343 return 3;
344
345 // Even if it's not a cross class copy, this likely leads to mixing
346 // of NEON and VFP code and should be therefore penalized.
347 if (ValTy->isVectorTy() &&
348 ValTy->getScalarSizeInBits() <= 32)
349 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
350 }
351
352 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
353}
354
355int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
356 const Instruction *I) {
357 int ISD = TLI->InstructionOpcodeToISD(Opcode);
358 // On NEON a vector select gets lowered to vbsl.
359 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
360 // Lowering of some vector selects is currently far from perfect.
361 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
362 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
363 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
364 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
365 };
366
367 EVT SelCondTy = TLI->getValueType(DL, CondTy);
368 EVT SelValTy = TLI->getValueType(DL, ValTy);
369 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
370 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
371 SelCondTy.getSimpleVT(),
372 SelValTy.getSimpleVT()))
373 return Entry->Cost;
374 }
375
376 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
377 return LT.first;
378 }
379
380 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
381}
382
383int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
384 const SCEV *Ptr) {
385 // Address computations in vectorized code with non-consecutive addresses will
386 // likely result in more instructions compared to scalar code where the
387 // computation can more often be merged into the index mode. The resulting
388 // extra micro-ops can significantly decrease throughput.
389 unsigned NumVectorInstToHideOverhead = 10;
390 int MaxMergeDistance = 64;
391
392 if (Ty->isVectorTy() && SE &&
393 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
394 return NumVectorInstToHideOverhead;
395
396 // In many cases the address computation is not merged into the instruction
397 // addressing mode.
398 return 1;
399}
400
401int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
402 Type *SubTp) {
403 // We only handle costs of reverse and alternate shuffles for now.
404 if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
405 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
406
407 if (Kind == TTI::SK_Reverse) {
408 static const CostTblEntry NEONShuffleTbl[] = {
409 // Reverse shuffle cost one instruction if we are shuffling within a
410 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
411 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
412 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
413 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
414 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
415
416 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
417 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
418 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
419 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
420
421 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
422
423 if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
424 LT.second))
425 return LT.first * Entry->Cost;
426
427 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
428 }
429 if (Kind == TTI::SK_Alternate) {
430 static const CostTblEntry NEONAltShuffleTbl[] = {
431 // Alt shuffle cost table for ARM. Cost is the number of instructions
432 // required to create the shuffled vector.
433
434 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
435 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
436 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
437 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
438
439 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
440 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
441 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
442
443 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
444
445 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
446
447 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
448 if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl,
449 ISD::VECTOR_SHUFFLE, LT.second))
450 return LT.first * Entry->Cost;
451 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
452 }
453 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
454}
455
456int ARMTTIImpl::getArithmeticInstrCost(
457 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
458 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
459 TTI::OperandValueProperties Opd2PropInfo,
460 ArrayRef<const Value *> Args) {
461 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
462 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
463
464 const unsigned FunctionCallDivCost = 20;
465 const unsigned ReciprocalDivCost = 10;
466 static const CostTblEntry CostTbl[] = {
467 // Division.
468 // These costs are somewhat random. Choose a cost of 20 to indicate that
469 // vectorizing devision (added function call) is going to be very expensive.
470 // Double registers types.
471 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
472 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
473 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
474 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
475 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
476 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
477 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
478 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
479 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
480 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
481 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
482 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
483 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
484 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
485 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
486 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
487 // Quad register types.
488 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
489 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
490 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
491 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
492 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
493 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
494 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
495 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
496 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
497 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
498 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
499 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
500 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
501 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
502 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
503 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
504 // Multiplication.
505 };
506
507 if (ST->hasNEON())
508 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
509 return LT.first * Entry->Cost;
510
511 int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
512 Opd1PropInfo, Opd2PropInfo);
513
514 // This is somewhat of a hack. The problem that we are facing is that SROA
515 // creates a sequence of shift, and, or instructions to construct values.
516 // These sequences are recognized by the ISel and have zero-cost. Not so for
517 // the vectorized code. Because we have support for v2i64 but not i64 those
518 // sequences look particularly beneficial to vectorize.
519 // To work around this we increase the cost of v2i64 operations to make them
520 // seem less beneficial.
521 if (LT.second == MVT::v2i64 &&
522 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
523 Cost += 4;
524
525 return Cost;
526}
527
528int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
529 unsigned AddressSpace, const Instruction *I) {
530 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
531
532 if (Src->isVectorTy() && Alignment != 16 &&
533 Src->getVectorElementType()->isDoubleTy()) {
534 // Unaligned loads/stores are extremely inefficient.
535 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
536 return LT.first * 4;
537 }
538 return LT.first;
539}
540
541int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
542 unsigned Factor,
543 ArrayRef<unsigned> Indices,
544 unsigned Alignment,
545 unsigned AddressSpace) {
546 assert(Factor >= 2 && "Invalid interleave factor");
547 assert(isa<VectorType>(VecTy) && "Expect a vector type");
548
549 // vldN/vstN doesn't support vector types of i64/f64 element.
550 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
551
552 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
553 unsigned NumElts = VecTy->getVectorNumElements();
554 auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
555
556 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
557 // Accesses having vector types that are a multiple of 128 bits can be
558 // matched to more than one vldN/vstN instruction.
559 if (NumElts % Factor == 0 &&
560 TLI->isLegalInterleavedAccessType(SubVecTy, DL))
561 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
562 }
563
564 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
565 Alignment, AddressSpace);
566}
567
568void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
569 TTI::UnrollingPreferences &UP) {
570 // Only currently enable these preferences for M-Class cores.
571 if (!ST->isMClass())
572 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
573
574 // Disable loop unrolling for Oz and Os.
575 UP.OptSizeThreshold = 0;
576 UP.PartialOptSizeThreshold = 0;
577 if (L->getHeader()->getParent()->optForSize())
578 return;
579
580 // Only enable on Thumb-2 targets.
581 if (!ST->isThumb2())
582 return;
583
584 SmallVector<BasicBlock*, 4> ExitingBlocks;
585 L->getExitingBlocks(ExitingBlocks);
586 DEBUG(dbgs() << "Loop has:\n"
587 << "Blocks: " << L->getNumBlocks() << "\n"
588 << "Exit blocks: " << ExitingBlocks.size() << "\n");
589
590 // Only allow another exit other than the latch. This acts as an early exit
591 // as it mirrors the profitability calculation of the runtime unroller.
592 if (ExitingBlocks.size() > 2)
593 return;
594
595 // Limit the CFG of the loop body for targets with a branch predictor.
596 // Allowing 4 blocks permits if-then-else diamonds in the body.
597 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
598 return;
599
600 // Scan the loop: don't unroll loops with calls as this could prevent
601 // inlining.
602 unsigned Cost = 0;
603 for (auto *BB : L->getBlocks()) {
604 for (auto &I : *BB) {
605 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
606 ImmutableCallSite CS(&I);
607 if (const Function *F = CS.getCalledFunction()) {
608 if (!isLoweredToCall(F))
609 continue;
610 }
611 return;
612 }
613 SmallVector<const Value*, 4> Operands(I.value_op_begin(),
614 I.value_op_end());
615 Cost += getUserCost(&I, Operands);
616 }
617 }
618
619 DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
620
621 UP.Partial = true;
622 UP.Runtime = true;
623 UP.UnrollRemainder = true;
624 UP.DefaultUnrollRuntimeCount = 4;
625
626 // Force unrolling small loops can be very useful because of the branch
627 // taken cost of the backedge.
628 if (Cost < 12)
629 UP.Force = true;
630}

/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h

1//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains the ARM addressing mode implementation stuff.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
15#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
16
17#include "llvm/ADT/APFloat.h"
18#include "llvm/ADT/APInt.h"
19#include "llvm/Support/ErrorHandling.h"
20#include "llvm/Support/MathExtras.h"
21#include <cassert>
22
23namespace llvm {
24
25/// ARM_AM - ARM Addressing Mode Stuff
26namespace ARM_AM {
27 enum ShiftOpc {
28 no_shift = 0,
29 asr,
30 lsl,
31 lsr,
32 ror,
33 rrx
34 };
35
36 enum AddrOpc {
37 sub = 0,
38 add
39 };
40
41 inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
42
43 inline const char *getShiftOpcStr(ShiftOpc Op) {
44 switch (Op) {
45 default: llvm_unreachable("Unknown shift opc!");
46 case ARM_AM::asr: return "asr";
47 case ARM_AM::lsl: return "lsl";
48 case ARM_AM::lsr: return "lsr";
49 case ARM_AM::ror: return "ror";
50 case ARM_AM::rrx: return "rrx";
51 }
52 }
53
54 inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
55 switch (Op) {
56 default: llvm_unreachable("Unknown shift opc!");
57 case ARM_AM::asr: return 2;
58 case ARM_AM::lsl: return 0;
59 case ARM_AM::lsr: return 1;
60 case ARM_AM::ror: return 3;
61 }
62 }
63
64 enum AMSubMode {
65 bad_am_submode = 0,
66 ia,
67 ib,
68 da,
69 db
70 };
71
72 inline const char *getAMSubModeStr(AMSubMode Mode) {
73 switch (Mode) {
74 default: llvm_unreachable("Unknown addressing sub-mode!");
75 case ARM_AM::ia: return "ia";
76 case ARM_AM::ib: return "ib";
77 case ARM_AM::da: return "da";
78 case ARM_AM::db: return "db";
79 }
80 }
81
82 /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
83 ///
84 inline unsigned rotr32(unsigned Val, unsigned Amt) {
85 assert(Amt < 32 && "Invalid rotate amount");
86 return (Val >> Amt) | (Val << ((32-Amt)&31));
87 }
88
89 /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
90 ///
91 inline unsigned rotl32(unsigned Val, unsigned Amt) {
92 assert(Amt < 32 && "Invalid rotate amount");
93 return (Val << Amt) | (Val >> ((32-Amt)&31));
94 }
95
96 //===--------------------------------------------------------------------===//
97 // Addressing Mode #1: shift_operand with registers
98 //===--------------------------------------------------------------------===//
99 //
100 // This 'addressing mode' is used for arithmetic instructions. It can
101 // represent things like:
102 // reg
103 // reg [asr|lsl|lsr|ror|rrx] reg
104 // reg [asr|lsl|lsr|ror|rrx] imm
105 //
106 // This is stored three operands [rega, regb, opc]. The first is the base
107 // reg, the second is the shift amount (or reg0 if not present or imm). The
108 // third operand encodes the shift opcode and the imm if a reg isn't present.
109 //
110 inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
111 return ShOp | (Imm << 3);
112 }
113 inline unsigned getSORegOffset(unsigned Op) { return Op >> 3; }
114 inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); }
115
116 /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
117 /// the 8-bit imm value.
118 inline unsigned getSOImmValImm(unsigned Imm) { return Imm & 0xFF; }
119 /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
120 /// the rotate amount.
121 inline unsigned getSOImmValRot(unsigned Imm) { return (Imm >> 8) * 2; }
122
123 /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
124 /// computing the rotate amount to use. If this immediate value cannot be
125 /// handled with a single shifter-op, determine a good rotate amount that will
126 /// take a maximal chunk of bits out of the immediate.
127 inline unsigned getSOImmValRotate(unsigned Imm) {
128 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
129 // of zero.
130 if ((Imm & ~255U) == 0) return 0;
131
132 // Use CTZ to compute the rotate amount.
133 unsigned TZ = countTrailingZeros(Imm);
134
135 // Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
136 // not 9.
137 unsigned RotAmt = TZ & ~1;
138
139 // If we can handle this spread, return it.
140 if ((rotr32(Imm, RotAmt) & ~255U) == 0)
141 return (32-RotAmt)&31; // HW rotates right, not left.
142
143 // For values like 0xF000000F, we should ignore the low 6 bits, then
144 // retry the hunt.
145 if (Imm & 63U) {
146 unsigned TZ2 = countTrailingZeros(Imm & ~63U);
147 unsigned RotAmt2 = TZ2 & ~1;
148 if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
149 return (32-RotAmt2)&31; // HW rotates right, not left.
150 }
151
152 // Otherwise, we have no way to cover this span of bits with a single
153 // shifter_op immediate. Return a chunk of bits that will be useful to
154 // handle.
155 return (32-RotAmt)&31; // HW rotates right, not left.
156 }
157
158 /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
159 /// into an shifter_operand immediate operand, return the 12-bit encoding for
160 /// it. If not, return -1.
161 inline int getSOImmVal(unsigned Arg) {
162 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
163 // of zero.
164 if ((Arg & ~255U) == 0) return Arg;
165
166 unsigned RotAmt = getSOImmValRotate(Arg);
167
168 // If this cannot be handled with a single shifter_op, bail out.
169 if (rotr32(~255U, RotAmt) & Arg)
170 return -1;
171
172 // Encode this correctly.
173 return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
174 }
175
176 /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
177 /// or'ing together two SOImmVal's.
178 inline bool isSOImmTwoPartVal(unsigned V) {
179 // If this can be handled with a single shifter_op, bail out.
180 V = rotr32(~255U, getSOImmValRotate(V)) & V;
181 if (V == 0)
182 return false;
183
184 // If this can be handled with two shifter_op's, accept.
185 V = rotr32(~255U, getSOImmValRotate(V)) & V;
186 return V == 0;
187 }
188
189 /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
190 /// return the first chunk of it.
191 inline unsigned getSOImmTwoPartFirst(unsigned V) {
192 return rotr32(255U, getSOImmValRotate(V)) & V;
193 }
194
195 /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
196 /// return the second chunk of it.
197 inline unsigned getSOImmTwoPartSecond(unsigned V) {
198 // Mask out the first hunk.
199 V = rotr32(~255U, getSOImmValRotate(V)) & V;
200
201 // Take what's left.
202 assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
203 return V;
204 }
205
206 /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
207 /// by a left shift. Returns the shift amount to use.
208 inline unsigned getThumbImmValShift(unsigned Imm) {
209 // 8-bit (or less) immediates are trivially immediate operand with a shift
210 // of zero.
211 if ((Imm & ~255U) == 0) return 0;
212
213 // Use CTZ to compute the shift amount.
214 return countTrailingZeros(Imm);
215 }
216
217 /// isThumbImmShiftedVal - Return true if the specified value can be obtained
218 /// by left shifting a 8-bit immediate.
219 inline bool isThumbImmShiftedVal(unsigned V) {
220 // If this can be handled with
221 V = (~255U << getThumbImmValShift(V)) & V;
15) The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
222 return V == 0;
223 }
224
225 /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
226 /// by a left shift. Returns the shift amount to use.
227 inline unsigned getThumbImm16ValShift(unsigned Imm) {
228 // 16-bit (or less) immediates are trivially immediate operand with a shift
229 // of zero.
230 if ((Imm & ~65535U) == 0) return 0;
231
232 // Use CTZ to compute the shift amount.
233 return countTrailingZeros(Imm);
234 }
235
236 /// isThumbImm16ShiftedVal - Return true if the specified value can be
237 /// obtained by left shifting a 16-bit immediate.
238 inline bool isThumbImm16ShiftedVal(unsigned V) {
239 // If this can be handled with
240 V = (~65535U << getThumbImm16ValShift(V)) & V;
241 return V == 0;
242 }
243
244 /// getThumbImmNonShiftedVal - If V is a value that satisfies
245 /// isThumbImmShiftedVal, return the non-shiftd value.
246 inline unsigned getThumbImmNonShiftedVal(unsigned V) {
247 return V >> getThumbImmValShift(V);
248 }
249
250
251 /// getT2SOImmValSplat - Return the 12-bit encoded representation
252 /// if the specified value can be obtained by splatting the low 8 bits
253 /// into every other byte or every byte of a 32-bit value. i.e.,
254 /// 00000000 00000000 00000000 abcdefgh control = 0
255 /// 00000000 abcdefgh 00000000 abcdefgh control = 1
256 /// abcdefgh 00000000 abcdefgh 00000000 control = 2
257 /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
258 /// Return -1 if none of the above apply.
259 /// See ARM Reference Manual A6.3.2.
260 inline int getT2SOImmValSplatVal(unsigned V) {
261 unsigned u, Vs, Imm;
262 // control = 0
263 if ((V & 0xffffff00) == 0)
264 return V;
265
266 // If the value is zeroes in the first byte, just shift those off
267 Vs = ((V & 0xff) == 0) ? V >> 8 : V;
268 // Any passing value only has 8 bits of payload, splatted across the word
269 Imm = Vs & 0xff;
270 // Likewise, any passing values have the payload splatted into the 3rd byte
271 u = Imm | (Imm << 16);
272
273 // control = 1 or 2
274 if (Vs == u)
275 return (((Vs == V) ? 1 : 2) << 8) | Imm;
276
277 // control = 3
278 if (Vs == (u | (u << 8)))
279 return (3 << 8) | Imm;
280
281 return -1;
282 }
283
284 /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
285 /// specified value is a rotated 8-bit value. Return -1 if no rotation
286 /// encoding is possible.
287 /// See ARM Reference Manual A6.3.2.
288 inline int getT2SOImmValRotateVal(unsigned V) {
289 unsigned RotAmt = countLeadingZeros(V);
290 if (RotAmt >= 24)
291 return -1;
292
293 // If 'Arg' can be handled with a single shifter_op return the value.
294 if ((rotr32(0xff000000U, RotAmt) & V) == V)
295 return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
296
297 return -1;
298 }
299
300 /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
301 /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
302 /// encoding for it. If not, return -1.
303 /// See ARM Reference Manual A6.3.2.
304 inline int getT2SOImmVal(unsigned Arg) {
305 // If 'Arg' is an 8-bit splat, then get the encoded value.
306 int Splat = getT2SOImmValSplatVal(Arg);
307 if (Splat != -1)
308 return Splat;
309
310 // If 'Arg' can be handled with a single shifter_op return the value.
311 int Rot = getT2SOImmValRotateVal(Arg);
312 if (Rot != -1)
313 return Rot;
314
315 return -1;
316 }
317
318 inline unsigned getT2SOImmValRotate(unsigned V) {
319 if ((V & ~255U) == 0) return 0;
320 // Use CTZ to compute the rotate amount.
321 unsigned RotAmt = countTrailingZeros(V);
322 return (32 - RotAmt) & 31;
323 }
324
325 inline bool isT2SOImmTwoPartVal(unsigned Imm) {
326 unsigned V = Imm;
327 // Passing values can be any combination of splat values and shifter
328 // values. If this can be handled with a single shifter or splat, bail
329 // out. Those should be handled directly, not with a two-part val.
330 if (getT2SOImmValSplatVal(V) != -1)
331 return false;
332 V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
333 if (V == 0)
334 return false;
335
336 // If this can be handled as an immediate, accept.
337 if (getT2SOImmVal(V) != -1) return true;
338
339 // Likewise, try masking out a splat value first.
340 V = Imm;
341 if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
342 V &= ~0xff00ff00U;
343 else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
344 V &= ~0x00ff00ffU;
345 // If what's left can be handled as an immediate, accept.
346 if (getT2SOImmVal(V) != -1) return true;
347
348 // Otherwise, do not accept.
349 return false;
350 }
351
352 inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
353 assert (isT2SOImmTwoPartVal(Imm) &&
354 "Immedate cannot be encoded as two part immediate!");
355 // Try a shifter operand as one part
356 unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
357 // If the rest is encodable as an immediate, then return it.
358 if (getT2SOImmVal(V) != -1) return V;
359
360 // Try masking out a splat value first.
361 if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
362 return Imm & 0xff00ff00U;
363
364 // The other splat is all that's left as an option.
365 assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
366 return Imm & 0x00ff00ffU;
367 }
368
369 inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
370 // Mask out the first hunk
371 Imm ^= getT2SOImmTwoPartFirst(Imm);
372 // Return what's left
373 assert (getT2SOImmVal(Imm) != -1 &&
374 "Unable to encode second part of T2 two part SO immediate");
375 return Imm;
376 }
377
378
379 //===--------------------------------------------------------------------===//
380 // Addressing Mode #2
381 //===--------------------------------------------------------------------===//
382 //
383 // This is used for most simple load/store instructions.
384 //
385 // addrmode2 := reg +/- reg shop imm
386 // addrmode2 := reg +/- imm12
387 //
388 // The first operand is always a Reg. The second operand is a reg if in
389 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
390 // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
391 // fourth operand 16-17 encodes the index mode.
392 //
393 // If this addressing mode is a frame index (before prolog/epilog insertion
394 // and code rewriting), this operand will have the form: FI#, reg0, <offs>
395 // with no shift amount for the frame offset.
396 //
397 inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
398 unsigned IdxMode = 0) {
399 assert(Imm12 < (1 << 12) && "Imm too large!");
400 bool isSub = Opc == sub;
401 return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
402 }
403 inline unsigned getAM2Offset(unsigned AM2Opc) {
404 return AM2Opc & ((1 << 12)-1);
405 }
406 inline AddrOpc getAM2Op(unsigned AM2Opc) {
407 return ((AM2Opc >> 12) & 1) ? sub : add;
408 }
409 inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
410 return (ShiftOpc)((AM2Opc >> 13) & 7);
411 }
412 inline unsigned getAM2IdxMode(unsigned AM2Opc) { return (AM2Opc >> 16); }
413
414 //===--------------------------------------------------------------------===//
415 // Addressing Mode #3
416 //===--------------------------------------------------------------------===//
417 //
418 // This is used for sign-extending loads, and load/store-pair instructions.
419 //
420 // addrmode3 := reg +/- reg
421 // addrmode3 := reg +/- imm8
422 //
423 // The first operand is always a Reg. The second operand is a reg if in
424 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
425 // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
426 // index mode.
427
428 /// getAM3Opc - This function encodes the addrmode3 opc field.
429 inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
430 unsigned IdxMode = 0) {
431 bool isSub = Opc == sub;
432 return ((int)isSub << 8) | Offset | (IdxMode << 9);
433 }
434 inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; }
435 inline AddrOpc getAM3Op(unsigned AM3Opc) {
436 return ((AM3Opc >> 8) & 1) ? sub : add;
437 }
438 inline unsigned getAM3IdxMode(unsigned AM3Opc) { return (AM3Opc >> 9); }
439
440 //===--------------------------------------------------------------------===//
441 // Addressing Mode #4
442 //===--------------------------------------------------------------------===//
443 //
444 // This is used for load / store multiple instructions.
445 //
446 // addrmode4 := reg, <mode>
447 //
448 // The four modes are:
449 // IA - Increment after
450 // IB - Increment before
451 // DA - Decrement after
452 // DB - Decrement before
453 // For VFP instructions, only the IA and DB modes are valid.
454
455 inline AMSubMode getAM4SubMode(unsigned Mode) {
456 return (AMSubMode)(Mode & 0x7);
457 }
458
459 inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; }
460
461 //===--------------------------------------------------------------------===//
462 // Addressing Mode #5
463 //===--------------------------------------------------------------------===//
464 //
465 // This is used for coprocessor instructions, such as FP load/stores.
466 //
467 // addrmode5 := reg +/- imm8*4
468 //
469 // The first operand is always a Reg. The second operand encodes the
470 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
471
472 /// getAM5Opc - This function encodes the addrmode5 opc field.
473 inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
474 bool isSub = Opc == sub;
475 return ((int)isSub << 8) | Offset;
476 }
477 inline unsigned char getAM5Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; }
478 inline AddrOpc getAM5Op(unsigned AM5Opc) {
479 return ((AM5Opc >> 8) & 1) ? sub : add;
480 }
481
482 //===--------------------------------------------------------------------===//
483 // Addressing Mode #5 FP16
484 //===--------------------------------------------------------------------===//
485 //
486 // This is used for coprocessor instructions, such as 16-bit FP load/stores.
487 //
488 // addrmode5fp16 := reg +/- imm8*2
489 //
490 // The first operand is always a Reg. The second operand encodes the
491 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
492
493 /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
494 inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
495 bool isSub = Opc == sub;
496 return ((int)isSub << 8) | Offset;
497 }
498 inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
499 return AM5Opc & 0xFF;
500 }
501 inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
502 return ((AM5Opc >> 8) & 1) ? sub : add;
503 }
504
505 //===--------------------------------------------------------------------===//
506 // Addressing Mode #6
507 //===--------------------------------------------------------------------===//
508 //
509 // This is used for NEON load / store instructions.
510 //
511 // addrmode6 := reg with optional alignment
512 //
513 // This is stored in two operands [regaddr, align]. The first is the
514 // address register. The second operand is the value of the alignment
515 // specifier in bytes or zero if no explicit alignment.
516 // Valid alignments depend on the specific instruction.
517
518 //===--------------------------------------------------------------------===//
519 // NEON Modified Immediates
520 //===--------------------------------------------------------------------===//
521 //
522 // Several NEON instructions (e.g., VMOV) take a "modified immediate"
523 // vector operand, where a small immediate encoded in the instruction
524 // specifies a full NEON vector value. These modified immediates are
525 // represented here as encoded integers. The low 8 bits hold the immediate
526 // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
527 // the "Cmode" field of the instruction. The interfaces below treat the
528 // Op and Cmode values as a single 5-bit value.
529
530 inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
531 return (OpCmode << 8) | Val;
532 }
533 inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
534 return (ModImm >> 8) & 0x1f;
535 }
536 inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; }
537
538 /// decodeNEONModImm - Decode a NEON modified immediate value into the
539 /// element value and the element size in bits. (If the element size is
540 /// smaller than the vector, it is splatted into all the elements.)
541 inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
542 unsigned OpCmode = getNEONModImmOpCmode(ModImm);
543 unsigned Imm8 = getNEONModImmVal(ModImm);
544 uint64_t Val = 0;
545
546 if (OpCmode == 0xe) {
547 // 8-bit vector elements
548 Val = Imm8;
549 EltBits = 8;
550 } else if ((OpCmode & 0xc) == 0x8) {
551 // 16-bit vector elements
552 unsigned ByteNum = (OpCmode & 0x6) >> 1;
553 Val = Imm8 << (8 * ByteNum);
554 EltBits = 16;
555 } else if ((OpCmode & 0x8) == 0) {
556 // 32-bit vector elements, zero with one byte set
557 unsigned ByteNum = (OpCmode & 0x6) >> 1;
558 Val = Imm8 << (8 * ByteNum);
559 EltBits = 32;
560 } else if ((OpCmode & 0xe) == 0xc) {
561 // 32-bit vector elements, one byte with low bits set
562 unsigned ByteNum = 1 + (OpCmode & 0x1);
563 Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
564 EltBits = 32;
565 } else if (OpCmode == 0x1e) {
566 // 64-bit vector elements
567 for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
568 if ((ModImm >> ByteNum) & 1)
569 Val |= (uint64_t)0xff << (8 * ByteNum);
570 }
571 EltBits = 64;
572 } else {
573 llvm_unreachable("Unsupported NEON immediate");
574 }
575 return Val;
576 }
577
578 // Generic validation for single-byte immediate (0X00, 00X0, etc).
579 inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
580 assert(Size >= 1 && Size <= 4 && "Invalid size");
581 unsigned count = 0;
582 for (unsigned i = 0; i < Size; ++i) {
583 if (Value & 0xff) count++;
584 Value >>= 8;
585 }
586 return count == 1;
587 }
588
589 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
590 inline bool isNEONi16splat(unsigned Value) {
591 if (Value > 0xffff)
592 return false;
593 // i16 value with set bits only in one byte X0 or 0X.
594 return Value == 0 || isNEONBytesplat(Value, 2);
595 }
596
597 // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
598 inline unsigned encodeNEONi16splat(unsigned Value) {
599 assert(isNEONi16splat(Value) && "Invalid NEON splat value");
600 if (Value >= 0x100)
601 Value = (Value >> 8) | 0xa00;
602 else
603 Value |= 0x800;
604 return Value;
605 }
606
607 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
608 inline bool isNEONi32splat(unsigned Value) {
609 // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
610 return Value == 0 || isNEONBytesplat(Value, 4);
611 }
612
613 /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
614 inline unsigned encodeNEONi32splat(unsigned Value) {
615 assert(isNEONi32splat(Value) && "Invalid NEON splat value");
616 if (Value >= 0x100 && Value <= 0xff00)
617 Value = (Value >> 8) | 0x200;
618 else if (Value > 0xffff && Value <= 0xff0000)
619 Value = (Value >> 16) | 0x400;
620 else if (Value > 0xffffff)
621 Value = (Value >> 24) | 0x600;
622 return Value;
623 }
624
625 //===--------------------------------------------------------------------===//
626 // Floating-point Immediates
627 //
628 inline float getFPImmFloat(unsigned Imm) {
629 // We expect an 8-bit binary encoding of a floating-point number here.
630 union {
631 uint32_t I;
632 float F;
633 } FPUnion;
634
635 uint8_t Sign = (Imm >> 7) & 0x1;
636 uint8_t Exp = (Imm >> 4) & 0x7;
637 uint8_t Mantissa = Imm & 0xf;
638
639 // 8-bit FP iEEEE Float Encoding
640 // abcd efgh aBbbbbbc defgh000 00000000 00000000
641 //
642 // where B = NOT(b);
643
644 FPUnion.I = 0;
645 FPUnion.I |= Sign << 31;
646 FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
647 FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
648 FPUnion.I |= (Exp & 0x3) << 23;
649 FPUnion.I |= Mantissa << 19;
650 return FPUnion.F;
651 }
652
653 /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
654 /// floating-point value. If the value cannot be represented as an 8-bit
655 /// floating-point value, then return -1.
656 inline int getFP16Imm(const APInt &Imm) {
657 uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
658 int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
659 int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
660
661 // We can handle 4 bits of mantissa.
662 // mantissa = (16+UInt(e:f:g:h))/16.
663 if (Mantissa & 0x3f)
664 return -1;
665 Mantissa >>= 6;
666
667 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
668 if (Exp < -3 || Exp > 4)
669 return -1;
670 Exp = ((Exp+3) & 0x7) ^ 4;
671
672 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
673 }
674
675 inline int getFP16Imm(const APFloat &FPImm) {
676 return getFP16Imm(FPImm.bitcastToAPInt());
677 }
678
679 /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
680 /// floating-point value. If the value cannot be represented as an 8-bit
681 /// floating-point value, then return -1.
682 inline int getFP32Imm(const APInt &Imm) {
683 uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
684 int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
685 int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
686
687 // We can handle 4 bits of mantissa.
688 // mantissa = (16+UInt(e:f:g:h))/16.
689 if (Mantissa & 0x7ffff)
690 return -1;
691 Mantissa >>= 19;
692 if ((Mantissa & 0xf) != Mantissa)
693 return -1;
694
695 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
696 if (Exp < -3 || Exp > 4)
697 return -1;
698 Exp = ((Exp+3) & 0x7) ^ 4;
699
700 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
701 }
702
703 inline int getFP32Imm(const APFloat &FPImm) {
704 return getFP32Imm(FPImm.bitcastToAPInt());
705 }
706
707 /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
708 /// floating-point value. If the value cannot be represented as an 8-bit
709 /// floating-point value, then return -1.
710 inline int getFP64Imm(const APInt &Imm) {
711 uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
712 int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
713 uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
714
715 // We can handle 4 bits of mantissa.
716 // mantissa = (16+UInt(e:f:g:h))/16.
717 if (Mantissa & 0xffffffffffffULL)
718 return -1;
719 Mantissa >>= 48;
720 if ((Mantissa & 0xf) != Mantissa)
721 return -1;
722
723 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
724 if (Exp < -3 || Exp > 4)
725 return -1;
726 Exp = ((Exp+3) & 0x7) ^ 4;
727
728 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
729 }
730
731 inline int getFP64Imm(const APFloat &FPImm) {
732 return getFP64Imm(FPImm.bitcastToAPInt());
733 }
734
735} // end namespace ARM_AM
736} // end namespace llvm
737
738#endif
739
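
For context on the flagged expression in isThumbImmShiftedVal (line 221 above), here is a hedged, standalone sketch of how a shift amount of 32 can be modeled: llvm::countTrailingZeros returns the bit width, 32, for a zero input under its default zero behaviour, and the analyzer conservatively assumes that value can reach the shift. The sketch uses no LLVM headers, assumes a compiler providing __builtin_ctz, and the names only mirror the report; the guard shown is purely illustrative and is not an upstream fix.

    // Plain C++ sketch; ctz32 models llvm::countTrailingZeros on a 32-bit value.
    static unsigned ctz32(unsigned x) {
      // countTrailingZeros(0) yields the bit width (32) under LLVM's default behaviour.
      return x == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(x));
    }

    static unsigned getThumbImmValShiftSketch(unsigned Imm) {
      if ((Imm & ~255U) == 0) return 0;   // small immediates need no shift
      return ctz32(Imm);                  // Imm != 0 on this path, so this stays < 32
    }

    static bool isThumbImmShiftedValSketch(unsigned V) {
      unsigned Amt = getThumbImmValShiftSketch(V);
      if (Amt >= 32)                      // defensive guard; would silence the warning
        return false;
      return ((~255U << Amt) & V) == 0;   // the shift the analyzer flags at line 221
    }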