/build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Bug Summary

File:	build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning:	line 2825, column 21 The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/source/llvm/lib/Target/AArch64 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

/build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

→

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//

9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "MCTargetDesc/AArch64AddressingModes.h"
13#include "llvm/Analysis/IVDescriptors.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/Analysis/TargetTransformInfo.h"
16#include "llvm/CodeGen/BasicTTIImpl.h"
17#include "llvm/CodeGen/CostTable.h"
18#include "llvm/CodeGen/TargetLowering.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
22#include "llvm/IR/PatternMatch.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/Transforms/InstCombine/InstCombiner.h"
25#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;

31#define DEBUG_TYPE"aarch64tti" "aarch64tti"

33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                             cl::init(true), cl::Hidden);

36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                         cl::Hidden);

39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                          cl::init(10), cl::Hidden);

42namespace {
43class TailFoldingKind {
44private:
uint8_t Bits = 0; // Currently defaults to disabled.

47public:
enum TailFoldingOpts {
  TFDisabled = 0x0,
  TFReductions = 0x01,
  TFRecurrences = 0x02,
  TFReverse = 0x04,
  TFSimple = 0x80,
  TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
};

void operator=(const std::string &Val) {
  if (Val.empty())
    return;
  SmallVector<StringRef, 6> TailFoldTypes;
  StringRef(Val).split(TailFoldTypes, '+', -1, false);
  for (auto TailFoldType : TailFoldTypes) {
    if (TailFoldType == "disabled")
      Bits = 0;
    else if (TailFoldType == "all")
      Bits = TFAll;
    else if (TailFoldType == "default")
      Bits = 0; // Currently defaults to never tail-folding.
    else if (TailFoldType == "simple")
      add(TFSimple);
    else if (TailFoldType == "reductions")
      add(TFReductions);
    else if (TailFoldType == "recurrences")
      add(TFRecurrences);
    else if (TailFoldType == "reverse")
      add(TFReverse);
    else if (TailFoldType == "noreductions")
      remove(TFReductions);
    else if (TailFoldType == "norecurrences")
      remove(TFRecurrences);
    else if (TailFoldType == "noreverse")
      remove(TFReverse);
    else {
      errs()
          << "invalid argument " << TailFoldType.str()
          << " to -sve-tail-folding=; each element must be one of: disabled, "
             "all, default, simple, reductions, noreductions, recurrences, "
             "norecurrences\n";
    }
  }
}

operator uint8_t() const { return Bits; }

void add(uint8_t Flag) { Bits |= Flag; }
void remove(uint8_t Flag) { Bits &= ~Flag; }
97};
98} // namespace

100TailFoldingKind TailFoldingKindLoc;

102cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
  "sve-tail-folding",
  cl::desc(
      "Control the use of vectorisation using tail-folding for SVE:"
      "\ndisabled    No loop types will vectorize using tail-folding"
      "\ndefault     Uses the default tail-folding settings for the target "
      "CPU"
      "\nall         All legal loop types will vectorize using tail-folding"
      "\nsimple      Use tail-folding for simple loops (not reductions or "
      "recurrences)"
      "\nreductions  Use tail-folding for loops containing reductions"
      "\nrecurrences Use tail-folding for loops containing fixed order "
      "recurrences"
      "\nreverse     Use tail-folding for loops requiring reversed "
      "predicates"),
  cl::location(TailFoldingKindLoc));

119// Experimental option that will only be fully functional when the
120// code-generator is changed to use SVE instead of NEON for all fixed-width
121// operations.
122static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
  "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

125// Experimental option that will only be fully functional when the cost-model
126// and code-generator have been changed to avoid using scalable vector
127// instructions that are not legal in streaming SVE mode.
128static cl::opt<bool> EnableScalableAutovecInStreamingMode(
  "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

131bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                       const Function *Callee) const {
SMEAttrs CallerAttrs(*Caller);
SMEAttrs CalleeAttrs(*Callee);
if (CallerAttrs.requiresSMChange(CalleeAttrs,
                                 /*BodyOverridesInterface=*/true) ||
    CallerAttrs.requiresLazySave(CalleeAttrs) ||
    CalleeAttrs.hasNewZAInterface())
  return false;

const TargetMachine &TM = getTLI()->getTargetMachine();

const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

// Inline a callee if its target-features are a subset of the callers
// target-features.
return (CallerBits & CalleeBits) == CalleeBits;
151}

153bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
  TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar)(static_cast <bool> (K != TargetTransformInfo::RGK_Scalar
) ? void (0) : __assert_fail ("K != TargetTransformInfo::RGK_Scalar"
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 155
, __extension__ __PRETTY_FUNCTION__));
return K == TargetTransformInfo::RGK_FixedWidthVector;
157}

159/// Calculate the cost of materializing a 64-bit value. This helper
160/// method might only calculate a fraction of a larger immediate. Therefore it
161/// is valid to return a cost of ZERO.
162InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
  return 0;

if (Val < 0)
  Val = ~Val;

// Calculate how many moves we will need to materialize this constant.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(Val, 64, Insn);
return Insn.size();
174}

176/// Calculate the cost of materializing the given constant.
177InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
 __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp"
, 179, __extension__ __PRETTY_FUNCTION__));

unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
  return ~0U;

// Sign-extend all constants to a multiple of 64-bit.
APInt ImmVal = Imm;
if (BitSize & 0x3f)
  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
InstructionCost Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
  int64_t Val = Tmp.getSExtValue();
  Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialze the constant.
return std::max<InstructionCost>(1, Cost);
200}

202InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
 __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp"
, 206, __extension__ __PRETTY_FUNCTION__));

unsigned BitSize = Ty->getPrimitiveSizeInBits();
// There is no cost model for constants with a bit size of 0. Return TCC_Free
// here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
  return TTI::TCC_Free;

unsigned ImmIdx = ~0U;
switch (Opcode) {
default:
  return TTI::TCC_Free;
case Instruction::GetElementPtr:
  // Always hoist the base address of a GetElementPtr.
  if (Idx == 0)
    return 2 * TTI::TCC_Basic;
  return TTI::TCC_Free;
case Instruction::Store:
  ImmIdx = 0;
  break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
  ImmIdx = 1;
  break;
// Always return TCC_Free for the shift value of a shift instruction.
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
  if (Idx == 1)
    return TTI::TCC_Free;
  break;
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::IntToPtr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::PHI:
case Instruction::Call:
case Instruction::Select:
case Instruction::Ret:
case Instruction::Load:
  break;
}

if (Idx == ImmIdx) {
  int NumConstants = (BitSize + 63) / 64;
  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  return (Cost <= NumConstants * TTI::TCC_Basic)
             ? static_cast<int>(TTI::TCC_Free)
             : Cost;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
268}

270InstructionCost
271AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) :
 __assert_fail ("Ty->isIntegerTy()", "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp"
, 274, __extension__ __PRETTY_FUNCTION__));

unsigned BitSize = Ty->getPrimitiveSizeInBits();
// There is no cost model for constants with a bit size of 0. Return TCC_Free
// here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
  return TTI::TCC_Free;

// Most (all?) AArch64 intrinsics do not support folding immediates into the
// selected instruction, so we compute the materialization cost for the
// immediate directly.
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

switch (IID) {
default:
  return TTI::TCC_Free;
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
  if (Idx == 1) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  break;
case Intrinsic::experimental_stackmap:
  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
    return TTI::TCC_Free;
  break;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
    return TTI::TCC_Free;
  break;
case Intrinsic::experimental_gc_statepoint:
  if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
    return TTI::TCC_Free;
  break;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
320}

322TargetTransformInfo::PopcntSupportKind
323AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")(static_cast <bool> (isPowerOf2_32(TyWidth) && "Ty width must be power of 2"
) ? void (0) : __assert_fail ("isPowerOf2_32(TyWidth) && \"Ty width must be power of 2\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 324
, __extension__ __PRETTY_FUNCTION__));
if (TyWidth == 32 || TyWidth == 64)
  return TTI::PSK_FastHardware;
// TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
return TTI::PSK_Software;
329}

331InstructionCost
332AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
auto *RetTy = ICA.getReturnType();
switch (ICA.getID()) {
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
case Intrinsic::smax: {
  static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                      MVT::v8i16, MVT::v2i32, MVT::v4i32};
  auto LT = getTypeLegalizationCost(RetTy);
  // v2i64 types get converted to cmp+bif hence the cost of 2
  if (LT.second == MVT::v2i64)
    return LT.first * 2;
  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
    return LT.first;
  break;
}
case Intrinsic::sadd_sat:
case Intrinsic::ssub_sat:
case Intrinsic::uadd_sat:
case Intrinsic::usub_sat: {
  static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                   MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                   MVT::v2i64};
  auto LT = getTypeLegalizationCost(RetTy);
  // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
  // need to extend the type, as it uses shr(qadd(shl, shl)).
  unsigned Instrs =
      LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
    return LT.first * Instrs;
  break;
}
case Intrinsic::abs: {
  static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                   MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                   MVT::v2i64};
  auto LT = getTypeLegalizationCost(RetTy);
  if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
    return LT.first;
  break;
}
case Intrinsic::experimental_stepvector: {
  InstructionCost Cost = 1; // Cost of the `index' instruction
  auto LT = getTypeLegalizationCost(RetTy);
  // Legalisation of illegal vectors involves an `index' instruction plus
  // (LT.first - 1) vector adds.
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
    InstructionCost AddCost =
        getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
    Cost += AddCost * (LT.first - 1);
  }
  return Cost;
}
case Intrinsic::bitreverse: {
  static const CostTblEntry BitreverseTbl[] = {
      {Intrinsic::bitreverse, MVT::i32, 1},
      {Intrinsic::bitreverse, MVT::i64, 1},
      {Intrinsic::bitreverse, MVT::v8i8, 1},
      {Intrinsic::bitreverse, MVT::v16i8, 1},
      {Intrinsic::bitreverse, MVT::v4i16, 2},
      {Intrinsic::bitreverse, MVT::v8i16, 2},
      {Intrinsic::bitreverse, MVT::v2i32, 2},
      {Intrinsic::bitreverse, MVT::v4i32, 2},
      {Intrinsic::bitreverse, MVT::v1i64, 2},
      {Intrinsic::bitreverse, MVT::v2i64, 2},
  };
  const auto LegalisationCost = getTypeLegalizationCost(RetTy);
  const auto *Entry =
      CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
  if (Entry) {
    // Cost Model is using the legal type(i32) that i8 and i16 will be
    // converted to +1 so that we match the actual lowering cost
    if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
        TLI->getValueType(DL, RetTy, true) == MVT::i16)
      return LegalisationCost.first * Entry->Cost + 1;

    return LegalisationCost.first * Entry->Cost;
  }
  break;
}
case Intrinsic::ctpop: {
  if (!ST->hasNEON()) {
    // 32-bit or 64-bit ctpop without NEON is 12 instructions.
    return getTypeLegalizationCost(RetTy).first * 12;
  }
  static const CostTblEntry CtpopCostTbl[] = {
      {ISD::CTPOP, MVT::v2i64, 4},
      {ISD::CTPOP, MVT::v4i32, 3},
      {ISD::CTPOP, MVT::v8i16, 2},
      {ISD::CTPOP, MVT::v16i8, 1},
      {ISD::CTPOP, MVT::i64,   4},
      {ISD::CTPOP, MVT::v2i32, 3},
      {ISD::CTPOP, MVT::v4i16, 2},
      {ISD::CTPOP, MVT::v8i8,  1},
      {ISD::CTPOP, MVT::i32,   5},
  };
  auto LT = getTypeLegalizationCost(RetTy);
  MVT MTy = LT.second;
  if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
    // Extra cost of +1 when illegal vector types are legalized by promoting
    // the integer type.
    int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                          RetTy->getScalarSizeInBits()
                        ? 1
                        : 0;
    return LT.first * Entry->Cost + ExtraCost;
  }
  break;
}
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {
  static const CostTblEntry WithOverflowCostTbl[] = {
      {Intrinsic::sadd_with_overflow, MVT::i8, 3},
      {Intrinsic::uadd_with_overflow, MVT::i8, 3},
      {Intrinsic::sadd_with_overflow, MVT::i16, 3},
      {Intrinsic::uadd_with_overflow, MVT::i16, 3},
      {Intrinsic::sadd_with_overflow, MVT::i32, 1},
      {Intrinsic::uadd_with_overflow, MVT::i32, 1},
      {Intrinsic::sadd_with_overflow, MVT::i64, 1},
      {Intrinsic::uadd_with_overflow, MVT::i64, 1},
      {Intrinsic::ssub_with_overflow, MVT::i8, 3},
      {Intrinsic::usub_with_overflow, MVT::i8, 3},
      {Intrinsic::ssub_with_overflow, MVT::i16, 3},
      {Intrinsic::usub_with_overflow, MVT::i16, 3},
      {Intrinsic::ssub_with_overflow, MVT::i32, 1},
      {Intrinsic::usub_with_overflow, MVT::i32, 1},
      {Intrinsic::ssub_with_overflow, MVT::i64, 1},
      {Intrinsic::usub_with_overflow, MVT::i64, 1},
      {Intrinsic::smul_with_overflow, MVT::i8, 5},
      {Intrinsic::umul_with_overflow, MVT::i8, 4},
      {Intrinsic::smul_with_overflow, MVT::i16, 5},
      {Intrinsic::umul_with_overflow, MVT::i16, 4},
      {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
      {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
      {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
      {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
  };
  EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
  if (MTy.isSimple())
    if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                            MTy.getSimpleVT()))
      return Entry->Cost;
  break;
}
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat: {
  if (ICA.getArgTypes().empty())
    break;
  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
  auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
  EVT MTy = TLI->getValueType(DL, RetTy);
  // Check for the legal types, which are where the size of the input and the
  // output are the same, or we are using cvt f64->i32 or f32->i64.
  if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
       LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
       LT.second == MVT::v2f64) &&
      (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
       (LT.second == MVT::f64 && MTy == MVT::i32) ||
       (LT.second == MVT::f32 && MTy == MVT::i64)))
    return LT.first;
  // Similarly for fp16 sizes
  if (ST->hasFullFP16() &&
      ((LT.second == MVT::f16 && MTy == MVT::i32) ||
       ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
    return LT.first;

  // Otherwise we use a legal convert followed by a min+max
  if ((LT.second.getScalarType() == MVT::f32 ||
       LT.second.getScalarType() == MVT::f64 ||
       (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
      LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
    Type *LegalTy =
        Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
    if (LT.second.isVector())
      LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
    InstructionCost Cost = 1;
    IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                  LegalTy, {LegalTy, LegalTy});
    Cost += getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                  LegalTy, {LegalTy, LegalTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    return LT.first * Cost;
  }
  break;
}
case Intrinsic::fshl:
case Intrinsic::fshr: {
  if (ICA.getArgs().empty())
    break;

  // TODO: Add handling for fshl where third argument is not a constant.
  const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
  if (!OpInfoZ.isConstant())
    break;

  const auto LegalisationCost = getTypeLegalizationCost(RetTy);
  if (OpInfoZ.isUniform()) {
    // FIXME: The costs could be lower if the codegen is better.
    static const CostTblEntry FshlTbl[] = {
        {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
        {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
        {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
        {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
    // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
    // to avoid having to duplicate the costs.
    const auto *Entry =
        CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
    if (Entry)
      return LegalisationCost.first * Entry->Cost;
  }

  auto TyL = getTypeLegalizationCost(RetTy);
  if (!RetTy->isIntegerTy())
    break;

  // Estimate cost manually, as types like i8 and i16 will get promoted to
  // i32 and CostTableLookup will ignore the extra conversion cost.
  bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                     RetTy->getScalarSizeInBits() < 64) ||
                    (RetTy->getScalarSizeInBits() % 64 != 0);
  unsigned ExtraCost = HigherCost ? 1 : 0;
  if (RetTy->getScalarSizeInBits() == 32 ||
      RetTy->getScalarSizeInBits() == 64)
    ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
                   // extr instruction.
  else if (HigherCost)
    ExtraCost = 1;
  else
    break;
  return TyL.first + ExtraCost;
}
default:
  break;
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
576}

578/// The function will remove redundant reinterprets casting in the presence
579/// of the control flow
580static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                 IntrinsicInst &II) {
SmallVector<Instruction *, 32> Worklist;
auto RequiredType = II.getType();

auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
assert(PN && "Expected Phi Node!")(static_cast <bool> (PN && "Expected Phi Node!"
) ? void (0) : __assert_fail ("PN && \"Expected Phi Node!\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 586
, __extension__ __PRETTY_FUNCTION__));

// Don't create a new Phi unless we can remove the old one.
if (!PN->hasOneUse())
  return std::nullopt;

for (Value *IncValPhi : PN->incoming_values()) {
  auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
  if (!Reinterpret ||
      Reinterpret->getIntrinsicID() !=
          Intrinsic::aarch64_sve_convert_to_svbool ||
      RequiredType != Reinterpret->getArgOperand(0)->getType())
    return std::nullopt;
}

// Create the new Phi
LLVMContext &Ctx = PN->getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(PN);
PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
Worklist.push_back(PN);

for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
  auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
  NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
  Worklist.push_back(Reinterpret);
}

// Cleanup Phi Node and reinterprets
return IC.replaceInstUsesWith(II, NPN);
616}

618// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
619// => (binop (pred) (from_svbool _) (from_svbool _))
620//
621// The above transformation eliminates a `to_svbool` in the predicate
622// operand of bitwise operation `binop` by narrowing the vector width of
623// the operation. For example, it would convert a `<vscale x 16 x i1>
624// and` into a `<vscale x 4 x i1> and`. This is profitable because
625// to_svbool must zero the new lanes during widening, whereas
626// from_svbool is free.
627static std::optional<Instruction *>
628tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
if (!BinOp)
  return std::nullopt;

auto IntrinsicID = BinOp->getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::aarch64_sve_and_z:
case Intrinsic::aarch64_sve_bic_z:
case Intrinsic::aarch64_sve_eor_z:
case Intrinsic::aarch64_sve_nand_z:
case Intrinsic::aarch64_sve_nor_z:
case Intrinsic::aarch64_sve_orn_z:
case Intrinsic::aarch64_sve_orr_z:
  break;
default:
  return std::nullopt;
}

auto BinOpPred = BinOp->getOperand(0);
auto BinOpOp1 = BinOp->getOperand(1);
auto BinOpOp2 = BinOp->getOperand(2);

auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
if (!PredIntr ||
    PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
  return std::nullopt;

auto PredOp = PredIntr->getOperand(0);
auto PredOpTy = cast<VectorType>(PredOp->getType());
if (PredOpTy != II.getType())
  return std::nullopt;

IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
    Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
if (BinOpOp1 == BinOpOp2)
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
else
  NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

auto NarrowedBinOp =
    Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
return IC.replaceInstUsesWith(II, NarrowedBinOp);
677}

679static std::optional<Instruction *>
680instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
// If the reinterpret instruction operand is a PHI Node
if (isa<PHINode>(II.getArgOperand(0)))
  return processPhiNode(IC, II);

if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
  return BinOpCombine;

SmallVector<Instruction *, 32> CandidatesForRemoval;
Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

const auto *IVTy = cast<VectorType>(II.getType());

// Walk the chain of conversions.
while (Cursor) {
  // If the type of the cursor has fewer lanes than the final result, zeroing
  // must take place, which breaks the equivalence chain.
  const auto *CursorVTy = cast<VectorType>(Cursor->getType());
  if (CursorVTy->getElementCount().getKnownMinValue() <
      IVTy->getElementCount().getKnownMinValue())
    break;

  // If the cursor has the same type as I, it is a viable replacement.
  if (Cursor->getType() == IVTy)
    EarliestReplacement = Cursor;

  auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

  // If this is not an SVE conversion intrinsic, this is the end of the chain.
  if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                Intrinsic::aarch64_sve_convert_to_svbool ||
                            IntrinsicCursor->getIntrinsicID() ==
                                Intrinsic::aarch64_sve_convert_from_svbool))
    break;

  CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
  Cursor = IntrinsicCursor->getOperand(0);
}

// If no viable replacement in the conversion chain was found, there is
// nothing to do.
if (!EarliestReplacement)
  return std::nullopt;

return IC.replaceInstUsesWith(II, EarliestReplacement);
725}

727static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                    IntrinsicInst &II) {
IRBuilder<> Builder(&II);
auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
                                   II.getOperand(2));
return IC.replaceInstUsesWith(II, Select);
733}

735static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                    IntrinsicInst &II) {
IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
if (!Pg)
  return std::nullopt;

if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  return std::nullopt;

const auto PTruePattern =
    cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
if (PTruePattern != AArch64SVEPredPattern::vl1)
  return std::nullopt;

// The intrinsic is inserting into lane zero so use an insert instead.
auto *IdxTy = Type::getInt64Ty(II.getContext());
auto *Insert = InsertElementInst::Create(
    II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
Insert->insertBefore(&II);
Insert->takeName(&II);

return IC.replaceInstUsesWith(II, Insert);
757}

759static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
                                                     IntrinsicInst &II) {
// Replace DupX with a regular IR splat.
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
auto *RetTy = cast<ScalableVectorType>(II.getType());
Value *Splat =
    Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
Splat->takeName(&II);
return IC.replaceInstUsesWith(II, Splat);
769}

771static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                      IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&II);

// Check that the predicate is all active
auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  return std::nullopt;

const auto PTruePattern =
    cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
if (PTruePattern != AArch64SVEPredPattern::all)
  return std::nullopt;

// Check that we have a compare of zero..
auto *SplatValue =
    dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
if (!SplatValue || !SplatValue->isZero())
  return std::nullopt;

// ..against a dupq
auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
if (!DupQLane ||
    DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
  return std::nullopt;

// Where the dupq is a lane 0 replicate of a vector insert
if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
  return std::nullopt;

auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
  return std::nullopt;

// Where the vector insert is a fixed constant vector insert into undef at
// index zero
if (!isa<UndefValue>(VecIns->getArgOperand(0)))
  return std::nullopt;

if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
  return std::nullopt;

auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
if (!ConstVec)
  return std::nullopt;

auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
  return std::nullopt;

unsigned NumElts = VecTy->getNumElements();
unsigned PredicateBits = 0;

// Expand intrinsic operands to a 16-bit byte level predicate
for (unsigned I = 0; I < NumElts; ++I) {
  auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
  if (!Arg)
    return std::nullopt;
  if (!Arg->isZero())
    PredicateBits |= 1 << (I * (16 / NumElts));
}

// If all bits are zero bail early with an empty predicate
if (PredicateBits == 0) {
  auto *PFalse = Constant::getNullValue(II.getType());
  PFalse->takeName(&II);
  return IC.replaceInstUsesWith(II, PFalse);
}

// Calculate largest predicate type used (where byte predicate is largest)
unsigned Mask = 8;
for (unsigned I = 0; I < 16; ++I)
  if ((PredicateBits & (1 << I)) != 0)
    Mask |= (I % 8);

unsigned PredSize = Mask & -Mask;
auto *PredType = ScalableVectorType::get(
    Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

// Ensure all relevant bits are set
for (unsigned I = 0; I < 16; I += PredSize)
  if ((PredicateBits & (1 << I)) == 0)
    return std::nullopt;

auto *PTruePat =
    ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                      {PredType}, {PTruePat});
auto *ConvertToSVBool = Builder.CreateIntrinsic(
    Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
auto *ConvertFromSVBool =
    Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                            {II.getType()}, {ConvertToSVBool});

ConvertFromSVBool->takeName(&II);
return IC.replaceInstUsesWith(II, ConvertFromSVBool);
870}

872static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                     IntrinsicInst &II) {
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Value *Pg = II.getArgOperand(0);
Value *Vec = II.getArgOperand(1);
auto IntrinsicID = II.getIntrinsicID();
bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

// lastX(splat(X)) --> X
if (auto *SplatVal = getSplatValue(Vec))
  return IC.replaceInstUsesWith(II, SplatVal);

// If x and/or y is a splat value then:
// lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
Value *LHS, *RHS;
if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
  if (isSplatValue(LHS) || isSplatValue(RHS)) {
    auto *OldBinOp = cast<BinaryOperator>(Vec);
    auto OpC = OldBinOp->getOpcode();
    auto *NewLHS =
        Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
    auto *NewRHS =
        Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
    auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
        OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
    return IC.replaceInstUsesWith(II, NewBinOp);
  }
}

auto *C = dyn_cast<Constant>(Pg);
if (IsAfter && C && C->isNullValue()) {
  // The intrinsic is extracting lane 0 so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}

auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
if (!IntrPG)
  return std::nullopt;

if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
  return std::nullopt;

const auto PTruePattern =
    cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

// Can the intrinsic's predicate be converted to a known constant index?
unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
if (!MinNumElts)
  return std::nullopt;

unsigned Idx = MinNumElts - 1;
// Increment the index if extracting the element after the last active
// predicate element.
if (IsAfter)
  ++Idx;

// Ignore extracts whose index is larger than the known minimum vector
// length. NOTE: This is an artificial constraint where we prefer to
// maintain what the user asked for until an alternative is proven faster.
auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
if (Idx >= PgVTy->getMinNumElements())
  return std::nullopt;

// The intrinsic is extracting a fixed lane so use an extract instead.
auto *IdxTy = Type::getInt64Ty(II.getContext());
auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
Extract->insertBefore(&II);
Extract->takeName(&II);
return IC.replaceInstUsesWith(II, Extract);
946}

948static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
                                                         IntrinsicInst &II) {
// The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
// integer variant across a variety of micro-architectures. Replace scalar
// integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
// bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
// depending on the micro-architecture, but has been observed as generally
// being faster, particularly when the CLAST[AB] op is a loop-carried
// dependency.
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Value *Pg = II.getArgOperand(0);
Value *Fallback = II.getArgOperand(1);
Value *Vec = II.getArgOperand(2);
Type *Ty = II.getType();

if (!Ty->isIntegerTy())
  return std::nullopt;

Type *FPTy;
switch (cast<IntegerType>(Ty)->getBitWidth()) {
default:
  return std::nullopt;
case 16:
  FPTy = Builder.getHalfTy();
  break;
case 32:
  FPTy = Builder.getFloatTy();
  break;
case 64:
  FPTy = Builder.getDoubleTy();
  break;
}

Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
auto *FPVTy = VectorType::get(
    FPTy, cast<VectorType>(Vec->getType())->getElementCount());
Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
                                     {Pg, FPFallBack, FPVec});
Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
return IC.replaceInstUsesWith(II, FPIItoInt);
990}

992static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                   IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&II);
// Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
// can work with RDFFR_PP for ptest elimination.
auto *AllPat =
    ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                      {II.getType()}, {AllPat});
auto *RDFFR =
    Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
RDFFR->takeName(&II);
return IC.replaceInstUsesWith(II, RDFFR);
1007}

1009static std::optional<Instruction *>
1010instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

if (Pattern == AArch64SVEPredPattern::all) {
  LLVMContext &Ctx = II.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&II);

  Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
  auto *VScale = Builder.CreateVScale(StepVal);
  VScale->takeName(&II);
  return IC.replaceInstUsesWith(II, VScale);
}

unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

return MinNumElts && NumElts >= MinNumElts
           ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                 II, ConstantInt::get(II.getType(), MinNumElts)))
           : std::nullopt;
1030}

1032static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                      IntrinsicInst &II) {
Value *PgVal = II.getArgOperand(0);
Value *OpVal = II.getArgOperand(1);

IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

// PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
// Later optimizations prefer this form.
if (PgVal == OpVal &&
    (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
     II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
  Value *Ops[] = {PgVal, OpVal};
  Type *Tys[] = {PgVal->getType()};

  auto *PTest =
      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
  PTest->takeName(&II);

  return IC.replaceInstUsesWith(II, PTest);
}

IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

if (!Pg || !Op)
  return std::nullopt;

Intrinsic::ID OpIID = Op->getIntrinsicID();

if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
    OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
    Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
  Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
  Type *Tys[] = {Pg->getArgOperand(0)->getType()};

  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

  PTest->takeName(&II);
  return IC.replaceInstUsesWith(II, PTest);
}

// Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
// Later optimizations may rewrite sequence to use the flag-setting variant
// of instruction X to remove PTEST.
if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
    ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
     (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
     (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
     (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
     (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
     (OpIID == Intrinsic::aarch64_sve_and_z) ||
     (OpIID == Intrinsic::aarch64_sve_bic_z) ||
     (OpIID == Intrinsic::aarch64_sve_eor_z) ||
     (OpIID == Intrinsic::aarch64_sve_nand_z) ||
     (OpIID == Intrinsic::aarch64_sve_nor_z) ||
     (OpIID == Intrinsic::aarch64_sve_orn_z) ||
     (OpIID == Intrinsic::aarch64_sve_orr_z))) {
  Value *Ops[] = {Pg->getArgOperand(0), Pg};
  Type *Tys[] = {Pg->getType()};

  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
  PTest->takeName(&II);

  return IC.replaceInstUsesWith(II, PTest);
}

return std::nullopt;
1101}

1103template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1104static std::optional<Instruction *>
1105instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                bool MergeIntoAddendOp) {
Value *P = II.getOperand(0);
Value *MulOp0, *MulOp1, *AddendOp, *Mul;
if (MergeIntoAddendOp) {
  AddendOp = II.getOperand(1);
  Mul = II.getOperand(2);
} else {
  AddendOp = II.getOperand(2);
  Mul = II.getOperand(1);
}

if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                    m_Value(MulOp1))))
  return std::nullopt;

if (!Mul->hasOneUse())
  return std::nullopt;

Instruction *FMFSource = nullptr;
if (II.getType()->isFPOrFPVectorTy()) {
  llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
  // Stop the combine when the flags on the inputs differ in case dropping
  // flags would lead to us missing out on more beneficial optimizations.
  if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
    return std::nullopt;
  if (!FAddFlags.allowContract())
    return std::nullopt;
  FMFSource = &II;
}

IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

CallInst *Res;
if (MergeIntoAddendOp)
  Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                {P, AddendOp, MulOp0, MulOp1}, FMFSource);
else
  Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                {P, MulOp0, MulOp1, AddendOp}, FMFSource);

return IC.replaceInstUsesWith(II, Res);
1148}

1150static bool isAllActivePredicate(Value *Pred) {
// Look through convert.from.svbool(convert.to.svbool(...) chain.
Value *UncastedPred;
if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                    m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                        m_Value(UncastedPred)))))
  // If the predicate has the same or less lanes than the uncasted
  // predicate then we know the casting has no effect.
  if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
      cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
    Pred = UncastedPred;

return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                       m_ConstantInt<AArch64SVEPredPattern::all>()));
1164}

1166static std::optional<Instruction *>
1167instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

Value *Pred = II.getOperand(0);
Value *PtrOp = II.getOperand(1);
Type *VecTy = II.getType();
Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());

if (isAllActivePredicate(Pred)) {
  LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
  Load->copyMetadata(II);
  return IC.replaceInstUsesWith(II, Load);
}

CallInst *MaskedLoad =
    Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
                             Pred, ConstantAggregateZero::get(VecTy));
MaskedLoad->copyMetadata(II);
return IC.replaceInstUsesWith(II, MaskedLoad);
1187}

1189static std::optional<Instruction *>
1190instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

Value *VecOp = II.getOperand(0);
Value *Pred = II.getOperand(1);
Value *PtrOp = II.getOperand(2);
Value *VecPtr =
    Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());

if (isAllActivePredicate(Pred)) {
  StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
  Store->copyMetadata(II);
  return IC.eraseInstFromFunction(II);
}

CallInst *MaskedStore = Builder.CreateMaskedStore(
    VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
MaskedStore->copyMetadata(II);
return IC.eraseInstFromFunction(II);
1210}

1212static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul:
  return Instruction::BinaryOps::FMul;
case Intrinsic::aarch64_sve_fadd:
  return Instruction::BinaryOps::FAdd;
case Intrinsic::aarch64_sve_fsub:
  return Instruction::BinaryOps::FSub;
default:
  return Instruction::BinaryOpsEnd;
}
1223}

1225static std::optional<Instruction *>
1226instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
// Bail due to missing support for ISD::STRICT_ scalable vector operations.
if (II.isStrictFP())
  return std::nullopt;

auto *OpPredicate = II.getOperand(0);
auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
if (BinOpCode == Instruction::BinaryOpsEnd ||
    !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                            m_ConstantInt<AArch64SVEPredPattern::all>())))
  return std::nullopt;
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Builder.setFastMathFlags(II.getFastMathFlags());
auto BinOp =
    Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
return IC.replaceInstUsesWith(II, BinOp);
1243}

1245static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                          IntrinsicInst &II) {
if (auto FMLA =
        instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                          Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                       true))
  return FMLA;
if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                 Intrinsic::aarch64_sve_mla>(
        IC, II, true))
  return MLA;
if (auto FMAD =
        instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                          Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                       false))
  return FMAD;
if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                 Intrinsic::aarch64_sve_mad>(
        IC, II, false))
  return MAD;
return instCombineSVEVectorBinOp(IC, II);
1266}

1268static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                                          IntrinsicInst &II) {
if (auto FMLS =
        instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                          Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                       true))
  return FMLS;
if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                 Intrinsic::aarch64_sve_mls>(
        IC, II, true))
  return MLS;
if (auto FMSB =
        instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                          Intrinsic::aarch64_sve_fnmsb>(
            IC, II, false))
  return FMSB;
return instCombineSVEVectorBinOp(IC, II);
1285}

1287static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                          IntrinsicInst &II) {
auto *OpPredicate = II.getOperand(0);
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);

IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);

// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
  auto *SplatValue = getSplatValue(I);
  if (!SplatValue)
    return false;
  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};

// Return true if a given instruction is an aarch64_sve_dup intrinsic call
// with a unit splat value, false otherwise.
auto IsUnitDup = [](auto *I) {
  auto *IntrI = dyn_cast<IntrinsicInst>(I);
  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
    return false;

  auto *SplatValue = IntrI->getOperand(2);
  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};

if (IsUnitSplat(OpMultiplier)) {
  // [f]mul pg %n, (dupx 1) => %n
  OpMultiplicand->takeName(&II);
  return IC.replaceInstUsesWith(II, OpMultiplicand);
} else if (IsUnitDup(OpMultiplier)) {
  // [f]mul pg %n, (dup pg 1) => %n
  auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
  auto *DupPg = DupInst->getOperand(1);
  // TODO: this is naive. The optimization is still valid if DupPg
  // 'encompasses' OpPredicate, not only if they're the same predicate.
  if (OpPredicate == DupPg) {
    OpMultiplicand->takeName(&II);
    return IC.replaceInstUsesWith(II, OpMultiplicand);
  }
}

return instCombineSVEVectorBinOp(IC, II);
1332}

1334static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
                                                       IntrinsicInst &II) {
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Value *UnpackArg = II.getArgOperand(0);
auto *RetTy = cast<ScalableVectorType>(II.getType());
bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;

// Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
// Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
if (auto *ScalarArg = getSplatValue(UnpackArg)) {
  ScalarArg =
      Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
  Value *NewVal =
      Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
  NewVal->takeName(&II);
  return IC.replaceInstUsesWith(II, NewVal);
}

return std::nullopt;
1355}
1356static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
                                                    IntrinsicInst &II) {
auto *OpVal = II.getOperand(0);
auto *OpIndices = II.getOperand(1);
VectorType *VTy = cast<VectorType>(II.getType());

// Check whether OpIndices is a constant splat value < minimal element count
// of result.
auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
if (!SplatValue ||
    SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
  return std::nullopt;

// Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
// splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
auto *VectorSplat =
    Builder.CreateVectorSplat(VTy->getElementCount(), Extract);

VectorSplat->takeName(&II);
return IC.replaceInstUsesWith(II, VectorSplat);
1379}

1381static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                    IntrinsicInst &II) {
// zip1(uzp1(A, B), uzp2(A, B)) --> A
// zip2(uzp1(A, B), uzp2(A, B)) --> B
Value *A, *B;
if (match(II.getArgOperand(0),
          m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
    match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
                                   m_Specific(A), m_Specific(B))))
  return IC.replaceInstUsesWith(
      II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

return std::nullopt;
1394}

1396static std::optional<Instruction *>
1397instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Mask = II.getOperand(0);
Value *BasePtr = II.getOperand(1);
Value *Index = II.getOperand(2);
Type *Ty = II.getType();
Value *PassThru = ConstantAggregateZero::get(Ty);

// Contiguous gather => masked load.
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
Value *IndexBase;
if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                     m_Value(IndexBase), m_SpecificInt(1)))) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  Align Alignment =
      BasePtr->getPointerAlignment(II.getModule()->getDataLayout());

  Type *VecPtrTy = PointerType::getUnqual(Ty);
  Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                 BasePtr, IndexBase);
  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
  CallInst *MaskedLoad =
      Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
  MaskedLoad->takeName(&II);
  return IC.replaceInstUsesWith(II, MaskedLoad);
}

return std::nullopt;
1427}

1429static std::optional<Instruction *>
1430instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Val = II.getOperand(0);
Value *Mask = II.getOperand(1);
Value *BasePtr = II.getOperand(2);
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();

// Contiguous scatter => masked store.
// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
Value *IndexBase;
if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                     m_Value(IndexBase), m_SpecificInt(1)))) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  Align Alignment =
      BasePtr->getPointerAlignment(II.getModule()->getDataLayout());

  Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                 BasePtr, IndexBase);
  Type *VecPtrTy = PointerType::getUnqual(Ty);
  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);

  (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);

  return IC.eraseInstFromFunction(II);
}

return std::nullopt;
1460}

1462static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
                                                     IntrinsicInst &II) {
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Type *Int32Ty = Builder.getInt32Ty();
Value *Pred = II.getOperand(0);
Value *Vec = II.getOperand(1);
Value *DivVec = II.getOperand(2);

Value *SplatValue = getSplatValue(DivVec);
ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
if (!SplatConstantInt)
  return std::nullopt;
APInt Divisor = SplatConstantInt->getValue();

if (Divisor.isPowerOf2()) {
  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
  auto ASRD = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
  return IC.replaceInstUsesWith(II, ASRD);
}
if (Divisor.isNegatedPowerOf2()) {
  Divisor.negate();
  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
  auto ASRD = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
  auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
                                     {ASRD->getType()}, {ASRD, Pred, ASRD});
  return IC.replaceInstUsesWith(II, NEG);
}

return std::nullopt;
1494}

1496bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
size_t VecSize = Vec.size();
if (VecSize == 1)
  return true;
if (!isPowerOf2_64(VecSize))
  return false;
size_t HalfVecSize = VecSize / 2;

for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
     RHS != Vec.end(); LHS++, RHS++) {
  if (*LHS != nullptr && *RHS != nullptr) {
    if (*LHS == *RHS)
      continue;
    else
      return false;
  }
  if (!AllowPoison)
    return false;
  if (*LHS == nullptr && *RHS != nullptr)
    *LHS = *RHS;
}

Vec.resize(HalfVecSize);
SimplifyValuePattern(Vec, AllowPoison);
return true;
1521}

1523// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1524// to dupqlane(f64(C)) where C is A concatenated with B
1525static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                         IntrinsicInst &II) {
Value *CurrentInsertElt = nullptr, *Default = nullptr;
if (!match(II.getOperand(0),
           m_Intrinsic<Intrinsic::vector_insert>(
               m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
    !isa<FixedVectorType>(CurrentInsertElt->getType()))
  return std::nullopt;
auto IIScalableTy = cast<ScalableVectorType>(II.getType());

// Insert the scalars into a container ordered by InsertElement index
SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
  auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
  Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
  CurrentInsertElt = InsertElt->getOperand(0);
}

bool AllowPoison =
    isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
if (!SimplifyValuePattern(Elts, AllowPoison))
  return std::nullopt;

// Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
for (size_t I = 0; I < Elts.size(); I++) {
  if (Elts[I] == nullptr)
    continue;
  InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
                                               Builder.getInt64(I));
}
if (InsertEltChain == nullptr)
  return std::nullopt;

// Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
// value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
// be bitcast to a type wide enough to fit the sequence, be splatted, and then
// be narrowed back to the original type.
unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                               IIScalableTy->getMinNumElements() /
                               PatternWidth;

IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
auto *WideShuffleMaskTy =
    ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);

auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
auto InsertSubvector = Builder.CreateInsertVector(
    II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
auto WideBitcast =
    Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
auto WideShuffle = Builder.CreateShuffleVector(
    WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
auto NarrowBitcast =
    Builder.CreateBitOrPointerCast(WideShuffle, II.getType());

return IC.replaceInstUsesWith(II, NarrowBitcast);
1587}

1589static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                      IntrinsicInst &II) {
Value *A = II.getArgOperand(0);
Value *B = II.getArgOperand(1);
if (A == B)
  return IC.replaceInstUsesWith(II, A);

return std::nullopt;
1597}

1599static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                      IntrinsicInst &II) {
IRBuilder<> Builder(&II);
Value *Pred = II.getOperand(0);
Value *Vec = II.getOperand(1);
Value *Shift = II.getOperand(2);

// Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
Value *AbsPred, *MergedValue;
if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                    m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
    !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                    m_Value(MergedValue), m_Value(AbsPred), m_Value())))

  return std::nullopt;

// Transform is valid if any of the following are true:
// * The ABS merge value is an undef or non-negative
// * The ABS predicate is all active
// * The ABS predicate and the SRSHL predicates are the same
if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
    AbsPred != Pred && !isAllActivePredicate(AbsPred))
  return std::nullopt;

// Only valid when the shift amount is non-negative, otherwise the rounding
// behaviour of SRSHL cannot be ignored.
if (!match(Shift, m_NonNegative()))
  return std::nullopt;

auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
                                   {Pred, Vec, Shift});

return IC.replaceInstUsesWith(II, LSL);
1632}

1634std::optional<Instruction *>
1635AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                   IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
default:
  break;
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
  return instCombineMaxMinNM(IC, II);
case Intrinsic::aarch64_sve_convert_from_svbool:
  return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
  return instCombineSVEDup(IC, II);
case Intrinsic::aarch64_sve_dup_x:
  return instCombineSVEDupX(IC, II);
case Intrinsic::aarch64_sve_cmpne:
case Intrinsic::aarch64_sve_cmpne_wide:
  return instCombineSVECmpNE(IC, II);
case Intrinsic::aarch64_sve_rdffr:
  return instCombineRDFFR(IC, II);
case Intrinsic::aarch64_sve_lasta:
case Intrinsic::aarch64_sve_lastb:
  return instCombineSVELast(IC, II);
case Intrinsic::aarch64_sve_clasta_n:
case Intrinsic::aarch64_sve_clastb_n:
  return instCombineSVECondLast(IC, II);
case Intrinsic::aarch64_sve_cntd:
  return instCombineSVECntElts(IC, II, 2);
case Intrinsic::aarch64_sve_cntw:
  return instCombineSVECntElts(IC, II, 4);
case Intrinsic::aarch64_sve_cnth:
  return instCombineSVECntElts(IC, II, 8);
case Intrinsic::aarch64_sve_cntb:
  return instCombineSVECntElts(IC, II, 16);
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
  return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_fmul:
  return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_fadd:
case Intrinsic::aarch64_sve_add:
  return instCombineSVEVectorAdd(IC, II);
case Intrinsic::aarch64_sve_fadd_u:
  return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                           Intrinsic::aarch64_sve_fmla_u>(
      IC, II, true);
case Intrinsic::aarch64_sve_fsub:
case Intrinsic::aarch64_sve_sub:
  return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
  return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                           Intrinsic::aarch64_sve_fmls_u>(
      IC, II, true);
case Intrinsic::aarch64_sve_tbl:
  return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:
case Intrinsic::aarch64_sve_uunpklo:
case Intrinsic::aarch64_sve_sunpkhi:
case Intrinsic::aarch64_sve_sunpklo:
  return instCombineSVEUnpack(IC, II);
case Intrinsic::aarch64_sve_zip1:
case Intrinsic::aarch64_sve_zip2:
  return instCombineSVEZip(IC, II);
case Intrinsic::aarch64_sve_ld1_gather_index:
  return instCombineLD1GatherIndex(IC, II);
case Intrinsic::aarch64_sve_st1_scatter_index:
  return instCombineST1ScatterIndex(IC, II);
case Intrinsic::aarch64_sve_ld1:
  return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
  return instCombineSVEST1(IC, II, DL);
case Intrinsic::aarch64_sve_sdiv:
  return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
  return instCombineSVESel(IC, II);
case Intrinsic::aarch64_sve_srshl:
  return instCombineSVESrshl(IC, II);
case Intrinsic::aarch64_sve_dupq_lane:
  return instCombineSVEDupqLane(IC, II);
}

return std::nullopt;
1719}

1721std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
  std::function<void(Instruction *, unsigned, APInt, APInt &)>
      SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
default:
  break;
case Intrinsic::aarch64_neon_fcvtxn:
case Intrinsic::aarch64_neon_rshrn:
case Intrinsic::aarch64_neon_sqrshrn:
case Intrinsic::aarch64_neon_sqrshrun:
case Intrinsic::aarch64_neon_sqshrn:
case Intrinsic::aarch64_neon_sqshrun:
case Intrinsic::aarch64_neon_sqxtn:
case Intrinsic::aarch64_neon_sqxtun:
case Intrinsic::aarch64_neon_uqrshrn:
case Intrinsic::aarch64_neon_uqshrn:
case Intrinsic::aarch64_neon_uqxtn:
  SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
  break;
}

return std::nullopt;
1745}

1747TypeSize
1748AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
switch (K) {
case TargetTransformInfo::RGK_Scalar:
  return TypeSize::getFixed(64);
case TargetTransformInfo::RGK_FixedWidthVector:
  if (!ST->isStreamingSVEModeDisabled() &&
      !EnableFixedwidthAutovecInStreamingMode)
    return TypeSize::getFixed(0);

  if (ST->hasSVE())
    return TypeSize::getFixed(
        std::max(ST->getMinSVEVectorSizeInBits(), 128u));

  return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
case TargetTransformInfo::RGK_ScalableVector:
  if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode)
    return TypeSize::getScalable(0);

  return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
}
llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind"
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 1768
);
1769}

1771bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                         ArrayRef<const Value *> Args) {

// A helper that returns a vector type from the given type. The number of
// elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
  return VectorType::get(ArgTy->getScalarType(),
                         cast<VectorType>(DstTy)->getElementCount());
};

// Exit early if DstTy is not a vector type whose elements are at least
// 16-bits wide. SVE doesn't generally have the same set of instructions to
// perform an extend with the add/sub/mul. There are SMULLB style
// instructions, but they operate on top/bottom, requiring some sort of lane
// interleaving to be used with zext/sext.
if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
  return false;

// Determine if the operation has a widening variant. We consider both the
// "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
// instructions.
//
// TODO: Add additional widening operations (e.g., shl, etc.) once we
//       verify that their extending operands are eliminated during code
//       generation.
switch (Opcode) {
case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
case Instruction::Mul: // SMULL(2), UMULL(2)
  break;
default:
  return false;
}

// To be a widening instruction (either the "wide" or "long" versions), the
// second operand must be a sign- or zero extend.
if (Args.size() != 2 ||
    (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
  return false;
auto *Extend = cast<CastInst>(Args[1]);
auto *Arg0 = dyn_cast<CastInst>(Args[0]);

// A mul only has a mull version (not like addw). Both operands need to be
// extending and the same type.
if (Opcode == Instruction::Mul &&
    (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
     Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
  return false;

// Legalize the destination type and ensure it can be used in a widening
// operation.
auto DstTyL = getTypeLegalizationCost(DstTy);
unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
  return false;

// Legalize the source type and ensure it can be used in a widening
// operation.
auto *SrcTy = toVectorTy(Extend->getSrcTy());
auto SrcTyL = getTypeLegalizationCost(SrcTy);
unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
  return false;

// Get the total number of vector elements in the legalized types.
InstructionCost NumDstEls =
    DstTyL.first * DstTyL.second.getVectorMinNumElements();
InstructionCost NumSrcEls =
    SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

// Return true if the legalized types have the same number of vector elements
// and the destination element type size is twice that of the source type.
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1844}

1846InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ?
 void (0) : __assert_fail ("ISD && \"Invalid opcode\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 1852
, __extension__ __PRETTY_FUNCTION__));

// If the cast is observable, and it is used by a widening instruction (e.g.,
// uaddl, saddw, etc.), it may be free.
if (I && I->hasOneUser()) {
  auto *SingleUser = cast<Instruction>(*I->user_begin());
  SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
  if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
    // If the cast is the second operand, it is free. We will generate either
    // a "wide" or "long" version of the widening instruction.
    if (I == SingleUser->getOperand(1))
      return 0;
    // If the cast is not the second operand, it will be free if it looks the
    // same as the second operand. In this case, we will generate a "long"
    // version of the widening instruction.
    if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
      if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
          cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
        return 0;
  }
}

// TODO: Allow non-throughput costs that aren't binary.
auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
};

EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);

if (!SrcTy.isSimple() || !DstTy.isSimple())
  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

static const TypeConversionCostTblEntry
ConversionTbl[] = {
  { ISD::TRUNCATE, MVT::v2i8,   MVT::v2i64,  1},  // xtn
  { ISD::TRUNCATE, MVT::v2i16,  MVT::v2i64,  1},  // xtn
  { ISD::TRUNCATE, MVT::v2i32,  MVT::v2i64,  1},  // xtn
  { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i32,  1},  // xtn
  { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i64,  3},  // 2 xtn + 1 uzp1
  { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i32,  1},  // xtn
  { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i64,  2},  // 1 uzp1 + 1 xtn
  { ISD::TRUNCATE, MVT::v4i32,  MVT::v4i64,  1},  // 1 uzp1
  { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i16,  1},  // 1 xtn
  { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i32,  2},  // 1 uzp1 + 1 xtn
  { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i64,  4},  // 3 x uzp1 + xtn
  { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i32,  1},  // 1 uzp1
  { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i64,  3},  // 3 x uzp1
  { ISD::TRUNCATE, MVT::v8i32,  MVT::v8i64,  2},  // 2 x uzp1
  { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i16, 1},  // uzp1
  { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i32, 3},  // (2 + 1) x uzp1
  { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2},  // 2 x uzp1
  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6},  // (4 + 2) x uzp1
  { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4},  // 4 x uzp1

  // Truncations on nxvmiN
  { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
  { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
  { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
  { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
  { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
  { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
  { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
  { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
  { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
  { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
  { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
  { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
  { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
  { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
  { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
  { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },

  // The number of shll instructions for the extension.
  { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
  { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
  { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
  { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
  { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
  { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
  { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
  { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
  { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
  { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
  { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
  { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

  // LowerVectorINT_TO_FP:
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

  // Complex: to v2f32
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
  { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
  { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

  // Complex: to v4f32
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
  { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
  { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

  // Complex: to v8f32
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
  { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

  // Complex: to v16f32
  { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
  { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

  // Complex: to v2f64
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

  // Complex: to v4f64
  { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
  { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },

  // LowerVectorFP_TO_INT
  { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
  { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
  { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
  { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

  // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
  { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
  { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
  { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
  { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
  { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
  { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

  // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
  { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
  { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
  { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
  { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

  // Complex, from nxv2f32.
  { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },

  // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
  { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
  { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
  { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
  { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
  { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
  { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },

  // Complex, from nxv2f64.
  { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },

  // Complex, from nxv4f32.
  { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
  { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
  { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
  { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
  { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },

  // Complex, from nxv8f64. Illegal -> illegal conversions not required.
  { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
  { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
  { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
  { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },

  // Complex, from nxv4f64. Illegal -> illegal conversions not required.
  { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
  { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
  { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
  { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
  { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
  { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },

  // Complex, from nxv8f32. Illegal -> illegal conversions not required.
  { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
  { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
  { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
  { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },

  // Complex, from nxv8f16.
  { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
  { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
  { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
  { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
  { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },

  // Complex, from nxv4f16.
  { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
  { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
  { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },

  // Complex, from nxv2f16.
  { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
  { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
  { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },

  // Truncate from nxvmf32 to nxvmf16.
  { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
  { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
  { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },

  // Truncate from nxvmf64 to nxvmf16.
  { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
  { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
  { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },

  // Truncate from nxvmf64 to nxvmf32.
  { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
  { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
  { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },

  // Extend from nxvmf16 to nxvmf32.
  { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
  { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
  { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},

  // Extend from nxvmf16 to nxvmf64.
  { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
  { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
  { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},

  // Extend from nxvmf32 to nxvmf64.
  { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
  { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
  { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},

  // Bitcasts from float to integer
  { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
  { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
  { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },

  // Bitcasts from integer to float
  { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
  { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
  { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },

  // Add cost for extending to illegal -too wide- scalable vectors.
  // zero/sign extend are implemented by multiple unpack operations,
  // where each operation has a cost of 1.
  { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
  { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
  { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
  { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
  { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
  { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},

  { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
  { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
  { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
  { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
  { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
  { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
};

if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                               DstTy.getSimpleVT(),
                                               SrcTy.getSimpleVT()))
  return AdjustCost(Entry->Cost);

static const TypeConversionCostTblEntry FP16Tbl[] = {
    {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
    {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
    {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
    {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
    {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
    {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
    {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
    {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
    {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
    {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
    {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
    {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
    {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
    {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
    {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
    {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
    {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
    {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
    {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // ushll + ucvtf
    {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // sshll + scvtf
    {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
    {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
};

if (ST->hasFullFP16())
  if (const auto *Entry = ConvertCostTableLookup(
          FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
    return AdjustCost(Entry->Cost);

// The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
// but we also want to include the TTI::CastContextHint::Masked case too.
if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
    CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
    TLI->isTypeLegal(DstTy))
  CCH = TTI::CastContextHint::Normal;

return AdjustCost(
    BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2203}

2205InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
                                                       Type *Dst,
                                                       VectorType *VecTy,
                                                       unsigned Index) {

// Make sure we were given a valid extend opcode.
assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&(static_cast <bool> ((Opcode == Instruction::SExt || Opcode
 == Instruction::ZExt) && "Invalid opcode") ? void (0
) : __assert_fail ("(Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && \"Invalid opcode\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2212
, __extension__ __PRETTY_FUNCTION__))
       "Invalid opcode")(static_cast <bool> ((Opcode == Instruction::SExt || Opcode
 == Instruction::ZExt) && "Invalid opcode") ? void (0
) : __assert_fail ("(Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && \"Invalid opcode\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2212
, __extension__ __PRETTY_FUNCTION__));

// We are extending an element we extract from a vector, so the source type
// of the extend is the element type of the vector.
auto *Src = VecTy->getElementType();

// Sign- and zero-extends are for integer types only.
assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type")(static_cast <bool> (isa<IntegerType>(Dst) &&
 isa<IntegerType>(Src) && "Invalid type") ? void
 (0) : __assert_fail ("isa<IntegerType>(Dst) && isa<IntegerType>(Src) && \"Invalid type\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2219
, __extension__ __PRETTY_FUNCTION__));

// Get the cost for the extract. We compute the cost (if any) for the extend
// below.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, Index, nullptr, nullptr);

// Legalize the types.
auto VecLT = getTypeLegalizationCost(VecTy);
auto DstVT = TLI->getValueType(DL, Dst);
auto SrcVT = TLI->getValueType(DL, Src);

// If the resulting type is still a vector and the destination type is legal,
// we may get the extension for free. If not, get the default cost for the
// extend.
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);

// The destination type should be larger than the element type. If not, get
// the default cost for the extend.
if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);

switch (Opcode) {
default:
  llvm_unreachable("Opcode should be either SExt or ZExt")::llvm::llvm_unreachable_internal("Opcode should be either SExt or ZExt"
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2247
);

// For sign-extends, we only need a smov, which performs the extension
// automatically.
case Instruction::SExt:
  return Cost;

// For zero-extends, the extend is performed automatically by a umov unless
// the destination type is i64 and the element type is i8 or i16.
case Instruction::ZExt:
  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
    return Cost;
}

// If we are unable to perform the extend for free, get the default cost.
return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                               CostKind);
2264}

2266InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
  return Opcode == Instruction::PHI ? 0 : 1;
assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind")(static_cast <bool> (CostKind == TTI::TCK_RecipThroughput
 && "unexpected CostKind") ? void (0) : __assert_fail
 ("CostKind == TTI::TCK_RecipThroughput && \"unexpected CostKind\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2271
, __extension__ __PRETTY_FUNCTION__));
// Branches are assumed to be predicted.
return 0;
2274}

2276InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
                                                       Type *Val,
                                                       unsigned Index,
                                                       bool HasRealUse) {
assert(Val->isVectorTy() && "This must be a vector type")(static_cast <bool> (Val->isVectorTy() && "This must be a vector type"
) ? void (0) : __assert_fail ("Val->isVectorTy() && \"This must be a vector type\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2280
, __extension__ __PRETTY_FUNCTION__));

if (Index != -1U) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector())
    return 0;

  // The type may be split. For fixed-width vectors we can normalize the
  // index to the new type.
  if (LT.second.isFixedLengthVector()) {
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;
  }

  // The element at index zero is already inside the vector.
  // - For a physical (HasRealUse==true) insert-element or extract-element
  // instruction that extracts integers, an explicit FPR -> GPR move is
  // needed. So it has non-zero cost.
  // - For the rest of cases (virtual instruction or element type is float),
  // consider the instruction free.
  if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
    return 0;

  // This is recognising a LD1 single-element structure to one lane of one
  // register instruction. I.e., if this is an `insertelement` instruction,
  // and its second operand is a load, then we will generate a LD1, which
  // are expensive instructions.
  if (I && dyn_cast<LoadInst>(I->getOperand(1)))
    return ST->getVectorInsertExtractBaseCost() + 1;

  // FIXME:
  // If the extract-element and insert-element instructions could be
  // simplified away (e.g., could be combined into users by looking at use-def
  // context), they have no cost. This is not done in the first place for
  // compile-time considerations.
}

// All other insert/extracts cost this much.
return ST->getVectorInsertExtractBaseCost();
2322}

2324InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
bool HasRealUse =
    Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2331}

2333InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                 Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index) {
return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2338}

2340InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
  TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
  ArrayRef<const Value *> Args,
  const Instruction *CxtI) {

// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                       Op2Info, Args, CxtI);

// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);

switch (ISD) {
default:
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                       Op2Info);
case ISD::SDIV:
  if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
    // On AArch64, scalar signed division by constants power-of-two are
    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
    // The OperandValue properties many not be same as that of previous
    // operation; conservatively assume OP_None.
    InstructionCost Cost = getArithmeticInstrCost(
        Instruction::Add, Ty, CostKind,
        Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(
        Instruction::Select, Ty, CostKind,
        Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    return Cost;
  }
  [[fallthrough]];
case ISD::UDIV: {
  if (Op2Info.isConstant() && Op2Info.isUniform()) {
    auto VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
      // Vector signed division by constant are expanded to the
      // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
      // to MULHS + SUB + SRL + ADD + SRL.
      InstructionCost MulCost = getArithmeticInstrCost(
          Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
      InstructionCost AddCost = getArithmeticInstrCost(
          Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
      InstructionCost ShrCost = getArithmeticInstrCost(
          Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
      return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
    }
  }

  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  if (Ty->isVectorTy()) {
    if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
      // SDIV/UDIV operations are lowered using SVE, then we can have less
      // costs.
      if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
                                              ->getPrimitiveSizeInBits()
                                              .getFixedValue() < 128) {
        EVT VT = TLI->getValueType(DL, Ty);
        static const CostTblEntry DivTbl[]{
            {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
            {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
            {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
            {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
            {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
            {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};

        const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
        if (nullptr != Entry)
          return Entry->Cost;
      }
      // For 8/16-bit elements, the cost is higher because the type
      // requires promotion and possibly splitting:
      if (LT.second.getScalarType() == MVT::i8)
        Cost *= 8;
      else if (LT.second.getScalarType() == MVT::i16)
        Cost *= 4;
      return Cost;
    } else {
      // If one of the operands is a uniform constant then the cost for each
      // element is Cost for insertion, extraction and division.
      // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
      // operation with scalar type
      if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
          (Op2Info.isConstant() && Op2Info.isUniform())) {
        if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
          InstructionCost DivCost = BaseT::getArithmeticInstrCost(
              Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
          return (4 + DivCost) * VTy->getNumElements();
        }
      }
      // On AArch64, without SVE, vector divisions are expanded
      // into scalar divisions of each pair of elements.
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
                                     CostKind, Op1Info, Op2Info);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Op1Info, Op2Info);
    }

    // TODO: if one of the arguments is scalar, then it's not necessary to
    // double the cost of handling the vector elements.
    Cost += Cost;
  }
  return Cost;
}
case ISD::MUL:
  // When SVE is available, then we can lower the v2i64 operation using
  // the SVE mul instruction, which has a lower cost.
  if (LT.second == MVT::v2i64 && ST->hasSVE())
    return LT.first;

  // When SVE is not available, there is no MUL.2d instruction,
  // which means mul <2 x i64> is expensive as elements are extracted
  // from the vectors and the muls scalarized.
  // As getScalarizationOverhead is a bit too pessimistic, we
  // estimate the cost for a i64 vector directly here, which is:
  // - four 2-cost i64 extracts,
  // - two 2-cost i64 inserts, and
  // - two 1-cost muls.
  // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
  // LT.first = 2 the cost is 28. If both operands are extensions it will not
  // need to scalarize so the cost can be cheaper (smull or umull).
  // so the cost can be cheaper (smull or umull).
  if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
    return LT.first;
  return LT.first * 14;
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
case ISD::AND:
case ISD::SRL:
case ISD::SRA:
case ISD::SHL:
  // These nodes are marked as 'custom' for combining purposes only.
  // We know that they are legal. See LowerAdd in ISelLowering.
  return LT.first;

case ISD::FNEG:
case ISD::FADD:
case ISD::FSUB:
  // Increase the cost for half and bfloat types if not architecturally
  // supported.
  if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
      (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
    return 2 * LT.first;
  if (!Ty->getScalarType()->isFP128Ty())
    return LT.first;
  LLVM_FALLTHROUGH[[fallthrough]];
case ISD::FMUL:
case ISD::FDIV:
  // These nodes are marked as 'custom' just to lower them to SVE.
  // We know said lowering will incur no additional cost.
  if (!Ty->getScalarType()->isFP128Ty())
    return 2 * LT.first;

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                       Op2Info);
}
2504}

2506InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
                                                        ScalarEvolution *SE,
                                                        const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
// extra micro-ops can significantly decrease throughput.
unsigned NumVectorInstToHideOverhead = 10;
int MaxMergeDistance = 64;

if (Ty->isVectorTy() && SE &&
    !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
  return NumVectorInstToHideOverhead;

// In many cases the address computation is not merged into the instruction
// addressing mode.
return 1;
2523}

2525InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   I);

int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
// width.
if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
  // We would need this many instructions to hide the scalarization happening.
  const int AmortizationCost = 20;

  // If VecPred is not set, check if we can get a predicate from the context
  // instruction, if its type matches the requested ValTy.
  if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
    CmpInst::Predicate CurrentPred;
    if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
                          m_Value())))
      VecPred = CurrentPred;
  }
  // Check if we have a compare/select chain that can be lowered using
  // a (F)CMxx & BFI pair.
  if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
      VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
      VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
      VecPred == CmpInst::FCMP_UNE) {
    static const auto ValidMinMaxTys = {
        MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
        MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
    static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

    auto LT = getTypeLegalizationCost(ValTy);
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
        (ST->hasFullFP16() &&
         any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
      return LT.first;
  }

  static const TypeConversionCostTblEntry
  VectorSelectTbl[] = {
    { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
    { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
    { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
    { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
    { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
    { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
  };

  EVT SelCondTy = TLI->getValueType(DL, CondTy);
  EVT SelValTy = TLI->getValueType(DL, ValTy);
  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
    if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
                                                   SelCondTy.getSimpleVT(),
                                                   SelValTy.getSimpleVT()))
      return Entry->Cost;
  }
}
// The base case handles scalable vectors fine for now, since it treats the
// cost as 1 * legalization cost.
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2590}

2592AArch64TTIImpl::TTI::MemCmpExpansionOptions
2593AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
if (ST->requiresStrictAlign()) {
  // TODO: Add cost modeling for strict align. Misaligned loads expand to
  // a bunch of instructions when strict align is enabled.
  return Options;
}
Options.AllowOverlappingLoads = true;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = Options.MaxNumLoads;
// TODO: Though vector loads usually perform well on AArch64, in some targets
// they may wake up the FP unit, which raises the power consumption.  Perhaps
// they could be used with no holds barred (-O3).
Options.LoadSizes = {8, 4, 2, 1};
return Options;
2608}

2610bool AArch64TTIImpl::prefersVectorizedAddressing() const {
return ST->hasSVE();
2612}

2614InstructionCost
2615AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                    Align Alignment, unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
if (useNeonVector(Src))
  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                      CostKind);
auto LT = getTypeLegalizationCost(Src);
if (!LT.first.isValid())
  return InstructionCost::getInvalid();

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
  return InstructionCost::getInvalid();

return LT.first;
2633}

2635static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2637}

2639InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
if (useNeonVector(DataTy))
  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                       Alignment, CostKind, I);
auto *VT = cast<VectorType>(DataTy);
auto LT = getTypeLegalizationCost(DataTy);
if (!LT.first.isValid())
  return InstructionCost::getInvalid();

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (cast<VectorType>(DataTy)->getElementCount() ==
    ElementCount::getScalable(1))
  return InstructionCost::getInvalid();

ElementCount LegalVF = LT.second.getVectorElementCount();
InstructionCost MemOpCost =
    getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
                    {TTI::OK_AnyValue, TTI::OP_None}, I);
// Add on an overhead cost for using gathers/scatters.
// TODO: At the moment this is applied unilaterally for all CPUs, but at some
// point we may want a per-CPU overhead.
MemOpCost *= getSVEGatherScatterOverhead(Opcode);
return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2667}

2669bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2671}

2673InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                CostKind);

auto LT = getTypeLegalizationCost(Ty);
if (!LT.first.isValid())
  return InstructionCost::getInvalid();

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
  if (VTy->getElementCount() == ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

// TODO: consider latency as well for TCK_SizeAndLatency.
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
  return LT.first;

if (CostKind != TTI::TCK_RecipThroughput)
  return 1;

if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
    LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
  // Unaligned stores are extremely inefficient. We don't split all
  // unaligned 128-bit stores because the negative impact that has shown in
  // practice on inlined block copy code.
  // We make such stores expensive so that we will only vectorize if there
  // are 6 other instructions getting vectorized.
  const int AmortizationCost = 6;

  return LT.first * 2 * AmortizationCost;
}

// Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
if (Ty->isPtrOrPtrVectorTy())
  return LT.first;

// Check truncating stores and extending loads.
if (useNeonVector(Ty) &&
    Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
  // v4i8 types are lowered to scalar a load/store and sshll/xtn.
  if (VT == MVT::v4i8)
    return 2;
  // Otherwise we need to scalarize.
  return cast<FixedVectorType>(Ty)->getNumElements() * 2;
}

return LT.first;
2731}

2733InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor")(static_cast <bool> (Factor >= 2 && "Invalid interleave factor"
) ? void (0) : __assert_fail ("Factor >= 2 && \"Invalid interleave factor\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2737
, __extension__ __PRETTY_FUNCTION__));
auto *VecVTy = cast<FixedVectorType>(VecTy);

if (!UseMaskForCond && !UseMaskForGaps &&
    Factor <= TLI->getMaxSupportedInterleaveFactor()) {
  unsigned NumElts = VecVTy->getNumElements();
  auto *SubVecTy =
      FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

  // ldN/stN only support legal vector types of size 64 or 128 in bits.
  // Accesses having vector types that are a multiple of 128 bits can be
  // matched to more than one ldN/stN instruction.
  bool UseScalable;
  if (NumElts % Factor == 0 &&
      TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
    return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
}

return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                         Alignment, AddressSpace, CostKind,
                                         UseMaskForCond, UseMaskForGaps);
2758}

2760InstructionCost
2761AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
InstructionCost Cost = 0;
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
for (auto *I : Tys) {
  if (!I->isVectorTy())
    continue;
  if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
      128)
    Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
            getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
}
return Cost;
2773}

2775unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
2777}

2779// For Falkor, we want to avoid having too many strided loads in a loop since
2780// that can exhaust the HW prefetcher resources.  We adjust the unroller
2781// MaxCount preference below to attempt to ensure unrolling doesn't create too
2782// many strided loads.
2783static void
2784getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                            TargetTransformInfo::UnrollingPreferences &UP) {
enum { MaxStridedLoads = 7 };
auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
  int StridedLoads = 0;
  // FIXME? We could make this more precise by looking at the CFG and
  // e.g. not counting loads in each side of an if-then-else diamond.
  for (const auto BB : L->blocks()) {
    for (auto &I : *BB) {
      LoadInst *LMemI = dyn_cast<LoadInst>(&I);
      if (!LMemI)
        continue;

      Value *PtrValue = LMemI->getPointerOperand();
      if (L->isLoopInvariant(PtrValue))
        continue;

      const SCEV *LSCEV = SE.getSCEV(PtrValue);
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
        continue;

      // FIXME? We could take pairing of unrolled load copies into account
      // by looking at the AddRec, but we would probably have to limit this
      // to loops with no stores or other memory optimization barriers.
      ++StridedLoads;
      // We've seen enough strided loads that seeing more won't make a
      // difference.
      if (StridedLoads > MaxStridedLoads / 2)
        return StridedLoads;
    }
  }
  return StridedLoads;
};

int StridedLoads = countStridedLoads(L, SE);
LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoadsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("aarch64tti")) { dbgs() << "falkor-hwpf: detected " <<
 StridedLoads << " strided loads\n"; } } while (false)
7
←
Assuming 'DebugFlag' is false→
8
←
Loop condition is false.  Exiting loop→
                  << " strided loads\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("aarch64tti")) { dbgs() << "falkor-hwpf: detected " <<
 StridedLoads << " strided loads\n"; } } while (false);
// Pick the largest power of 2 unroll count that won't result in too many
// strided loads.
if (StridedLoads) {
9
←
Assuming 'StridedLoads' is not equal to 0→
10
←
Taking true branch→
  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
11
←
Calling 'Log2_32'→
13
←
Returning from 'Log2_32'→
14
←
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("aarch64tti")) { dbgs() << "falkor-hwpf: setting unroll MaxCount to "
 << UP.MaxCount << '\n'; } } while (false)
                    << UP.MaxCount << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("aarch64tti")) { dbgs() << "falkor-hwpf: setting unroll MaxCount to "
 << UP.MaxCount << '\n'; } } while (false);
}
2829}

2831void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
// Enable partial unrolling and runtime unrolling.
BaseT::getUnrollingPreferences(L, SE, UP, ORE);

UP.UpperBound = true;

// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
// a larger threshold to unroll more loops.
if (L->getLoopDepth() > 1)
1
Assuming the condition is false→
2
←
Taking false branch→
  UP.PartialThreshold *= 2;

// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;

if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3
←
Assuming the condition is true→
5
←
Taking true branch→
    EnableFalkorHWPFUnrollFix)
4
←
Assuming the condition is true→
  getFalkorUnrollingPreferences(L, SE, UP);
6
←
Calling 'getFalkorUnrollingPreferences'→

// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
// unrolling.
for (auto *BB : L->getBlocks()) {
  for (auto &I : *BB) {
    // Don't unroll vectorised loop.
    if (I.getType()->isVectorTy())
      return;

    if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
      if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
        if (!isLoweredToCall(F))
          continue;
      }
      return;
    }
  }
}

// Enable runtime unrolling for in-order models
// If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
// checking for that case, we can ensure that the default behaviour is
// unchanged
if (ST->getProcFamily() != AArch64Subtarget::Others &&
    !ST->getSchedModel().isOutOfOrder()) {
  UP.Runtime = true;
  UP.Partial = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = 4;

  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;
}
2885}

2887void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
2890}

2892Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                       Type *ExpectedType) {
switch (Inst->getIntrinsicID()) {
default:
  return nullptr;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4: {
  // Create a struct type
  StructType *ST = dyn_cast<StructType>(ExpectedType);
  if (!ST)
    return nullptr;
  unsigned NumElts = Inst->arg_size() - 1;
  if (ST->getNumElements() != NumElts)
    return nullptr;
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
      return nullptr;
  }
  Value *Res = PoisonValue::get(ExpectedType);
  IRBuilder<> Builder(Inst);
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    Value *L = Inst->getArgOperand(i);
    Res = Builder.CreateInsertValue(Res, L, i);
  }
  return Res;
}
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
  if (Inst->getType() == ExpectedType)
    return Inst;
  return nullptr;
}
2926}

2928bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                      MemIntrinsicInfo &Info) {
switch (Inst->getIntrinsicID()) {
default:
  break;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
  Info.ReadMem = true;
  Info.WriteMem = false;
  Info.PtrVal = Inst->getArgOperand(0);
  break;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
  Info.ReadMem = false;
  Info.WriteMem = true;
  Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
  break;
}

switch (Inst->getIntrinsicID()) {
default:
  return false;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_st2:
  Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
  break;
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_st3:
  Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
  break;
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_st4:
  Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
  break;
}
return true;
2966}

2968/// See if \p I should be considered for address type promotion. We check if \p
2969/// I is a sext with right type and used in memory accesses. If it used in a
2970/// "complex" getelementptr, we allow it to be promoted without finding other
2971/// sext instructions that sign extended the same initial value. A getelementptr
2972/// is considered as "complex" if it has more than 2 operands.
2973bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
bool Considerable = false;
AllowPromotionWithoutCommonHeader = false;
if (!isa<SExtInst>(&I))
  return false;
Type *ConsideredSExtType =
    Type::getInt64Ty(I.getParent()->getParent()->getContext());
if (I.getType() != ConsideredSExtType)
  return false;
// See if the sext is the one with the right type and used in at least one
// GetElementPtrInst.
for (const User *U : I.users()) {
  if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
    Considerable = true;
    // A getelementptr is considered as "complex" if it has more than 2
    // operands. We will promote a SExt used in such complex GEP as we
    // expect some computation to be merged if they are done on 64 bits.
    if (GEPInst->getNumOperands() > 2) {
      AllowPromotionWithoutCommonHeader = true;
      break;
    }
  }
}
return Considerable;
2998}

3000bool AArch64TTIImpl::isLegalToVectorizeReduction(
  const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
if (!VF.isScalable())
  return true;

Type *Ty = RdxDesc.getRecurrenceType();
if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
  return false;

switch (RdxDesc.getRecurrenceKind()) {
case RecurKind::Add:
case RecurKind::FAdd:
case RecurKind::And:
case RecurKind::Or:
case RecurKind::Xor:
case RecurKind::SMin:
case RecurKind::SMax:
case RecurKind::UMin:
case RecurKind::UMax:
case RecurKind::FMin:
case RecurKind::FMax:
case RecurKind::SelectICmp:
case RecurKind::SelectFCmp:
case RecurKind::FMulAdd:
  return true;
default:
  return false;
}
3028}

3030InstructionCost
3031AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned, FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, FMF, CostKind);

assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&(static_cast <bool> ((isa<ScalableVectorType>(Ty)
 == isa<ScalableVectorType>(CondTy)) && "Both vector needs to be equally scalable"
) ? void (0) : __assert_fail ("(isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && \"Both vector needs to be equally scalable\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3040
, __extension__ __PRETTY_FUNCTION__))
       "Both vector needs to be equally scalable")(static_cast <bool> ((isa<ScalableVectorType>(Ty)
 == isa<ScalableVectorType>(CondTy)) && "Both vector needs to be equally scalable"
) ? void (0) : __assert_fail ("(isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && \"Both vector needs to be equally scalable\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3040
, __extension__ __PRETTY_FUNCTION__));

InstructionCost LegalizationCost = 0;
if (LT.first > 1) {
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
  Intrinsic::ID MinMaxOpcode =
      Ty->isFPOrFPVectorTy()
          ? Intrinsic::maxnum
          : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
  IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy},
                                FMF);
  LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
}

return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3055}

3057InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
  unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
InstructionCost LegalizationCost = 0;
if (LT.first > 1) {
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
  LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
  LegalizationCost *= LT.first - 1;
}

int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ?
 void (0) : __assert_fail ("ISD && \"Invalid opcode\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3068
, __extension__ __PRETTY_FUNCTION__));
// Add the final reduction cost for the legal horizontal reduction
switch (ISD) {
case ISD::ADD:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::FADD:
  return LegalizationCost + 2;
default:
  return InstructionCost::getInvalid();
}
3080}

3082InstructionCost
3083AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
if (TTI::requiresOrderedReduction(FMF)) {
  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
    InstructionCost BaseCost =
        BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
    // Add on extra cost to reflect the extra overhead on some CPUs. We still
    // end up vectorizing for more computationally intensive loops.
    return BaseCost + FixedVTy->getNumElements();
  }

  if (Opcode != Instruction::FAdd)
    return InstructionCost::getInvalid();

  auto *VTy = cast<ScalableVectorType>(ValTy);
  InstructionCost Cost =
      getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
  Cost *= getMaxNumElements(VTy->getElementCount());
  return Cost;
}

if (isa<ScalableVectorType>(ValTy))
  return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
MVT MTy = LT.second;
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ?
 void (0) : __assert_fail ("ISD && \"Invalid opcode\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3111
, __extension__ __PRETTY_FUNCTION__));

// Horizontal adds can use the 'addv' instruction. We model the cost of these
// instructions as twice a normal vector add, plus 1 for each legalization
// step (LT.first). This is the only arithmetic vector reduction operation for
// which we have an instruction.
// OR, XOR and AND costs should match the codegen from:
// OR: llvm/test/CodeGen/AArch64/reduce-or.ll
// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
static const CostTblEntry CostTblNoPairwise[]{
    {ISD::ADD, MVT::v8i8,   2},
    {ISD::ADD, MVT::v16i8,  2},
    {ISD::ADD, MVT::v4i16,  2},
    {ISD::ADD, MVT::v8i16,  2},
    {ISD::ADD, MVT::v4i32,  2},
    {ISD::ADD, MVT::v2i64,  2},
    {ISD::OR,  MVT::v8i8,  15},
    {ISD::OR,  MVT::v16i8, 17},
    {ISD::OR,  MVT::v4i16,  7},
    {ISD::OR,  MVT::v8i16,  9},
    {ISD::OR,  MVT::v2i32,  3},
    {ISD::OR,  MVT::v4i32,  5},
    {ISD::OR,  MVT::v2i64,  3},
    {ISD::XOR, MVT::v8i8,  15},
    {ISD::XOR, MVT::v16i8, 17},
    {ISD::XOR, MVT::v4i16,  7},
    {ISD::XOR, MVT::v8i16,  9},
    {ISD::XOR, MVT::v2i32,  3},
    {ISD::XOR, MVT::v4i32,  5},
    {ISD::XOR, MVT::v2i64,  3},
    {ISD::AND, MVT::v8i8,  15},
    {ISD::AND, MVT::v16i8, 17},
    {ISD::AND, MVT::v4i16,  7},
    {ISD::AND, MVT::v8i16,  9},
    {ISD::AND, MVT::v2i32,  3},
    {ISD::AND, MVT::v4i32,  5},
    {ISD::AND, MVT::v2i64,  3},
};
switch (ISD) {
default:
  break;
case ISD::ADD:
  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
    return (LT.first - 1) + Entry->Cost;
  break;
case ISD::XOR:
case ISD::AND:
case ISD::OR:
  const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
  if (!Entry)
    break;
  auto *ValVTy = cast<FixedVectorType>(ValTy);
  if (!ValVTy->getElementType()->isIntegerTy(1) &&
      MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
      isPowerOf2_32(ValVTy->getNumElements())) {
    InstructionCost ExtraCost = 0;
    if (LT.first != 1) {
      // Type needs to be split, so there is an extra cost of LT.first - 1
      // arithmetic ops.
      auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                      MTy.getVectorNumElements());
      ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
      ExtraCost *= LT.first - 1;
    }
    return Entry->Cost + ExtraCost;
  }
  break;
}
return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3181}

3183InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
static const CostTblEntry ShuffleTbl[] = {
    { TTI::SK_Splice, MVT::nxv16i8,  1 },
    { TTI::SK_Splice, MVT::nxv8i16,  1 },
    { TTI::SK_Splice, MVT::nxv4i32,  1 },
    { TTI::SK_Splice, MVT::nxv2i64,  1 },
    { TTI::SK_Splice, MVT::nxv2f16,  1 },
    { TTI::SK_Splice, MVT::nxv4f16,  1 },
    { TTI::SK_Splice, MVT::nxv8f16,  1 },
    { TTI::SK_Splice, MVT::nxv2bf16, 1 },
    { TTI::SK_Splice, MVT::nxv4bf16, 1 },
    { TTI::SK_Splice, MVT::nxv8bf16, 1 },
    { TTI::SK_Splice, MVT::nxv2f32,  1 },
    { TTI::SK_Splice, MVT::nxv4f32,  1 },
    { TTI::SK_Splice, MVT::nxv2f64,  1 },
};

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (Tp->getElementCount() == ElementCount::getScalable(1))
  return InstructionCost::getInvalid();

std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                     ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                     : LT.second;
Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
InstructionCost LegalizationCost = 0;
if (Index < 0) {
  LegalizationCost =
      getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind) +
      getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
}

// Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
// Cost performed on a promoted type.
if (LT.second.getScalarType() == MVT::i1) {
  LegalizationCost +=
      getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                       TTI::CastContextHint::None, CostKind) +
      getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                       TTI::CastContextHint::None, CostKind);
}
const auto *Entry =
    CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
assert(Entry && "Illegal Type for Splice")(static_cast <bool> (Entry && "Illegal Type for Splice"
) ? void (0) : __assert_fail ("Entry && \"Illegal Type for Splice\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3234
, __extension__ __PRETTY_FUNCTION__));
LegalizationCost += Entry->Cost;
return LegalizationCost * LT.first;
3237}

3239InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp,
                                             ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
// If we have a Mask, and the LT is being legalized somehow, split the Mask
// into smaller vectors and sum the cost of each shuffle.
if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
    Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
    cast<FixedVectorType>(Tp)->getNumElements() >
        LT.second.getVectorNumElements() &&
    !Index && !SubTp) {
  unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
  assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!")(static_cast <bool> (Mask.size() == TpNumElts &&
 "Expected Mask and Tp size to match!") ? void (0) : __assert_fail
 ("Mask.size() == TpNumElts && \"Expected Mask and Tp size to match!\""
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3254
, __extension__ __PRETTY_FUNCTION__));
  unsigned LTNumElts = LT.second.getVectorNumElements();
  unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
  VectorType *NTp =
      VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
  InstructionCost Cost;
  for (unsigned N = 0; N < NumVecs; N++) {
    SmallVector<int> NMask;
    // Split the existing mask into chunks of size LTNumElts. Track the source
    // sub-vectors to ensure the result has at most 2 inputs.
    unsigned Source1, Source2;
    unsigned NumSources = 0;
    for (unsigned E = 0; E < LTNumElts; E++) {
      int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                    : PoisonMaskElem;
      if (MaskElt < 0) {
        NMask.push_back(PoisonMaskElem);
        continue;
      }

      // Calculate which source from the input this comes from and whether it
      // is new to us.
      unsigned Source = MaskElt / LTNumElts;
      if (NumSources == 0) {
        Source1 = Source;
        NumSources = 1;
      } else if (NumSources == 1 && Source != Source1) {
        Source2 = Source;
        NumSources = 2;
      } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
        NumSources++;
      }

      // Add to the new mask. For the NumSources>2 case these are not correct,
      // but are only used for the modular lane number.
      if (Source == Source1)
        NMask.push_back(MaskElt % LTNumElts);
      else if (Source == Source2)
        NMask.push_back(MaskElt % LTNumElts + LTNumElts);
      else
        NMask.push_back(MaskElt % LTNumElts);
    }
    // If the sub-mask has at most 2 input sub-vectors then re-cost it using
    // getShuffleCost. If not then cost it using the worst case.
    if (NumSources <= 2)
      Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                             : TTI::SK_PermuteTwoSrc,
                             NTp, NMask, CostKind, 0, nullptr, Args);
    else if (any_of(enumerate(NMask), [&](const auto &ME) {
               return ME.value() % LTNumElts == ME.index();
             }))
      Cost += LTNumElts - 1;
    else
      Cost += LTNumElts;
  }
  return Cost;
}

Kind = improveShuffleKindFromMask(Kind, Mask);

// Check for broadcast loads, which are supported by the LD1R instruction.
// In terms of code-size, the shuffle vector is free when a load + dup get
// folded into a LD1R. That's what we check and return here. For performance
// and reciprocal throughput, a LD1R is not completely free. In this case, we
// return the cost for the broadcast below (i.e. 1 for most/all types), so
// that we model the load + dup sequence slightly higher because LD1R is a
// high latency instruction.
if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
  bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
  if (IsLoad && LT.second.isVector() &&
      isLegalBroadcastLoad(Tp->getElementType(),
                           LT.second.getVectorElementCount()))
    return 0;
}

// If we have 4 elements for the shuffle and a Mask, get the cost straight
// from the perfect shuffle tables.
if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
    (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
    all_of(Mask, [](int E) { return E < 8; }))
  return getPerfectShuffleCost(Mask);

if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
    Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
    Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
  static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      {TTI::SK_Broadcast, MVT::v8i8, 1},
      {TTI::SK_Broadcast, MVT::v16i8, 1},
      {TTI::SK_Broadcast, MVT::v4i16, 1},
      {TTI::SK_Broadcast, MVT::v8i16, 1},
      {TTI::SK_Broadcast, MVT::v2i32, 1},
      {TTI::SK_Broadcast, MVT::v4i32, 1},
      {TTI::SK_Broadcast, MVT::v2i64, 1},
      {TTI::SK_Broadcast, MVT::v4f16, 1},
      {TTI::SK_Broadcast, MVT::v8f16, 1},
      {TTI::SK_Broadcast, MVT::v2f32, 1},
      {TTI::SK_Broadcast, MVT::v4f32, 1},
      {TTI::SK_Broadcast, MVT::v2f64, 1},
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      {TTI::SK_Transpose, MVT::v8i8, 1},
      {TTI::SK_Transpose, MVT::v16i8, 1},
      {TTI::SK_Transpose, MVT::v4i16, 1},
      {TTI::SK_Transpose, MVT::v8i16, 1},
      {TTI::SK_Transpose, MVT::v2i32, 1},
      {TTI::SK_Transpose, MVT::v4i32, 1},
      {TTI::SK_Transpose, MVT::v2i64, 1},
      {TTI::SK_Transpose, MVT::v4f16, 1},
      {TTI::SK_Transpose, MVT::v8f16, 1},
      {TTI::SK_Transpose, MVT::v2f32, 1},
      {TTI::SK_Transpose, MVT::v4f32, 1},
      {TTI::SK_Transpose, MVT::v2f64, 1},
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      {TTI::SK_Select, MVT::v2i32, 1}, // mov.
      {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2i64, 1}, // mov.
      {TTI::SK_Select, MVT::v2f32, 1}, // mov.
      {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2f64, 1}, // mov.
      // PermuteSingleSrc shuffle kinds.
      {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8},  // constpool + load + tbl
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8},  // constpool + load + tbl
      {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
      {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8},   // constpool + load + tbl
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8},  // constpool + load + tbl
      // Reverse can be lowered with `rev`.
      {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
      {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
      {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
      {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
      {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
      {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
      {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
      {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
      {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
      {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
      {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
      {TTI::SK_Reverse, MVT::v8i8, 1},  // REV64
      // Splice can all be lowered as `ext`.
      {TTI::SK_Splice, MVT::v2i32, 1},
      {TTI::SK_Splice, MVT::v4i32, 1},
      {TTI::SK_Splice, MVT::v2i64, 1},
      {TTI::SK_Splice, MVT::v2f32, 1},
      {TTI::SK_Splice, MVT::v4f32, 1},
      {TTI::SK_Splice, MVT::v2f64, 1},
      {TTI::SK_Splice, MVT::v8f16, 1},
      {TTI::SK_Splice, MVT::v8bf16, 1},
      {TTI::SK_Splice, MVT::v8i16, 1},
      {TTI::SK_Splice, MVT::v16i8, 1},
      {TTI::SK_Splice, MVT::v4bf16, 1},
      {TTI::SK_Splice, MVT::v4f16, 1},
      {TTI::SK_Splice, MVT::v4i16, 1},
      {TTI::SK_Splice, MVT::v8i8, 1},
      // Broadcast shuffle kinds for scalable vectors
      {TTI::SK_Broadcast, MVT::nxv16i8, 1},
      {TTI::SK_Broadcast, MVT::nxv8i16, 1},
      {TTI::SK_Broadcast, MVT::nxv4i32, 1},
      {TTI::SK_Broadcast, MVT::nxv2i64, 1},
      {TTI::SK_Broadcast, MVT::nxv2f16, 1},
      {TTI::SK_Broadcast, MVT::nxv4f16, 1},
      {TTI::SK_Broadcast, MVT::nxv8f16, 1},
      {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
      {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
      {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
      {TTI::SK_Broadcast, MVT::nxv2f32, 1},
      {TTI::SK_Broadcast, MVT::nxv4f32, 1},
      {TTI::SK_Broadcast, MVT::nxv2f64, 1},
      {TTI::SK_Broadcast, MVT::nxv16i1, 1},
      {TTI::SK_Broadcast, MVT::nxv8i1, 1},
      {TTI::SK_Broadcast, MVT::nxv4i1, 1},
      {TTI::SK_Broadcast, MVT::nxv2i1, 1},
      // Handle the cases for vector.reverse with scalable vectors
      {TTI::SK_Reverse, MVT::nxv16i8, 1},
      {TTI::SK_Reverse, MVT::nxv8i16, 1},
      {TTI::SK_Reverse, MVT::nxv4i32, 1},
      {TTI::SK_Reverse, MVT::nxv2i64, 1},
      {TTI::SK_Reverse, MVT::nxv2f16, 1},
      {TTI::SK_Reverse, MVT::nxv4f16, 1},
      {TTI::SK_Reverse, MVT::nxv8f16, 1},
      {TTI::SK_Reverse, MVT::nxv2bf16, 1},
      {TTI::SK_Reverse, MVT::nxv4bf16, 1},
      {TTI::SK_Reverse, MVT::nxv8bf16, 1},
      {TTI::SK_Reverse, MVT::nxv2f32, 1},
      {TTI::SK_Reverse, MVT::nxv4f32, 1},
      {TTI::SK_Reverse, MVT::nxv2f64, 1},
      {TTI::SK_Reverse, MVT::nxv16i1, 1},
      {TTI::SK_Reverse, MVT::nxv8i1, 1},
      {TTI::SK_Reverse, MVT::nxv4i1, 1},
      {TTI::SK_Reverse, MVT::nxv2i1, 1},
  };
  if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
    return LT.first * Entry->Cost;
}

if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
  return getSpliceCost(Tp, Index);

// Inserting a subvector can often be done with either a D, S or H register
// move, so long as the inserted vector is "aligned".
if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
    LT.second.getSizeInBits() <= 128 && SubTp) {
  std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
  if (SubLT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    int NumSubElts = SubLT.second.getVectorNumElements();
    if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
      return SubLT.first;
  }
}

return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3476}

3478static bool containsDecreasingPointers(Loop *TheLoop,
                                     PredicatedScalarEvolution *PSE) {
const ValueToValueMap &Strides = ValueToValueMap();
for (BasicBlock *BB : TheLoop->blocks()) {
  // Scan the instructions in the block and look for addresses that are
  // consecutive and decreasing.
  for (Instruction &I : *BB) {
    if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      Type *AccessTy = getLoadStoreType(&I);
      if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
                       /*ShouldCheckWrap=*/false)
              .value_or(0) < 0)
        return true;
    }
  }
}
return false;
3496}

3498bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
  return false;

// We don't currently support vectorisation with interleaving for SVE - with
// such loops we're better off not using tail-folding. This gives us a chance
// to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
if (TFI->IAI->hasGroups())
  return false;

TailFoldingKind Required; // Defaults to 0.
if (TFI->LVL->getReductionVars().size())
  Required.add(TailFoldingKind::TFReductions);
if (TFI->LVL->getFixedOrderRecurrences().size())
  Required.add(TailFoldingKind::TFRecurrences);

// We call this to discover whether any load/store pointers in the loop have
// negative strides. This will require extra work to reverse the loop
// predicate, which may be expensive.
if (containsDecreasingPointers(TFI->LVL->getLoop(),
                               TFI->LVL->getPredicatedScalarEvolution()))
  Required.add(TailFoldingKind::TFReverse);
if (!Required)
  Required.add(TailFoldingKind::TFSimple);

return (TailFoldingKindLoc & Required) == Required;
3524}

3526InstructionCost
3527AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                   int64_t BaseOffset, bool HasBaseReg,
                                   int64_t Scale, unsigned AddrSpace) const {
// Scaling factors are not free at all.
// Operands                     | Rt Latency
// -------------------------------------------
// Rt, [Xn, Xm]                 | 4
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm]  |
TargetLoweringBase::AddrMode AM;
AM.BaseGV = BaseGV;
AM.BaseOffs = BaseOffset;
AM.HasBaseReg = HasBaseReg;
AM.Scale = Scale;
if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
  // Scale represents reg2 * scale, thus account for 1 if
  // it is not equal to 0 or 1.
  return AM.Scale != 0 && AM.Scale != 1;
return -1;
3547}

←

/build/source/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12 
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15 
16#include "llvm/ADT/bit.h"
17#include "llvm/Support/Compiler.h"
18#include <cassert>
19#include <climits>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24 
25namespace llvm {
26 
27/// Mathematical constants.
28namespace numbers {
29// TODO: Track C++20 std::numbers.
30// TODO: Favor using the hexadecimal FP constants (requires C++17).
31constexpr double e          = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
32                 egamma     = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
33                 ln2        = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
34                 ln10       = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
35                 log2e      = 1.4426950408889634074, // (0x1.71547652b82feP+0)
36                 log10e     = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
37                 pi         = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
38                 inv_pi     = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
39                 sqrtpi     = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
40                 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
41                 sqrt2      = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
42                 inv_sqrt2  = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
43                 sqrt3      = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
44                 inv_sqrt3  = .57735026918962576451, // (0x1.279a74590331cP-1)
45                 phi        = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
46constexpr float ef          = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
47                egammaf     = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
48                ln2f        = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
49                ln10f       = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
50                log2ef      = 1.44269504F, // (0x1.715476P+0)
51                log10ef     = .434294482F, // (0x1.bcb7b2P-2)
52                pif         = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
53                inv_pif     = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
54                sqrtpif     = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
55                inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
56                sqrt2f      = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
57                inv_sqrt2f  = .707106781F, // (0x1.6a09e6P-1)
58                sqrt3f      = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
59                inv_sqrt3f  = .577350269F, // (0x1.279a74P-1)
60                phif        = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
61} // namespace numbers
62 
63/// Count number of 0's from the least significant bit to the most
64///   stopping at the first 1.
65///
66/// Only unsigned integral types are allowed.
67///
68/// Returns std::numeric_limits<T>::digits on an input of 0.
69template <typename T>
70LLVM_DEPRECATED("Use llvm::countr_zero instead.", "llvm::countr_zero")__attribute__((deprecated("Use llvm::countr_zero instead.", "llvm::countr_zero"
)))
71unsigned countTrailingZeros(T Val) {
72  static_assert(std::is_unsigned_v<T>,
73                "Only unsigned integral types are allowed.");
74  return llvm::countr_zero(Val);
75}
76 
77/// Count number of 0's from the most significant bit to the least
78///   stopping at the first 1.
79///
80/// Only unsigned integral types are allowed.
81///
82/// Returns std::numeric_limits<T>::digits on an input of 0.
83template <typename T>
84LLVM_DEPRECATED("Use llvm::countl_zero instead.", "llvm::countl_zero")__attribute__((deprecated("Use llvm::countl_zero instead.", "llvm::countl_zero"
)))
85unsigned countLeadingZeros(T Val) {
86  static_assert(std::is_unsigned_v<T>,
87                "Only unsigned integral types are allowed.");
88  return llvm::countl_zero(Val);
89}
90 
91/// Create a bitmask with the N right-most bits set to 1, and all other
92/// bits set to 0.  Only unsigned types are allowed.
93template <typename T> T maskTrailingOnes(unsigned N) {
94  static_assert(std::is_unsigned_v<T>, "Invalid type!");
95  const unsigned Bits = CHAR_BIT8 * sizeof(T);
96  assert(N <= Bits && "Invalid bit index")(static_cast <bool> (N <= Bits && "Invalid bit index"
) ? void (0) : __assert_fail ("N <= Bits && \"Invalid bit index\""
, "llvm/include/llvm/Support/MathExtras.h", 96, __extension__
 __PRETTY_FUNCTION__));
97  return N == 0 ? 0 : (T(-1) >> (Bits - N));
98}
99 
100/// Create a bitmask with the N left-most bits set to 1, and all other
101/// bits set to 0.  Only unsigned types are allowed.
102template <typename T> T maskLeadingOnes(unsigned N) {
103  return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
104}
105 
106/// Create a bitmask with the N right-most bits set to 0, and all other
107/// bits set to 1.  Only unsigned types are allowed.
108template <typename T> T maskTrailingZeros(unsigned N) {
109  return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
110}
111 
112/// Create a bitmask with the N left-most bits set to 0, and all other
113/// bits set to 1.  Only unsigned types are allowed.
114template <typename T> T maskLeadingZeros(unsigned N) {
115  return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
116}
117 
118/// Macro compressed bit reversal table for 256 bits.
119///
120/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
121static const unsigned char BitReverseTable256[256] = {
122#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
123#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
124#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
125  R6(0), R6(2), R6(1), R6(3)
126#undef R2
127#undef R4
128#undef R6
129};
130 
131/// Reverse the bits in \p Val.
132template <typename T> T reverseBits(T Val) {
133#if __has_builtin(__builtin_bitreverse8)1
134  if constexpr (std::is_same_v<T, uint8_t>)
135    return __builtin_bitreverse8(Val);
136#endif
137#if __has_builtin(__builtin_bitreverse16)1
138  if constexpr (std::is_same_v<T, uint16_t>)
139    return __builtin_bitreverse16(Val);
140#endif
141#if __has_builtin(__builtin_bitreverse32)1
142  if constexpr (std::is_same_v<T, uint32_t>)
143    return __builtin_bitreverse32(Val);
144#endif
145#if __has_builtin(__builtin_bitreverse64)1
146  if constexpr (std::is_same_v<T, uint64_t>)
147    return __builtin_bitreverse64(Val);
148#endif
149 
150  unsigned char in[sizeof(Val)];
151  unsigned char out[sizeof(Val)];
152  std::memcpy(in, &Val, sizeof(Val));
153  for (unsigned i = 0; i < sizeof(Val); ++i)
154    out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
155  std::memcpy(&Val, out, sizeof(Val));
156  return Val;
157}
158 
159// NOTE: The following support functions use the _32/_64 extensions instead of
160// type overloading so that signed and unsigned integers can be used without
161// ambiguity.
162 
163/// Return the high 32 bits of a 64 bit value.
164constexpr inline uint32_t Hi_32(uint64_t Value) {
165  return static_cast<uint32_t>(Value >> 32);
166}
167 
168/// Return the low 32 bits of a 64 bit value.
169constexpr inline uint32_t Lo_32(uint64_t Value) {
170  return static_cast<uint32_t>(Value);
171}
172 
173/// Make a 64-bit integer from a high / low pair of 32-bit integers.
174constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
175  return ((uint64_t)High << 32) | (uint64_t)Low;
176}
177 
178/// Checks if an integer fits into the given bit width.
179template <unsigned N> constexpr inline bool isInt(int64_t x) {
180  if constexpr (N == 8)
181    return static_cast<int8_t>(x) == x;
182  if constexpr (N == 16)
183    return static_cast<int16_t>(x) == x;
184  if constexpr (N == 32)
185    return static_cast<int32_t>(x) == x;
186  if constexpr (N < 64)
187    return -(INT64_C(1)1L << (N - 1)) <= x && x < (INT64_C(1)1L << (N - 1));
188  (void)x; // MSVC v19.25 warns that x is unused.
189  return true;
190}
191 
192/// Checks if a signed integer is an N bit number shifted left by S.
193template <unsigned N, unsigned S>
194constexpr inline bool isShiftedInt(int64_t x) {
195  static_assert(
196      N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
197  static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
198  return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
199}
200 
201/// Checks if an unsigned integer fits into the given bit width.
202template <unsigned N> constexpr inline bool isUInt(uint64_t x) {
203  static_assert(N > 0, "isUInt<0> doesn't make sense");
204  if constexpr (N == 8)
205    return static_cast<uint8_t>(x) == x;
206  if constexpr (N == 16)
207    return static_cast<uint16_t>(x) == x;
208  if constexpr (N == 32)
209    return static_cast<uint32_t>(x) == x;
210  if constexpr (N < 64)
211    return x < (UINT64_C(1)1UL << (N));
212  (void)x; // MSVC v19.25 warns that x is unused.
213  return true;
214}
215 
216/// Checks if a unsigned integer is an N bit number shifted left by S.
217template <unsigned N, unsigned S>
218constexpr inline bool isShiftedUInt(uint64_t x) {
219  static_assert(
220      N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
221  static_assert(N + S <= 64,
222                "isShiftedUInt<N, S> with N + S > 64 is too wide.");
223  // Per the two static_asserts above, S must be strictly less than 64.  So
224  // 1 << S is not undefined behavior.
225  return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
226}
227 
228/// Gets the maximum value for a N-bit unsigned integer.
229inline uint64_t maxUIntN(uint64_t N) {
230  assert(N > 0 && N <= 64 && "integer width out of range")(static_cast <bool> (N > 0 && N <= 64 &&
 "integer width out of range") ? void (0) : __assert_fail ("N > 0 && N <= 64 && \"integer width out of range\""
, "llvm/include/llvm/Support/MathExtras.h", 230, __extension__
 __PRETTY_FUNCTION__));
231 
232  // uint64_t(1) << 64 is undefined behavior, so we can't do
233  //   (uint64_t(1) << N) - 1
234  // without checking first that N != 64.  But this works and doesn't have a
235  // branch.
236  return UINT64_MAX(18446744073709551615UL) >> (64 - N);
237}
238 
239/// Gets the minimum value for a N-bit signed integer.
240inline int64_t minIntN(int64_t N) {
241  assert(N > 0 && N <= 64 && "integer width out of range")(static_cast <bool> (N > 0 && N <= 64 &&
 "integer width out of range") ? void (0) : __assert_fail ("N > 0 && N <= 64 && \"integer width out of range\""
, "llvm/include/llvm/Support/MathExtras.h", 241, __extension__
 __PRETTY_FUNCTION__));
242 
243  return UINT64_C(1)1UL + ~(UINT64_C(1)1UL << (N - 1));
244}
245 
246/// Gets the maximum value for a N-bit signed integer.
247inline int64_t maxIntN(int64_t N) {
248  assert(N > 0 && N <= 64 && "integer width out of range")(static_cast <bool> (N > 0 && N <= 64 &&
 "integer width out of range") ? void (0) : __assert_fail ("N > 0 && N <= 64 && \"integer width out of range\""
, "llvm/include/llvm/Support/MathExtras.h", 248, __extension__
 __PRETTY_FUNCTION__));
249 
250  // This relies on two's complement wraparound when N == 64, so we convert to
251  // int64_t only at the very end to avoid UB.
252  return (UINT64_C(1)1UL << (N - 1)) - 1;
253}
254 
255/// Checks if an unsigned integer fits into the given (dynamic) bit width.
256inline bool isUIntN(unsigned N, uint64_t x) {
257  return N >= 64 || x <= maxUIntN(N);
258}
259 
260/// Checks if an signed integer fits into the given (dynamic) bit width.
261inline bool isIntN(unsigned N, int64_t x) {
262  return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
263}
264 
265/// Return true if the argument is a non-empty sequence of ones starting at the
266/// least significant bit with the remainder zero (32 bit version).
267/// Ex. isMask_32(0x0000FFFFU) == true.
268constexpr inline bool isMask_32(uint32_t Value) {
269  return Value && ((Value + 1) & Value) == 0;
270}
271 
272/// Return true if the argument is a non-empty sequence of ones starting at the
273/// least significant bit with the remainder zero (64 bit version).
274constexpr inline bool isMask_64(uint64_t Value) {
275  return Value && ((Value + 1) & Value) == 0;
276}
277 
278/// Return true if the argument contains a non-empty sequence of ones with the
279/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
280constexpr inline bool isShiftedMask_32(uint32_t Value) {
281  return Value && isMask_32((Value - 1) | Value);
282}
283 
284/// Return true if the argument contains a non-empty sequence of ones with the
285/// remainder zero (64 bit version.)
286constexpr inline bool isShiftedMask_64(uint64_t Value) {
287  return Value && isMask_64((Value - 1) | Value);
288}
289 
290/// Return true if the argument is a power of two > 0.
291/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
292constexpr inline bool isPowerOf2_32(uint32_t Value) {
293  return llvm::has_single_bit(Value);
294}
295 
296/// Return true if the argument is a power of two > 0 (64 bit edition.)
297constexpr inline bool isPowerOf2_64(uint64_t Value) {
298  return llvm::has_single_bit(Value);
299}
300 
301/// Count the number of ones from the most significant bit to the first
302/// zero bit.
303///
304/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
305/// Only unsigned integral types are allowed.
306///
307/// Returns std::numeric_limits<T>::digits on an input of all ones.
308template <typename T>
309LLVM_DEPRECATED("Use llvm::countl_one instead.", "llvm::countl_one")__attribute__((deprecated("Use llvm::countl_one instead.", "llvm::countl_one"
)))
310unsigned countLeadingOnes(T Value) {
311  static_assert(std::is_unsigned_v<T>,
312                "Only unsigned integral types are allowed.");
313  return llvm::countl_one<T>(Value);
314}
315 
316/// Count the number of ones from the least significant bit to the first
317/// zero bit.
318///
319/// Ex. countTrailingOnes(0x00FF00FF) == 8.
320/// Only unsigned integral types are allowed.
321///
322/// Returns std::numeric_limits<T>::digits on an input of all ones.
323template <typename T>
324LLVM_DEPRECATED("Use llvm::countr_one instead.", "llvm::countr_one")__attribute__((deprecated("Use llvm::countr_one instead.", "llvm::countr_one"
)))
325unsigned countTrailingOnes(T Value) {
326  static_assert(std::is_unsigned_v<T>,
327                "Only unsigned integral types are allowed.");
328  return llvm::countr_one<T>(Value);
329}
330 
331/// Count the number of set bits in a value.
332/// Ex. countPopulation(0xF000F000) = 8
333/// Returns 0 if the word is zero.
334template <typename T>
335LLVM_DEPRECATED("Use llvm::popcount instead.", "llvm::popcount")__attribute__((deprecated("Use llvm::popcount instead.", "llvm::popcount"
)))
336inline unsigned countPopulation(T Value) {
337  static_assert(std::is_unsigned_v<T>,
338                "Only unsigned integral types are allowed.");
339  return (unsigned)llvm::popcount(Value);
340}
341 
342/// Return true if the argument contains a non-empty sequence of ones with the
343/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
344/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
345/// MaskLen is updated to specify the length of the mask, else neither are
346/// updated.
347inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
348                             unsigned &MaskLen) {
349  if (!isShiftedMask_32(Value))
350    return false;
351  MaskIdx = llvm::countr_zero(Value);
352  MaskLen = llvm::popcount(Value);
353  return true;
354}
355 
356/// Return true if the argument contains a non-empty sequence of ones with the
357/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
358/// of the lowest set bit and \p MaskLen is updated to specify the length of the
359/// mask, else neither are updated.
360inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
361                             unsigned &MaskLen) {
362  if (!isShiftedMask_64(Value))
363    return false;
364  MaskIdx = llvm::countr_zero(Value);
365  MaskLen = llvm::popcount(Value);
366  return true;
367}
368 
369/// Compile time Log2.
370/// Valid only for positive powers of two.
371template <size_t kValue> constexpr inline size_t CTLog2() {
372  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
373                "Value is not a valid power of 2");
374  return 1 + CTLog2<kValue / 2>();
375}
376 
377template <> constexpr inline size_t CTLog2<1>() { return 0; }
378 
379/// Return the floor log base 2 of the specified value, -1 if the value is zero.
380/// (32 bit edition.)
381/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
382inline unsigned Log2_32(uint32_t Value) {
383  return 31 - llvm::countl_zero(Value);
12
←
Returning the value 4294967295→
384}
385 
386/// Return the floor log base 2 of the specified value, -1 if the value is zero.
387/// (64 bit edition.)
388inline unsigned Log2_64(uint64_t Value) {
389  return 63 - llvm::countl_zero(Value);
390}
391 
392/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
393/// (32 bit edition).
394/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
395inline unsigned Log2_32_Ceil(uint32_t Value) {
396  return 32 - llvm::countl_zero(Value - 1);
397}
398 
399/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
400/// (64 bit edition.)
401inline unsigned Log2_64_Ceil(uint64_t Value) {
402  return 64 - llvm::countl_zero(Value - 1);
403}
404 
405/// This function takes a 64-bit integer and returns the bit equivalent double.
406LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<double>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<double>"
)))
407inline double BitsToDouble(uint64_t Bits) {
408  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
409  return llvm::bit_cast<double>(Bits);
410}
411 
412/// This function takes a 32-bit integer and returns the bit equivalent float.
413LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<float>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<float>"
)))
414inline float BitsToFloat(uint32_t Bits) {
415  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
416  return llvm::bit_cast<float>(Bits);
417}
418 
419/// This function takes a double and returns the bit equivalent 64-bit integer.
420/// Note that copying doubles around changes the bits of NaNs on some hosts,
421/// notably x86, so this routine cannot be used if these bits are needed.
422LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>"
)))
423inline uint64_t DoubleToBits(double Double) {
424  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
425  return llvm::bit_cast<uint64_t>(Double);
426}
427 
428/// This function takes a float and returns the bit equivalent 32-bit integer.
429/// Note that copying floats around changes the bits of NaNs on some hosts,
430/// notably x86, so this routine cannot be used if these bits are needed.
431LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>"
)))
432inline uint32_t FloatToBits(float Float) {
433  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
434  return llvm::bit_cast<uint32_t>(Float);
435}
436 
437/// A and B are either alignments or offsets. Return the minimum alignment that
438/// may be assumed after adding the two together.
439constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
440  // The largest power of 2 that divides both A and B.
441  //
442  // Replace "-Value" by "1+~Value" in the following commented code to avoid
443  // MSVC warning C4146
444  //    return (A | B) & -(A | B);
445  return (A | B) & (1 + ~(A | B));
446}
447 
448/// Returns the next power of two (in 64-bits) that is strictly greater than A.
449/// Returns zero on overflow.
450constexpr inline uint64_t NextPowerOf2(uint64_t A) {
451  A |= (A >> 1);
452  A |= (A >> 2);
453  A |= (A >> 4);
454  A |= (A >> 8);
455  A |= (A >> 16);
456  A |= (A >> 32);
457  return A + 1;
458}
459 
460/// Returns the power of two which is less than or equal to the given value.
461/// Essentially, it is a floor operation across the domain of powers of two.
462LLVM_DEPRECATED("use llvm::bit_floor instead", "llvm::bit_floor")__attribute__((deprecated("use llvm::bit_floor instead", "llvm::bit_floor"
)))
463inline uint64_t PowerOf2Floor(uint64_t A) {
464  return llvm::bit_floor(A);
465}
466 
467/// Returns the power of two which is greater than or equal to the given value.
468/// Essentially, it is a ceil operation across the domain of powers of two.
469inline uint64_t PowerOf2Ceil(uint64_t A) {
470  if (!A)
471    return 0;
472  return NextPowerOf2(A - 1);
473}
474 
475/// Returns the next integer (mod 2**64) that is greater than or equal to
476/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
477///
478/// Examples:
479/// \code
480///   alignTo(5, 8) = 8
481///   alignTo(17, 8) = 24
482///   alignTo(~0LL, 8) = 0
483///   alignTo(321, 255) = 510
484/// \endcode
485inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
486  assert(Align != 0u && "Align can't be 0.")(static_cast <bool> (Align != 0u && "Align can't be 0."
) ? void (0) : __assert_fail ("Align != 0u && \"Align can't be 0.\""
, "llvm/include/llvm/Support/MathExtras.h", 486, __extension__
 __PRETTY_FUNCTION__));
487  return (Value + Align - 1) / Align * Align;
488}
489 
490inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) {
491  assert(Align != 0 && (Align & (Align - 1)) == 0 &&(static_cast <bool> (Align != 0 && (Align &
 (Align - 1)) == 0 && "Align must be a power of 2") ?
 void (0) : __assert_fail ("Align != 0 && (Align & (Align - 1)) == 0 && \"Align must be a power of 2\""
, "llvm/include/llvm/Support/MathExtras.h", 492, __extension__
 __PRETTY_FUNCTION__))
492         "Align must be a power of 2")(static_cast <bool> (Align != 0 && (Align &
 (Align - 1)) == 0 && "Align must be a power of 2") ?
 void (0) : __assert_fail ("Align != 0 && (Align & (Align - 1)) == 0 && \"Align must be a power of 2\""
, "llvm/include/llvm/Support/MathExtras.h", 492, __extension__
 __PRETTY_FUNCTION__));
493  return (Value + Align - 1) & -Align;
494}
495 
496/// If non-zero \p Skew is specified, the return value will be a minimal integer
497/// that is greater than or equal to \p Size and equal to \p A * N + \p Skew for
498/// some integer N. If \p Skew is larger than \p A, its value is adjusted to '\p
499/// Skew mod \p A'. \p Align must be non-zero.
500///
501/// Examples:
502/// \code
503///   alignTo(5, 8, 7) = 7
504///   alignTo(17, 8, 1) = 17
505///   alignTo(~0LL, 8, 3) = 3
506///   alignTo(321, 255, 42) = 552
507/// \endcode
508inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) {
509  assert(Align != 0u && "Align can't be 0.")(static_cast <bool> (Align != 0u && "Align can't be 0."
) ? void (0) : __assert_fail ("Align != 0u && \"Align can't be 0.\""
, "llvm/include/llvm/Support/MathExtras.h", 509, __extension__
 __PRETTY_FUNCTION__));
510  Skew %= Align;
511  return alignTo(Value - Skew, Align) + Skew;
512}
513 
514/// Returns the next integer (mod 2**64) that is greater than or equal to
515/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
516template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
517  static_assert(Align != 0u, "Align must be non-zero");
518  return (Value + Align - 1) / Align * Align;
519}
520 
521/// Returns the integer ceil(Numerator / Denominator).
522inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
523  return alignTo(Numerator, Denominator) / Denominator;
524}
525 
526/// Returns the integer nearest(Numerator / Denominator).
527inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
528  return (Numerator + (Denominator / 2)) / Denominator;
529}
530 
531/// Returns the largest uint64_t less than or equal to \p Value and is
532/// \p Skew mod \p Align. \p Align must be non-zero
533inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
534  assert(Align != 0u && "Align can't be 0.")(static_cast <bool> (Align != 0u && "Align can't be 0."
) ? void (0) : __assert_fail ("Align != 0u && \"Align can't be 0.\""
, "llvm/include/llvm/Support/MathExtras.h", 534, __extension__
 __PRETTY_FUNCTION__));
535  Skew %= Align;
536  return (Value - Skew) / Align * Align + Skew;
537}
538 
539/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
540/// Requires 0 < B <= 32.
541template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
542  static_assert(B > 0, "Bit width can't be 0.");
543  static_assert(B <= 32, "Bit width out of range.");
544  return int32_t(X << (32 - B)) >> (32 - B);
545}
546 
547/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
548/// Requires 0 < B <= 32.
549inline int32_t SignExtend32(uint32_t X, unsigned B) {
550  assert(B > 0 && "Bit width can't be 0.")(static_cast <bool> (B > 0 && "Bit width can't be 0."
) ? void (0) : __assert_fail ("B > 0 && \"Bit width can't be 0.\""
, "llvm/include/llvm/Support/MathExtras.h", 550, __extension__
 __PRETTY_FUNCTION__));
551  assert(B <= 32 && "Bit width out of range.")(static_cast <bool> (B <= 32 && "Bit width out of range."
) ? void (0) : __assert_fail ("B <= 32 && \"Bit width out of range.\""
, "llvm/include/llvm/Support/MathExtras.h", 551, __extension__
 __PRETTY_FUNCTION__));
552  return int32_t(X << (32 - B)) >> (32 - B);
553}
554 
555/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
556/// Requires 0 < B <= 64.
557template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
558  static_assert(B > 0, "Bit width can't be 0.");
559  static_assert(B <= 64, "Bit width out of range.");
560  return int64_t(x << (64 - B)) >> (64 - B);
561}
562 
563/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
564/// Requires 0 < B <= 64.
565inline int64_t SignExtend64(uint64_t X, unsigned B) {
566  assert(B > 0 && "Bit width can't be 0.")(static_cast <bool> (B > 0 && "Bit width can't be 0."
) ? void (0) : __assert_fail ("B > 0 && \"Bit width can't be 0.\""
, "llvm/include/llvm/Support/MathExtras.h", 566, __extension__
 __PRETTY_FUNCTION__));
567  assert(B <= 64 && "Bit width out of range.")(static_cast <bool> (B <= 64 && "Bit width out of range."
) ? void (0) : __assert_fail ("B <= 64 && \"Bit width out of range.\""
, "llvm/include/llvm/Support/MathExtras.h", 567, __extension__
 __PRETTY_FUNCTION__));
568  return int64_t(X << (64 - B)) >> (64 - B);
569}
570 
571/// Subtract two unsigned integers, X and Y, of type T and return the absolute
572/// value of the result.
573template <typename T>
574std::enable_if_t<std::is_unsigned_v<T>, T> AbsoluteDifference(T X, T Y) {
575  return X > Y ? (X - Y) : (Y - X);
576}
577 
578/// Add two unsigned integers, X and Y, of type T.  Clamp the result to the
579/// maximum representable value of T on overflow.  ResultOverflowed indicates if
580/// the result is larger than the maximum representable value of type T.
581template <typename T>
582std::enable_if_t<std::is_unsigned_v<T>, T>
583SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
584  bool Dummy;
585  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
586  // Hacker's Delight, p. 29
587  T Z = X + Y;
588  Overflowed = (Z < X || Z < Y);
589  if (Overflowed)
590    return std::numeric_limits<T>::max();
591  else
592    return Z;
593}
594 
595/// Add multiple unsigned integers of type T.  Clamp the result to the
596/// maximum representable value of T on overflow.
597template <class T, class... Ts>
598std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(T X, T Y, T Z,
599                                                         Ts... Args) {
600  bool Overflowed = false;
601  T XY = SaturatingAdd(X, Y, &Overflowed);
602  if (Overflowed)
603    return SaturatingAdd(std::numeric_limits<T>::max(), T(1), Args...);
604  return SaturatingAdd(XY, Z, Args...);
605}
606 
607/// Multiply two unsigned integers, X and Y, of type T.  Clamp the result to the
608/// maximum representable value of T on overflow.  ResultOverflowed indicates if
609/// the result is larger than the maximum representable value of type T.
610template <typename T>
611std::enable_if_t<std::is_unsigned_v<T>, T>
612SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
613  bool Dummy;
614  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
615 
616  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
617  // because it fails for uint16_t (where multiplication can have undefined
618  // behavior due to promotion to int), and requires a division in addition
619  // to the multiplication.
620 
621  Overflowed = false;
622 
623  // Log2(Z) would be either Log2Z or Log2Z + 1.
624  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
625  // will necessarily be less than Log2Max as desired.
626  int Log2Z = Log2_64(X) + Log2_64(Y);
627  const T Max = std::numeric_limits<T>::max();
628  int Log2Max = Log2_64(Max);
629  if (Log2Z < Log2Max) {
630    return X * Y;
631  }
632  if (Log2Z > Log2Max) {
633    Overflowed = true;
634    return Max;
635  }
636 
637  // We're going to use the top bit, and maybe overflow one
638  // bit past it. Multiply all but the bottom bit then add
639  // that on at the end.
640  T Z = (X >> 1) * Y;
641  if (Z & ~(Max >> 1)) {
642    Overflowed = true;
643    return Max;
644  }
645  Z <<= 1;
646  if (X & 1)
647    return SaturatingAdd(Z, Y, ResultOverflowed);
648 
649  return Z;
650}
651 
652/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
653/// the product. Clamp the result to the maximum representable value of T on
654/// overflow. ResultOverflowed indicates if the result is larger than the
655/// maximum representable value of type T.
656template <typename T>
657std::enable_if_t<std::is_unsigned_v<T>, T>
658SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
659  bool Dummy;
660  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
661 
662  T Product = SaturatingMultiply(X, Y, &Overflowed);
663  if (Overflowed)
664    return Product;
665 
666  return SaturatingAdd(A, Product, &Overflowed);
667}
668 
669/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
670extern const float huge_valf;
671 
672 
673/// Add two signed integers, computing the two's complement truncated result,
674/// returning true if overflow occurred.
675template <typename T>
676std::enable_if_t<std::is_signed_v<T>, T> AddOverflow(T X, T Y, T &Result) {
677#if __has_builtin(__builtin_add_overflow)1
678  return __builtin_add_overflow(X, Y, &Result);
679#else
680  // Perform the unsigned addition.
681  using U = std::make_unsigned_t<T>;
682  const U UX = static_cast<U>(X);
683  const U UY = static_cast<U>(Y);
684  const U UResult = UX + UY;
685 
686  // Convert to signed.
687  Result = static_cast<T>(UResult);
688 
689  // Adding two positive numbers should result in a positive number.
690  if (X > 0 && Y > 0)
691    return Result <= 0;
692  // Adding two negatives should result in a negative number.
693  if (X < 0 && Y < 0)
694    return Result >= 0;
695  return false;
696#endif
697}
698 
699/// Subtract two signed integers, computing the two's complement truncated
700/// result, returning true if an overflow ocurred.
701template <typename T>
702std::enable_if_t<std::is_signed_v<T>, T> SubOverflow(T X, T Y, T &Result) {
703#if __has_builtin(__builtin_sub_overflow)1
704  return __builtin_sub_overflow(X, Y, &Result);
705#else
706  // Perform the unsigned addition.
707  using U = std::make_unsigned_t<T>;
708  const U UX = static_cast<U>(X);
709  const U UY = static_cast<U>(Y);
710  const U UResult = UX - UY;
711 
712  // Convert to signed.
713  Result = static_cast<T>(UResult);
714 
715  // Subtracting a positive number from a negative results in a negative number.
716  if (X <= 0 && Y > 0)
717    return Result >= 0;
718  // Subtracting a negative number from a positive results in a positive number.
719  if (X >= 0 && Y < 0)
720    return Result <= 0;
721  return false;
722#endif
723}
724 
725/// Multiply two signed integers, computing the two's complement truncated
726/// result, returning true if an overflow ocurred.
727template <typename T>
728std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
729  // Perform the unsigned multiplication on absolute values.
730  using U = std::make_unsigned_t<T>;
731  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
732  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
733  const U UResult = UX * UY;
734 
735  // Convert to signed.
736  const bool IsNegative = (X < 0) ^ (Y < 0);
737  Result = IsNegative ? (0 - UResult) : UResult;
738 
739  // If any of the args was 0, result is 0 and no overflow occurs.
740  if (UX == 0 || UY == 0)
741    return false;
742 
743  // UX and UY are in [1, 2^n], where n is the number of digits.
744  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
745  // positive) divided by an argument compares to the other.
746  if (IsNegative)
747    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
748  else
749    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
750}
751 
752} // End llvm namespace
753 
754#endif