LLVM  13.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "ARMSubtarget.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
58 
60 
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.getNumArgOperands() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnesValue()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
253  assert(Ty->isIntegerTy());
254 
255  unsigned Bits = Ty->getPrimitiveSizeInBits();
256  if (Bits == 0 || Imm.getActiveBits() >= 64)
257  return 4;
258 
259  int64_t SImmVal = Imm.getSExtValue();
260  uint64_t ZImmVal = Imm.getZExtValue();
261  if (!ST->isThumb()) {
262  if ((SImmVal >= 0 && SImmVal < 65536) ||
263  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265  return 1;
266  return ST->hasV6T2Ops() ? 2 : 3;
267  }
268  if (ST->isThumb2()) {
269  if ((SImmVal >= 0 && SImmVal < 65536) ||
270  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272  return 1;
273  return ST->hasV6T2Ops() ? 2 : 3;
274  }
275  // Thumb1, any i8 imm cost 1.
276  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277  return 1;
278  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279  return 2;
280  // Load from constantpool.
281  return 3;
282 }
283 
284 // Constants smaller than 256 fit in the immediate field of
285 // Thumb1 instructions so we return a zero cost and 1 otherwise.
286 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287  const APInt &Imm, Type *Ty) {
288  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289  return 0;
290 
291  return 1;
292 }
293 
294 // Checks whether Inst is part of a min(max()) or max(min()) pattern
295 // that will match to an SSAT instruction
296 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297  Value *LHS, *RHS;
298  ConstantInt *C;
299  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300 
301  if (InstSPF == SPF_SMAX &&
303  C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304 
305  auto isSSatMin = [&](Value *MinInst) {
306  if (isa<SelectInst>(MinInst)) {
307  Value *MinLHS, *MinRHS;
308  ConstantInt *MinC;
309  SelectPatternFlavor MinSPF =
310  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311  if (MinSPF == SPF_SMIN &&
313  MinC->getValue() == ((-Imm) - 1))
314  return true;
315  }
316  return false;
317  };
318 
319  if (isSSatMin(Inst->getOperand(1)) ||
320  (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321  isSSatMin(*(++Inst->user_begin())))))
322  return true;
323  }
324  return false;
325 }
326 
327 int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328  const APInt &Imm, Type *Ty,
330  Instruction *Inst) {
331  // Division by a constant can be turned into multiplication, but only if we
332  // know it's constant. So it's not so much that the immediate is cheap (it's
333  // not), but that the alternative is worse.
334  // FIXME: this is probably unneeded with GlobalISel.
335  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337  Idx == 1)
338  return 0;
339 
340  if (Opcode == Instruction::And) {
341  // UXTB/UXTH
342  if (Imm == 255 || Imm == 65535)
343  return 0;
344  // Conversion to BIC is free, and means we can use ~Imm instead.
345  return std::min(getIntImmCost(Imm, Ty, CostKind),
346  getIntImmCost(~Imm, Ty, CostKind));
347  }
348 
349  if (Opcode == Instruction::Add)
350  // Conversion to SUB is free, and means we can use -Imm instead.
351  return std::min(getIntImmCost(Imm, Ty, CostKind),
352  getIntImmCost(-Imm, Ty, CostKind));
353 
354  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
355  Ty->getIntegerBitWidth() == 32) {
356  int64_t NegImm = -Imm.getSExtValue();
357  if (ST->isThumb2() && NegImm < 1<<12)
358  // icmp X, #-C -> cmn X, #C
359  return 0;
360  if (ST->isThumb() && NegImm < 1<<8)
361  // icmp X, #-C -> adds X, #C
362  return 0;
363  }
364 
365  // xor a, -1 can always be folded to MVN
366  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
367  return 0;
368 
369  // Ensures negative constant of min(max()) or max(min()) patterns that
370  // match to SSAT instructions don't get hoisted
371  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
372  Ty->getIntegerBitWidth() <= 32) {
373  if (isSSATMinMaxPattern(Inst, Imm) ||
374  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
375  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
376  return 0;
377  }
378 
379  return getIntImmCost(Imm, Ty, CostKind);
380 }
381 
384  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
385  // FIXME: The vectorizer is highly sensistive to the cost of these
386  // instructions, which suggests that it may be using the costs incorrectly.
387  // But, for now, just make them free to avoid performance regressions for
388  // vector targets.
389  return 0;
390  }
391  return BaseT::getCFInstrCost(Opcode, CostKind);
392 }
393 
394 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
397  const Instruction *I) {
398  int ISD = TLI->InstructionOpcodeToISD(Opcode);
399  assert(ISD && "Invalid opcode");
400 
401  // TODO: Allow non-throughput costs that aren't binary.
402  auto AdjustCost = [&CostKind](int Cost) {
404  return Cost == 0 ? 0 : 1;
405  return Cost;
406  };
407  auto IsLegalFPType = [this](EVT VT) {
408  EVT EltVT = VT.getScalarType();
409  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
410  (EltVT == MVT::f64 && ST->hasFP64()) ||
411  (EltVT == MVT::f16 && ST->hasFullFP16());
412  };
413 
414  EVT SrcTy = TLI->getValueType(DL, Src);
415  EVT DstTy = TLI->getValueType(DL, Dst);
416 
417  if (!SrcTy.isSimple() || !DstTy.isSimple())
418  return AdjustCost(
419  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
420 
421  // Extending masked load/Truncating masked stores is expensive because we
422  // currently don't split them. This means that we'll likely end up
423  // loading/storing each element individually (hence the high cost).
424  if ((ST->hasMVEIntegerOps() &&
425  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
426  Opcode == Instruction::SExt)) ||
427  (ST->hasMVEFloatOps() &&
428  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
429  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
430  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
431  return 2 * DstTy.getVectorNumElements() *
433 
434  // The extend of other kinds of load is free
435  if (CCH == TTI::CastContextHint::Normal ||
437  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
450  };
451  if (const auto *Entry = ConvertCostTableLookup(
452  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
453  return AdjustCost(Entry->Cost);
454 
455  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
462  // The following extend from a legal type to an illegal type, so need to
463  // split the load. This introduced an extra load operation, but the
464  // extend is still "free".
471  };
472  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
473  if (const auto *Entry =
474  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
475  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
476  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
477  }
478 
479  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
480  // FPExtends are similar but also require the VCVT instructions.
483  };
484  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
485  if (const auto *Entry =
486  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
487  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
488  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
489  }
490 
491  // The truncate of a store is free. This is the mirror of extends above.
492  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
500  };
501  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
502  if (const auto *Entry =
503  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
504  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
505  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
506  }
507 
508  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
511  };
512  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
513  if (const auto *Entry =
514  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
515  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
516  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
517  }
518  }
519 
520  // NEON vector operations that can extend their inputs.
521  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
522  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
523  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
524  // vaddl
525  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
526  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
527  // vsubl
528  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
529  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
530  // vmull
531  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
532  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
533  // vshll
534  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
535  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
536  };
537 
538  auto *User = cast<Instruction>(*I->user_begin());
539  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
540  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
541  DstTy.getSimpleVT(),
542  SrcTy.getSimpleVT())) {
543  return AdjustCost(Entry->Cost);
544  }
545  }
546 
547  // Single to/from double precision conversions.
548  if (Src->isVectorTy() && ST->hasNEON() &&
549  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
550  DstTy.getScalarType() == MVT::f32) ||
551  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
552  DstTy.getScalarType() == MVT::f64))) {
553  static const CostTblEntry NEONFltDblTbl[] = {
554  // Vector fptrunc/fpext conversions.
557  {ISD::FP_EXTEND, MVT::v4f32, 4}};
558 
559  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
560  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
561  return AdjustCost(LT.first * Entry->Cost);
562  }
563 
564  // Some arithmetic, load and store operations have specific instructions
565  // to cast up/down their types automatically at no extra cost.
566  // TODO: Get these tables to know at least what the related operations are.
567  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
574 
575  // The number of vmovl instructions for the extension.
594 
595  // Operations that we legalize using splitting.
598 
599  // Vector float <-> i32 conversions.
602 
623 
630 
631  // Vector double <-> i32 conversions.
634 
641 
648  };
649 
650  if (SrcTy.isVector() && ST->hasNEON()) {
651  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
652  DstTy.getSimpleVT(),
653  SrcTy.getSimpleVT()))
654  return AdjustCost(Entry->Cost);
655  }
656 
657  // Scalar float to integer conversions.
658  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
679  };
680  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
681  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
682  DstTy.getSimpleVT(),
683  SrcTy.getSimpleVT()))
684  return AdjustCost(Entry->Cost);
685  }
686 
687  // Scalar integer to float conversions.
688  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
709  };
710 
711  if (SrcTy.isInteger() && ST->hasNEON()) {
712  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
713  ISD, DstTy.getSimpleVT(),
714  SrcTy.getSimpleVT()))
715  return AdjustCost(Entry->Cost);
716  }
717 
718  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
719  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
720  // are linearised so take more.
721  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
734  };
735 
736  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
737  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
738  ISD, DstTy.getSimpleVT(),
739  SrcTy.getSimpleVT()))
740  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
741  }
742 
743  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
744  // As general rule, fp converts that were not matched above are scalarized
745  // and cost 1 vcvt for each lane, so long as the instruction is available.
746  // If not it will become a series of function calls.
747  const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind);
748  int Lanes = 1;
749  if (SrcTy.isFixedLengthVector())
750  Lanes = SrcTy.getVectorNumElements();
751 
752  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
753  return Lanes;
754  else
755  return Lanes * CallCost;
756  }
757 
758  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
759  SrcTy.isFixedLengthVector()) {
760  // Treat a truncate with larger than legal source (128bits for MVE) as
761  // expensive, 2 instructions per lane.
762  if ((SrcTy.getScalarType() == MVT::i8 ||
763  SrcTy.getScalarType() == MVT::i16 ||
764  SrcTy.getScalarType() == MVT::i32) &&
765  SrcTy.getSizeInBits() > 128 &&
766  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
767  return SrcTy.getVectorNumElements() * 2;
768  }
769 
770  // Scalar integer conversion costs.
771  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
772  // i16 -> i64 requires two dependent operations.
774 
775  // Truncates on i64 are assumed to be free.
778  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
780  };
781 
782  if (SrcTy.isInteger()) {
783  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
784  DstTy.getSimpleVT(),
785  SrcTy.getSimpleVT()))
786  return AdjustCost(Entry->Cost);
787  }
788 
789  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
791  : 1;
792  return AdjustCost(
793  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
794 }
795 
796 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
797  unsigned Index) {
798  // Penalize inserting into an D-subregister. We end up with a three times
799  // lower estimated throughput on swift.
800  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
801  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
802  return 3;
803 
804  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
805  Opcode == Instruction::ExtractElement)) {
806  // Cross-class copies are expensive on many microarchitectures,
807  // so assume they are expensive by default.
808  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
809  return 3;
810 
811  // Even if it's not a cross class copy, this likely leads to mixing
812  // of NEON and VFP code and should be therefore penalized.
813  if (ValTy->isVectorTy() &&
814  ValTy->getScalarSizeInBits() <= 32)
815  return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
816  }
817 
818  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
819  Opcode == Instruction::ExtractElement)) {
820  // We say MVE moves costs at least the MVEVectorCostFactor, even though
821  // they are scalar instructions. This helps prevent mixing scalar and
822  // vector, to prevent vectorising where we end up just scalarising the
823  // result anyway.
824  return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
826  cast<FixedVectorType>(ValTy)->getNumElements() / 2;
827  }
828 
829  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
830 }
831 
832 int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
833  CmpInst::Predicate VecPred,
835  const Instruction *I) {
836  int ISD = TLI->InstructionOpcodeToISD(Opcode);
837 
838  // Thumb scalar code size cost for select.
839  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
840  ST->isThumb() && !ValTy->isVectorTy()) {
841  // Assume expensive structs.
842  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
843  return TTI::TCC_Expensive;
844 
845  // Select costs can vary because they:
846  // - may require one or more conditional mov (including an IT),
847  // - can't operate directly on immediates,
848  // - require live flags, which we can't copy around easily.
849  int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
850 
851  // Possible IT instruction for Thumb2, or more for Thumb1.
852  ++Cost;
853 
854  // i1 values may need rematerialising by using mov immediates and/or
855  // flag setting instructions.
856  if (ValTy->isIntegerTy(1))
857  ++Cost;
858 
859  return Cost;
860  }
861 
862  // If this is a vector min/max/abs, use the cost of that intrinsic directly
863  // instead. Hopefully when min/max intrinsics are more prevalent this code
864  // will not be needed.
865  const Instruction *Sel = I;
866  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
867  Sel->hasOneUse())
868  Sel = cast<Instruction>(Sel->user_back());
869  if (Sel && ValTy->isVectorTy() &&
870  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
871  const Value *LHS, *RHS;
872  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
873  unsigned IID = 0;
874  switch (SPF) {
875  case SPF_ABS:
876  IID = Intrinsic::abs;
877  break;
878  case SPF_SMIN:
879  IID = Intrinsic::smin;
880  break;
881  case SPF_SMAX:
882  IID = Intrinsic::smax;
883  break;
884  case SPF_UMIN:
885  IID = Intrinsic::umin;
886  break;
887  case SPF_UMAX:
888  IID = Intrinsic::umax;
889  break;
890  case SPF_FMINNUM:
891  IID = Intrinsic::minnum;
892  break;
893  case SPF_FMAXNUM:
894  IID = Intrinsic::maxnum;
895  break;
896  default:
897  break;
898  }
899  if (IID) {
900  // The ICmp is free, the select gets the cost of the min/max/etc
901  if (Sel != I)
902  return 0;
903  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
904  return getIntrinsicInstrCost(CostAttrs, CostKind);
905  }
906  }
907 
908  // On NEON a vector select gets lowered to vbsl.
909  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
910  // Lowering of some vector selects is currently far from perfect.
911  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
912  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
913  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
915  };
916 
917  EVT SelCondTy = TLI->getValueType(DL, CondTy);
918  EVT SelValTy = TLI->getValueType(DL, ValTy);
919  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
920  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
921  SelCondTy.getSimpleVT(),
922  SelValTy.getSimpleVT()))
923  return Entry->Cost;
924  }
925 
926  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
927  return LT.first;
928  }
929 
930  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
931  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
932  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
933  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
934  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
935  if (!VecCondTy)
936  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
937 
938  // If we don't have mve.fp any fp operations will need to be scalarized.
939  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
940  // One scalaization insert, one scalarization extract and the cost of the
941  // fcmps.
942  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
943  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
944  VecValTy->getNumElements() *
945  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
946  VecCondTy->getScalarType(), VecPred, CostKind,
947  I);
948  }
949 
950  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
951  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
952  // There are two types - the input that specifies the type of the compare
953  // and the output vXi1 type. Because we don't know how the output will be
954  // split, we may need an expensive shuffle to get two in sync. This has the
955  // effect of making larger than legal compares (v8i32 for example)
956  // expensive.
957  if (LT.second.getVectorNumElements() > 2) {
958  if (LT.first > 1)
959  return LT.first * BaseCost +
960  BaseT::getScalarizationOverhead(VecCondTy, true, false);
961  return BaseCost;
962  }
963  }
964 
965  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
966  // for "multiple beats" potentially needed by MVE instructions.
967  int BaseCost = 1;
968  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
969  BaseCost = ST->getMVEVectorCostFactor(CostKind);
970 
971  return BaseCost *
972  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
973 }
974 
976  const SCEV *Ptr) {
977  // Address computations in vectorized code with non-consecutive addresses will
978  // likely result in more instructions compared to scalar code where the
979  // computation can more often be merged into the index mode. The resulting
980  // extra micro-ops can significantly decrease throughput.
981  unsigned NumVectorInstToHideOverhead = 10;
982  int MaxMergeDistance = 64;
983 
984  if (ST->hasNEON()) {
985  if (Ty->isVectorTy() && SE &&
986  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
987  return NumVectorInstToHideOverhead;
988 
989  // In many cases the address computation is not merged into the instruction
990  // addressing mode.
991  return 1;
992  }
993  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
994 }
995 
997  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
998  // If a VCTP is part of a chain, it's already profitable and shouldn't be
999  // optimized, else LSR may block tail-predication.
1000  switch (II->getIntrinsicID()) {
1001  case Intrinsic::arm_mve_vctp8:
1002  case Intrinsic::arm_mve_vctp16:
1003  case Intrinsic::arm_mve_vctp32:
1004  case Intrinsic::arm_mve_vctp64:
1005  return true;
1006  default:
1007  break;
1008  }
1009  }
1010  return false;
1011 }
1012 
1013 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1015  return false;
1016 
1017  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1018  // Don't support v2i1 yet.
1019  if (VecTy->getNumElements() == 2)
1020  return false;
1021 
1022  // We don't support extending fp types.
1023  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1024  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1025  return false;
1026  }
1027 
1028  unsigned EltWidth = DataTy->getScalarSizeInBits();
1029  return (EltWidth == 32 && Alignment >= 4) ||
1030  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1031 }
1032 
1035  return false;
1036 
1037  // This method is called in 2 places:
1038  // - from the vectorizer with a scalar type, in which case we need to get
1039  // this as good as we can with the limited info we have (and rely on the cost
1040  // model for the rest).
1041  // - from the masked intrinsic lowering pass with the actual vector type.
1042  // For MVE, we have a custom lowering pass that will already have custom
1043  // legalised any gathers that we can to MVE intrinsics, and want to expand all
1044  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1045  // are here, we know we want to expand.
1046  if (isa<VectorType>(Ty))
1047  return false;
1048 
1049  unsigned EltWidth = Ty->getScalarSizeInBits();
1050  return ((EltWidth == 32 && Alignment >= 4) ||
1051  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1052 }
1053 
1054 /// Given a memcpy/memset/memmove instruction, return the number of memory
1055 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1056 /// call is used.
1058  MemOp MOp;
1059  unsigned DstAddrSpace = ~0u;
1060  unsigned SrcAddrSpace = ~0u;
1061  const Function *F = I->getParent()->getParent();
1062 
1063  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1064  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1065  // If 'size' is not a constant, a library call will be generated.
1066  if (!C)
1067  return -1;
1068 
1069  const unsigned Size = C->getValue().getZExtValue();
1070  const Align DstAlign = *MC->getDestAlign();
1071  const Align SrcAlign = *MC->getSourceAlign();
1072 
1073  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1074  /*IsVolatile*/ false);
1075  DstAddrSpace = MC->getDestAddressSpace();
1076  SrcAddrSpace = MC->getSourceAddressSpace();
1077  }
1078  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1079  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1080  // If 'size' is not a constant, a library call will be generated.
1081  if (!C)
1082  return -1;
1083 
1084  const unsigned Size = C->getValue().getZExtValue();
1085  const Align DstAlign = *MS->getDestAlign();
1086 
1087  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1088  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1089  DstAddrSpace = MS->getDestAddressSpace();
1090  }
1091  else
1092  llvm_unreachable("Expected a memcpy/move or memset!");
1093 
1094  unsigned Limit, Factor = 2;
1095  switch(I->getIntrinsicID()) {
1096  case Intrinsic::memcpy:
1097  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1098  break;
1099  case Intrinsic::memmove:
1100  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1101  break;
1102  case Intrinsic::memset:
1103  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1104  Factor = 1;
1105  break;
1106  default:
1107  llvm_unreachable("Expected a memcpy/move or memset!");
1108  }
1109 
1110  // MemOps will be poplulated with a list of data types that needs to be
1111  // loaded and stored. That's why we multiply the number of elements by 2 to
1112  // get the cost for this memcpy.
1113  std::vector<EVT> MemOps;
1114  if (getTLI()->findOptimalMemOpLowering(
1115  MemOps, Limit, MOp, DstAddrSpace,
1116  SrcAddrSpace, F->getAttributes()))
1117  return MemOps.size() * Factor;
1118 
1119  // If we can't find an optimal memop lowering, return the default cost
1120  return -1;
1121 }
1122 
1124  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1125 
1126  // To model the cost of a library call, we assume 1 for the call, and
1127  // 3 for the argument setup.
1128  if (NumOps == -1)
1129  return 4;
1130  return NumOps;
1131 }
1132 
1134  int Index, VectorType *SubTp) {
1135  if (ST->hasNEON()) {
1136  if (Kind == TTI::SK_Broadcast) {
1137  static const CostTblEntry NEONDupTbl[] = {
1138  // VDUP handles these cases.
1145 
1150 
1151  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1152 
1153  if (const auto *Entry =
1154  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1155  return LT.first * Entry->Cost;
1156  }
1157  if (Kind == TTI::SK_Reverse) {
1158  static const CostTblEntry NEONShuffleTbl[] = {
1159  // Reverse shuffle cost one instruction if we are shuffling within a
1160  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1167 
1172 
1173  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1174 
1175  if (const auto *Entry =
1176  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1177  return LT.first * Entry->Cost;
1178  }
1179  if (Kind == TTI::SK_Select) {
1180  static const CostTblEntry NEONSelShuffleTbl[] = {
1181  // Select shuffle cost table for ARM. Cost is the number of
1182  // instructions
1183  // required to create the shuffled vector.
1184 
1189 
1193 
1195 
1197 
1198  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1199  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1200  ISD::VECTOR_SHUFFLE, LT.second))
1201  return LT.first * Entry->Cost;
1202  }
1203  }
1204  if (ST->hasMVEIntegerOps()) {
1205  if (Kind == TTI::SK_Broadcast) {
1206  static const CostTblEntry MVEDupTbl[] = {
1207  // VDUP handles these cases.
1213 
1214  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1215 
1216  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1217  LT.second))
1218  return LT.first * Entry->Cost *
1220  }
1221  }
1222  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1224  : 1;
1225  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1226 }
1227 
1228 int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
1230  TTI::OperandValueKind Op1Info,
1231  TTI::OperandValueKind Op2Info,
1232  TTI::OperandValueProperties Opd1PropInfo,
1233  TTI::OperandValueProperties Opd2PropInfo,
1235  const Instruction *CxtI) {
1236  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1237  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1238  // Make operations on i1 relatively expensive as this often involves
1239  // combining predicates. AND and XOR should be easier to handle with IT
1240  // blocks.
1241  switch (ISDOpcode) {
1242  default:
1243  break;
1244  case ISD::AND:
1245  case ISD::XOR:
1246  return 2;
1247  case ISD::OR:
1248  return 3;
1249  }
1250  }
1251 
1252  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1253 
1254  if (ST->hasNEON()) {
1255  const unsigned FunctionCallDivCost = 20;
1256  const unsigned ReciprocalDivCost = 10;
1257  static const CostTblEntry CostTbl[] = {
1258  // Division.
1259  // These costs are somewhat random. Choose a cost of 20 to indicate that
1260  // vectorizing devision (added function call) is going to be very expensive.
1261  // Double registers types.
1262  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1263  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1264  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1265  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1266  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1267  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1268  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1269  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1270  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1271  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1272  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1273  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1274  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1275  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1276  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1277  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1278  // Quad register types.
1279  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1280  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1281  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1282  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1283  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1284  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1285  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1286  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1287  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1288  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1289  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1290  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1291  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1292  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1293  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1294  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1295  // Multiplication.
1296  };
1297 
1298  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1299  return LT.first * Entry->Cost;
1300 
1301  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
1302  Op2Info,
1303  Opd1PropInfo, Opd2PropInfo);
1304 
1305  // This is somewhat of a hack. The problem that we are facing is that SROA
1306  // creates a sequence of shift, and, or instructions to construct values.
1307  // These sequences are recognized by the ISel and have zero-cost. Not so for
1308  // the vectorized code. Because we have support for v2i64 but not i64 those
1309  // sequences look particularly beneficial to vectorize.
1310  // To work around this we increase the cost of v2i64 operations to make them
1311  // seem less beneficial.
1312  if (LT.second == MVT::v2i64 &&
1314  Cost += 4;
1315 
1316  return Cost;
1317  }
1318 
1319  // If this operation is a shift on arm/thumb2, it might well be folded into
1320  // the following instruction, hence having a cost of 0.
1321  auto LooksLikeAFreeShift = [&]() {
1322  if (ST->isThumb1Only() || Ty->isVectorTy())
1323  return false;
1324 
1325  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1326  return false;
1328  return false;
1329 
1330  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1331  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1332  case Instruction::Add:
1333  case Instruction::Sub:
1334  case Instruction::And:
1335  case Instruction::Xor:
1336  case Instruction::Or:
1337  case Instruction::ICmp:
1338  return true;
1339  default:
1340  return false;
1341  }
1342  };
1343  if (LooksLikeAFreeShift())
1344  return 0;
1345 
1346  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1347  // for "multiple beats" potentially needed by MVE instructions.
1348  int BaseCost = 1;
1349  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1350  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1351 
1352  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1353  // without treating floats as more expensive that scalars or increasing the
1354  // costs for custom operations. The results is also multiplied by the
1355  // MVEVectorCostFactor where appropriate.
1356  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1357  return LT.first * BaseCost;
1358 
1359  // Else this is expand, assume that we need to scalarize this op.
1360  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1361  unsigned Num = VTy->getNumElements();
1362  unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
1363  CostKind);
1364  // Return the cost of multiple scalar invocation plus the cost of
1365  // inserting and extracting the values.
1366  SmallVector<Type *> Tys(Args.size(), Ty);
1367  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1368  }
1369 
1370  return BaseCost;
1371 }
1372 
1373 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1374  MaybeAlign Alignment, unsigned AddressSpace,
1376  const Instruction *I) {
1377  // TODO: Handle other cost kinds.
1379  return 1;
1380 
1381  // Type legalization can't handle structs
1382  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1383  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1384  CostKind);
1385 
1386  if (ST->hasNEON() && Src->isVectorTy() &&
1387  (Alignment && *Alignment != Align(16)) &&
1388  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1389  // Unaligned loads/stores are extremely inefficient.
1390  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1391  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1392  return LT.first * 4;
1393  }
1394 
1395  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1396  // Same for stores.
1397  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1398  ((Opcode == Instruction::Load && I->hasOneUse() &&
1399  isa<FPExtInst>(*I->user_begin())) ||
1400  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1401  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1402  Type *DstTy =
1403  Opcode == Instruction::Load
1404  ? (*I->user_begin())->getType()
1405  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1406  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1407  DstTy->getScalarType()->isFloatTy())
1408  return ST->getMVEVectorCostFactor(CostKind);
1409  }
1410 
1411  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1413  : 1;
1414  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1415  CostKind, I);
1416 }
1417 
1418 unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1419  Align Alignment,
1420  unsigned AddressSpace,
1422  if (ST->hasMVEIntegerOps()) {
1423  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1424  return ST->getMVEVectorCostFactor(CostKind);
1425  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1426  return ST->getMVEVectorCostFactor(CostKind);
1427  }
1428  if (!isa<FixedVectorType>(Src))
1429  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1430  CostKind);
1431  // Scalar cost, which is currently very high due to the efficiency of the
1432  // generated code.
1433  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1434 }
1435 
1437  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1438  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1439  bool UseMaskForCond, bool UseMaskForGaps) {
1440  assert(Factor >= 2 && "Invalid interleave factor");
1441  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1442 
1443  // vldN/vstN doesn't support vector types of i64/f64 element.
1444  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1445 
1446  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1447  !UseMaskForCond && !UseMaskForGaps) {
1448  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1449  auto *SubVecTy =
1450  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1451 
1452  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1453  // Accesses having vector types that are a multiple of 128 bits can be
1454  // matched to more than one vldN/vstN instruction.
1455  int BaseCost =
1457  if (NumElts % Factor == 0 &&
1458  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1459  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1460 
1461  // Some smaller than legal interleaved patterns are cheap as we can make
1462  // use of the vmovn or vrev patterns to interleave a standard load. This is
1463  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1464  // promoted differently). The cost of 2 here is then a load and vrev or
1465  // vmovn.
1466  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1467  VecTy->isIntOrIntVectorTy() &&
1468  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1469  return 2 * BaseCost;
1470  }
1471 
1472  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1473  Alignment, AddressSpace, CostKind,
1474  UseMaskForCond, UseMaskForGaps);
1475 }
1476 
1477 unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1478  const Value *Ptr, bool VariableMask,
1479  Align Alignment,
1481  const Instruction *I) {
1482  using namespace PatternMatch;
1483  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1484  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1485  Alignment, CostKind, I);
1486 
1487  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1488  auto *VTy = cast<FixedVectorType>(DataTy);
1489 
1490  // TODO: Splitting, once we do that.
1491 
1492  unsigned NumElems = VTy->getNumElements();
1493  unsigned EltSize = VTy->getScalarSizeInBits();
1494  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1495 
1496  // For now, it is assumed that for the MVE gather instructions the loads are
1497  // all effectively serialised. This means the cost is the scalar cost
1498  // multiplied by the number of elements being loaded. This is possibly very
1499  // conservative, but even so we still end up vectorising loops because the
1500  // cost per iteration for many loops is lower than for scalar loops.
1501  unsigned VectorCost =
1502  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1503  // The scalarization cost should be a lot higher. We use the number of vector
1504  // elements plus the scalarization overhead.
1505  unsigned ScalarCost = NumElems * LT.first +
1506  BaseT::getScalarizationOverhead(VTy, true, false) +
1507  BaseT::getScalarizationOverhead(VTy, false, true);
1508 
1509  if (EltSize < 8 || Alignment < EltSize / 8)
1510  return ScalarCost;
1511 
1512  unsigned ExtSize = EltSize;
1513  // Check whether there's a single user that asks for an extended type
1514  if (I != nullptr) {
1515  // Dependent of the caller of this function, a gather instruction will
1516  // either have opcode Instruction::Load or be a call to the masked_gather
1517  // intrinsic
1518  if ((I->getOpcode() == Instruction::Load ||
1519  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1520  I->hasOneUse()) {
1521  const User *Us = *I->users().begin();
1522  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1523  // only allow valid type combinations
1524  unsigned TypeSize =
1525  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1526  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1527  (TypeSize == 16 && EltSize == 8)) &&
1528  TypeSize * NumElems == 128) {
1529  ExtSize = TypeSize;
1530  }
1531  }
1532  }
1533  // Check whether the input data needs to be truncated
1534  TruncInst *T;
1535  if ((I->getOpcode() == Instruction::Store ||
1536  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1537  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1538  // Only allow valid type combinations
1539  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1540  if (((EltSize == 16 && TypeSize == 32) ||
1541  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1542  TypeSize * NumElems == 128)
1543  ExtSize = TypeSize;
1544  }
1545  }
1546 
1547  if (ExtSize * NumElems != 128 || NumElems < 4)
1548  return ScalarCost;
1549 
1550  // Any (aligned) i32 gather will not need to be scalarised.
1551  if (ExtSize == 32)
1552  return VectorCost;
1553  // For smaller types, we need to ensure that the gep's inputs are correctly
1554  // extended from a small enough value. Other sizes (including i64) are
1555  // scalarized for now.
1556  if (ExtSize != 8 && ExtSize != 16)
1557  return ScalarCost;
1558 
1559  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1560  Ptr = BC->getOperand(0);
1561  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1562  if (GEP->getNumOperands() != 2)
1563  return ScalarCost;
1564  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1565  // Scale needs to be correct (which is only relevant for i16s).
1566  if (Scale != 1 && Scale * 8 != ExtSize)
1567  return ScalarCost;
1568  // And we need to zext (not sext) the indexes from a small enough type.
1569  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1570  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1571  return VectorCost;
1572  }
1573  return ScalarCost;
1574  }
1575  return ScalarCost;
1576 }
1577 
1579  bool IsPairwiseForm,
1581  EVT ValVT = TLI->getValueType(DL, ValTy);
1582  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1583  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1584  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1585  CostKind);
1586 
1587  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1588 
1589  static const CostTblEntry CostTblAdd[]{
1590  {ISD::ADD, MVT::v16i8, 1},
1591  {ISD::ADD, MVT::v8i16, 1},
1592  {ISD::ADD, MVT::v4i32, 1},
1593  };
1594  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1595  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1596 
1597  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1598  CostKind);
1599 }
1600 
1602 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1603  Type *ResTy, VectorType *ValTy,
1605  EVT ValVT = TLI->getValueType(DL, ValTy);
1606  EVT ResVT = TLI->getValueType(DL, ResTy);
1607  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1608  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1609  if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
1610  (LT.second == MVT::v8i16 &&
1611  ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
1612  (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
1613  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1614  }
1615 
1616  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1617  CostKind);
1618 }
1619 
1622  switch (ICA.getID()) {
1623  case Intrinsic::get_active_lane_mask:
1624  // Currently we make a somewhat optimistic assumption that
1625  // active_lane_mask's are always free. In reality it may be freely folded
1626  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1627  // of add/icmp code. We may need to improve this in the future, but being
1628  // able to detect if it is free or not involves looking at a lot of other
1629  // code. We currently assume that the vectorizer inserted these, and knew
1630  // what it was doing in adding one.
1631  if (ST->hasMVEIntegerOps())
1632  return 0;
1633  break;
1634  case Intrinsic::sadd_sat:
1635  case Intrinsic::ssub_sat:
1636  case Intrinsic::uadd_sat:
1637  case Intrinsic::usub_sat: {
1638  if (!ST->hasMVEIntegerOps())
1639  break;
1640  Type *VT = ICA.getReturnType();
1641 
1642  std::pair<int, MVT> LT =
1643  TLI->getTypeLegalizationCost(DL, VT);
1644  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1645  LT.second == MVT::v16i8) {
1646  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1647  // need to extend the type, as it uses shr(qadd(shl, shl)).
1648  unsigned Instrs =
1649  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1650  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1651  }
1652  break;
1653  }
1654  case Intrinsic::abs:
1655  case Intrinsic::smin:
1656  case Intrinsic::smax:
1657  case Intrinsic::umin:
1658  case Intrinsic::umax: {
1659  if (!ST->hasMVEIntegerOps())
1660  break;
1661  Type *VT = ICA.getReturnType();
1662 
1663  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1664  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1665  LT.second == MVT::v16i8)
1666  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1667  break;
1668  }
1669  case Intrinsic::minnum:
1670  case Intrinsic::maxnum: {
1671  if (!ST->hasMVEFloatOps())
1672  break;
1673  Type *VT = ICA.getReturnType();
1674  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1675  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1676  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1677  break;
1678  }
1679  }
1680 
1682 }
1683 
1685  if (!F->isIntrinsic())
1687 
1688  // Assume all Arm-specific intrinsics map to an instruction.
1689  if (F->getName().startswith("llvm.arm"))
1690  return false;
1691 
1692  switch (F->getIntrinsicID()) {
1693  default: break;
1694  case Intrinsic::powi:
1695  case Intrinsic::sin:
1696  case Intrinsic::cos:
1697  case Intrinsic::pow:
1698  case Intrinsic::log:
1699  case Intrinsic::log10:
1700  case Intrinsic::log2:
1701  case Intrinsic::exp:
1702  case Intrinsic::exp2:
1703  return true;
1704  case Intrinsic::sqrt:
1705  case Intrinsic::fabs:
1706  case Intrinsic::copysign:
1707  case Intrinsic::floor:
1708  case Intrinsic::ceil:
1709  case Intrinsic::trunc:
1710  case Intrinsic::rint:
1711  case Intrinsic::nearbyint:
1712  case Intrinsic::round:
1713  case Intrinsic::canonicalize:
1714  case Intrinsic::lround:
1715  case Intrinsic::llround:
1716  case Intrinsic::lrint:
1717  case Intrinsic::llrint:
1718  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1719  return true;
1720  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1721  return true;
1722  // Some operations can be handled by vector instructions and assume
1723  // unsupported vectors will be expanded into supported scalar ones.
1724  // TODO Handle scalar operations properly.
1725  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1726  case Intrinsic::masked_store:
1727  case Intrinsic::masked_load:
1728  case Intrinsic::masked_gather:
1729  case Intrinsic::masked_scatter:
1730  return !ST->hasMVEIntegerOps();
1731  case Intrinsic::sadd_with_overflow:
1732  case Intrinsic::uadd_with_overflow:
1733  case Intrinsic::ssub_with_overflow:
1734  case Intrinsic::usub_with_overflow:
1735  case Intrinsic::sadd_sat:
1736  case Intrinsic::uadd_sat:
1737  case Intrinsic::ssub_sat:
1738  case Intrinsic::usub_sat:
1739  return false;
1740  }
1741 
1742  return BaseT::isLoweredToCall(F);
1743 }
1744 
1746  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1747  EVT VT = TLI->getValueType(DL, I.getType(), true);
1748  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1749  return true;
1750 
1751  // Check if an intrinsic will be lowered to a call and assume that any
1752  // other CallInst will generate a bl.
1753  if (auto *Call = dyn_cast<CallInst>(&I)) {
1754  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1755  switch(II->getIntrinsicID()) {
1756  case Intrinsic::memcpy:
1757  case Intrinsic::memset:
1758  case Intrinsic::memmove:
1759  return getNumMemOps(II) == -1;
1760  default:
1761  if (const Function *F = Call->getCalledFunction())
1762  return isLoweredToCall(F);
1763  }
1764  }
1765  return true;
1766  }
1767 
1768  // FPv5 provides conversions between integer, double-precision,
1769  // single-precision, and half-precision formats.
1770  switch (I.getOpcode()) {
1771  default:
1772  break;
1773  case Instruction::FPToSI:
1774  case Instruction::FPToUI:
1775  case Instruction::SIToFP:
1776  case Instruction::UIToFP:
1777  case Instruction::FPTrunc:
1778  case Instruction::FPExt:
1779  return !ST->hasFPARMv8Base();
1780  }
1781 
1782  // FIXME: Unfortunately the approach of checking the Operation Action does
1783  // not catch all cases of Legalization that use library calls. Our
1784  // Legalization step categorizes some transformations into library calls as
1785  // Custom, Expand or even Legal when doing type legalization. So for now
1786  // we have to special case for instance the SDIV of 64bit integers and the
1787  // use of floating point emulation.
1788  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1789  switch (ISD) {
1790  default:
1791  break;
1792  case ISD::SDIV:
1793  case ISD::UDIV:
1794  case ISD::SREM:
1795  case ISD::UREM:
1796  case ISD::SDIVREM:
1797  case ISD::UDIVREM:
1798  return true;
1799  }
1800  }
1801 
1802  // Assume all other non-float operations are supported.
1803  if (!VT.isFloatingPoint())
1804  return false;
1805 
1806  // We'll need a library call to handle most floats when using soft.
1807  if (TLI->useSoftFloat()) {
1808  switch (I.getOpcode()) {
1809  default:
1810  return true;
1811  case Instruction::Alloca:
1812  case Instruction::Load:
1813  case Instruction::Store:
1814  case Instruction::Select:
1815  case Instruction::PHI:
1816  return false;
1817  }
1818  }
1819 
1820  // We'll need a libcall to perform double precision operations on a single
1821  // precision only FPU.
1822  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1823  return true;
1824 
1825  // Likewise for half precision arithmetic.
1826  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1827  return true;
1828 
1829  return false;
1830 }
1831 
1833  AssumptionCache &AC,
1834  TargetLibraryInfo *LibInfo,
1835  HardwareLoopInfo &HWLoopInfo) {
1836  // Low-overhead branches are only supported in the 'low-overhead branch'
1837  // extension of v8.1-m.
1838  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1839  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1840  return false;
1841  }
1842 
1844  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1845  return false;
1846  }
1847 
1848  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1849  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1850  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1851  return false;
1852  }
1853 
1854  const SCEV *TripCountSCEV =
1855  SE.getAddExpr(BackedgeTakenCount,
1856  SE.getOne(BackedgeTakenCount->getType()));
1857 
1858  // We need to store the trip count in LR, a 32-bit register.
1859  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1860  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1861  return false;
1862  }
1863 
1864  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1865  // point in generating a hardware loop if that's going to happen.
1866 
1867  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1868  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1869  switch (Call->getIntrinsicID()) {
1870  default:
1871  break;
1872  case Intrinsic::start_loop_iterations:
1873  case Intrinsic::test_set_loop_iterations:
1874  case Intrinsic::loop_decrement:
1875  case Intrinsic::loop_decrement_reg:
1876  return true;
1877  }
1878  }
1879  return false;
1880  };
1881 
1882  // Scan the instructions to see if there's any that we know will turn into a
1883  // call or if this loop is already a low-overhead loop or will become a tail
1884  // predicated loop.
1885  bool IsTailPredLoop = false;
1886  auto ScanLoop = [&](Loop *L) {
1887  for (auto *BB : L->getBlocks()) {
1888  for (auto &I : *BB) {
1889  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1890  isa<InlineAsm>(I)) {
1891  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1892  return false;
1893  }
1894  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1895  IsTailPredLoop |=
1896  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1897  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1898  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1899  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1900  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1901  }
1902  }
1903  return true;
1904  };
1905 
1906  // Visit inner loops.
1907  for (auto Inner : *L)
1908  if (!ScanLoop(Inner))
1909  return false;
1910 
1911  if (!ScanLoop(L))
1912  return false;
1913 
1914  // TODO: Check whether the trip count calculation is expensive. If L is the
1915  // inner loop but we know it has a low trip count, calculating that trip
1916  // count (in the parent loop) may be detrimental.
1917 
1918  LLVMContext &C = L->getHeader()->getContext();
1919  HWLoopInfo.CounterInReg = true;
1920  HWLoopInfo.IsNestingLegal = false;
1921  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1922  HWLoopInfo.CountType = Type::getInt32Ty(C);
1923  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1924  return true;
1925 }
1926 
1927 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1928  // We don't allow icmp's, and because we only look at single block loops,
1929  // we simply count the icmps, i.e. there should only be 1 for the backedge.
1930  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1931  return false;
1932 
1933  if (isa<FCmpInst>(&I))
1934  return false;
1935 
1936  // We could allow extending/narrowing FP loads/stores, but codegen is
1937  // too inefficient so reject this for now.
1938  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1939  return false;
1940 
1941  // Extends have to be extending-loads
1942  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1943  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1944  return false;
1945 
1946  // Truncs have to be narrowing-stores
1947  if (isa<TruncInst>(&I) )
1948  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1949  return false;
1950 
1951  return true;
1952 }
1953 
1954 // To set up a tail-predicated loop, we need to know the total number of
1955 // elements processed by that loop. Thus, we need to determine the element
1956 // size and:
1957 // 1) it should be uniform for all operations in the vector loop, so we
1958 // e.g. don't want any widening/narrowing operations.
1959 // 2) it should be smaller than i64s because we don't have vector operations
1960 // that work on i64s.
1961 // 3) we don't want elements to be reversed or shuffled, to make sure the
1962 // tail-predication masks/predicates the right lanes.
1963 //
1965  const DataLayout &DL,
1966  const LoopAccessInfo *LAI) {
1967  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
1968 
1969  // If there are live-out values, it is probably a reduction. We can predicate
1970  // most reduction operations freely under MVE using a combination of
1971  // prefer-predicated-reduction-select and inloop reductions. We limit this to
1972  // floating point and integer reductions, but don't check for operators
1973  // specifically here. If the value ends up not being a reduction (and so the
1974  // vectorizer cannot tailfold the loop), we should fall back to standard
1975  // vectorization automatically.
1977  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
1978  bool ReductionsDisabled =
1981 
1982  for (auto *I : LiveOuts) {
1983  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
1984  !I->getType()->isHalfTy()) {
1985  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
1986  "live-out value\n");
1987  return false;
1988  }
1989  if (ReductionsDisabled) {
1990  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
1991  return false;
1992  }
1993  }
1994 
1995  // Next, check that all instructions can be tail-predicated.
1996  PredicatedScalarEvolution PSE = LAI->getPSE();
1997  SmallVector<Instruction *, 16> LoadStores;
1998  int ICmpCount = 0;
1999 
2000  for (BasicBlock *BB : L->blocks()) {
2001  for (Instruction &I : BB->instructionsWithoutDebug()) {
2002  if (isa<PHINode>(&I))
2003  continue;
2004  if (!canTailPredicateInstruction(I, ICmpCount)) {
2005  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2006  return false;
2007  }
2008 
2009  Type *T = I.getType();
2010  if (T->isPointerTy())
2011  T = T->getPointerElementType();
2012 
2013  if (T->getScalarSizeInBits() > 32) {
2014  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2015  return false;
2016  }
2017  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2018  Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2019  int64_t NextStride = getPtrStride(PSE, Ptr, L);
2020  if (NextStride == 1) {
2021  // TODO: for now only allow consecutive strides of 1. We could support
2022  // other strides as long as it is uniform, but let's keep it simple
2023  // for now.
2024  continue;
2025  } else if (NextStride == -1 ||
2026  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2027  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2028  LLVM_DEBUG(dbgs()
2029  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2030  "be tail-predicated\n.");
2031  return false;
2032  // TODO: don't tail predicate if there is a reversed load?
2033  } else if (EnableMaskedGatherScatters) {
2034  // Gather/scatters do allow loading from arbitrary strides, at
2035  // least if they are loop invariant.
2036  // TODO: Loop variant strides should in theory work, too, but
2037  // this requires further testing.
2038  const SCEV *PtrScev =
2040  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2041  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2042  if (PSE.getSE()->isLoopInvariant(Step, L))
2043  continue;
2044  }
2045  }
2046  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2047  "tail-predicate\n.");
2048  return false;
2049  }
2050  }
2051  }
2052 
2053  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2054  return true;
2055 }
2056 
2058  ScalarEvolution &SE,
2059  AssumptionCache &AC,
2060  TargetLibraryInfo *TLI,
2061  DominatorTree *DT,
2062  const LoopAccessInfo *LAI) {
2063  if (!EnableTailPredication) {
2064  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2065  return false;
2066  }
2067 
2068  // Creating a predicated vector loop is the first step for generating a
2069  // tail-predicated hardware loop, for which we need the MVE masked
2070  // load/stores instructions:
2071  if (!ST->hasMVEIntegerOps())
2072  return false;
2073 
2074  // For now, restrict this to single block loops.
2075  if (L->getNumBlocks() > 1) {
2076  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2077  "loop.\n");
2078  return false;
2079  }
2080 
2081  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2082 
2083  HardwareLoopInfo HWLoopInfo(L);
2084  if (!HWLoopInfo.canAnalyze(*LI)) {
2085  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2086  "analyzable.\n");
2087  return false;
2088  }
2089 
2090  // This checks if we have the low-overhead branch architecture
2091  // extension, and if we will create a hardware-loop:
2092  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2093  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2094  "profitable.\n");
2095  return false;
2096  }
2097 
2098  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2099  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2100  "a candidate.\n");
2101  return false;
2102  }
2103 
2104  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2105 }
2106 
2108  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2109  return false;
2110 
2111  // Intrinsic @llvm.get.active.lane.mask is supported.
2112  // It is used in the MVETailPredication pass, which requires the number of
2113  // elements processed by this vector loop to setup the tail-predicated
2114  // loop.
2115  return true;
2116 }
2119  // Only currently enable these preferences for M-Class cores.
2120  if (!ST->isMClass())
2122 
2123  // Disable loop unrolling for Oz and Os.
2124  UP.OptSizeThreshold = 0;
2125  UP.PartialOptSizeThreshold = 0;
2126  if (L->getHeader()->getParent()->hasOptSize())
2127  return;
2128 
2129  // Only enable on Thumb-2 targets.
2130  if (!ST->isThumb2())
2131  return;
2132 
2133  SmallVector<BasicBlock*, 4> ExitingBlocks;
2134  L->getExitingBlocks(ExitingBlocks);
2135  LLVM_DEBUG(dbgs() << "Loop has:\n"
2136  << "Blocks: " << L->getNumBlocks() << "\n"
2137  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2138 
2139  // Only allow another exit other than the latch. This acts as an early exit
2140  // as it mirrors the profitability calculation of the runtime unroller.
2141  if (ExitingBlocks.size() > 2)
2142  return;
2143 
2144  // Limit the CFG of the loop body for targets with a branch predictor.
2145  // Allowing 4 blocks permits if-then-else diamonds in the body.
2146  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2147  return;
2148 
2149  // Don't unroll vectorized loops, including the remainder loop
2150  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2151  return;
2152 
2153  // Scan the loop: don't unroll loops with calls as this could prevent
2154  // inlining.
2155  unsigned Cost = 0;
2156  for (auto *BB : L->getBlocks()) {
2157  for (auto &I : *BB) {
2158  // Don't unroll vectorised loop. MVE does not benefit from it as much as
2159  // scalar code.
2160  if (I.getType()->isVectorTy())
2161  return;
2162 
2163  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2164  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2165  if (!isLoweredToCall(F))
2166  continue;
2167  }
2168  return;
2169  }
2170 
2171  SmallVector<const Value*, 4> Operands(I.operand_values());
2172  Cost +=
2174  }
2175  }
2176 
2177  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2178 
2179  UP.Partial = true;
2180  UP.Runtime = true;
2181  UP.UpperBound = true;
2182  UP.UnrollRemainder = true;
2184  UP.UnrollAndJam = true;
2186 
2187  // Force unrolling small loops can be very useful because of the branch
2188  // taken cost of the backedge.
2189  if (Cost < 12)
2190  UP.Force = true;
2191 }
2192 
2195  BaseT::getPeelingPreferences(L, SE, PP);
2196 }
2197 
2198 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2199  TTI::ReductionFlags Flags) const {
2200  if (!ST->hasMVEIntegerOps())
2201  return false;
2202 
2203  unsigned ScalarBits = Ty->getScalarSizeInBits();
2204  switch (Opcode) {
2205  case Instruction::Add:
2206  return ScalarBits <= 64;
2207  default:
2208  return false;
2209  }
2210 }
2211 
2213  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2214  if (!ST->hasMVEIntegerOps())
2215  return false;
2216  return true;
2217 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::EngineKind::Kind
Kind
Definition: ExecutionEngine.h:524
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1600
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:449
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12081
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:660
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:842
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:477
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:659
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:557
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:125
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:265
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
This class represents lattice values for constants.
Definition: AllocatorList.h:23
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1317
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:616
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:100
llvm::ARMTTIImpl::getAddressComputationCost
int getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:975
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:111
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:589
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:437
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:369
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
llvm::ARMTTIImpl::getGatherScatterOpCost
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1477
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:422
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:902
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:529
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:236
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:679
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:661
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:632
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:686
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::ARMTTIImpl::getIntImmCodeSizeCost
int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:286
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2119
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:56
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:643
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1034
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:147
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:496
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:489
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:741
llvm::ARMTTIImpl::getMaskedMemoryOpCost
unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1418
llvm::ARMTTIImpl::getIntImmCost
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::CostTblEntry
Cost Table Entry.
Definition: CostTable.h:24
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:602
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:108
APInt.h
llvm::ARMTTIImpl::getMemcpyCost
int getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1123
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:101
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:473
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:132
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1581
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:526
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:660
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:615
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:415
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:400
llvm::CallBase::getNumArgOperands
unsigned getNumArgOperands() const
Definition: InstrTypes.h:1339
llvm::ARMTTIImpl::getVectorInstrCost
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:796
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:492
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:5913
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:655
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:154
llvm::ARMTTIImpl::getArithmeticInstrCost
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1228
getType
static wasm::ValType getType(const TargetRegisterClass *RC)
Definition: WebAssemblyMCInstLower.cpp:190
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:527
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:1893
llvm::ARMTTIImpl::getCFInstrCost
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:382
llvm::ARMTTIImpl::getArithmeticReductionCost
int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1578
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1037
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:197
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2183
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1194
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:840
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1013
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1330
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1832
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:102
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
MachineValueType.h
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:205
Instruction.h
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:876
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:570
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:651
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:437
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:40
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:369
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:126
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:364
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:596
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:635
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:685
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:839
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:740
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1069
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1561
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:224
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:485
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:705
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:301
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
Definition: LoopAccessAnalysis.cpp:1017
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:691
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:487
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:65
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:249
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:119
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2170
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1558
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:64
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:491
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:1843
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:101
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:139
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:106
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:1964
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1684
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:695
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::TypeConversionCostTblEntry
Type Conversion Cost Table.
Definition: CostTable.h:44
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
LoopUtils.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:718
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:104
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:599
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:588
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2107
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:643
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:678
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:108
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:58
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::None
const NoneType None
Definition: None.h:23
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:916
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:96
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:75
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:657
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:115
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:1853
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
Determine if all bits are set.
Definition: APInt.h:401
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:277
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1297
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1057
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:996
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: ARMTargetTransformInfo.cpp:2117
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2057
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:116
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1086
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:593
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:841
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:74
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:110
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1602
llvm::ARMTTIImpl::getShuffleCost
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1133
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:491
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:77
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:336
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:802
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:861
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:367
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:88
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:109
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ARMTTIImpl::getIntrinsicInstrCost
int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1620
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:370
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:742
llvm::CostTableLookup
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:31
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:145
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4691
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:892
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:805
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:54
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::DenseMap< const Value *, Value * >
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:420
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:897
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:519
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:142
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:41
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:39
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:692
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:663
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1571
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:105
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:722
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2198
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:781
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:98
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:889
isSSATMinMaxPattern
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:296
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:643
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
int getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:889
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2173
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1079
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:149
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:644
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:339
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:94
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1033
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:956
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:237
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:140
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:871
llvm::ARMTTIImpl::getCmpSelInstrCost
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:832
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:456
trunc
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g trunc
Definition: README-FPStack.txt:63
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Unsigned maximum.
Definition: ValueTracking.h:661
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1744
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:630
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:96
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:470
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point minnum.
Definition: ValueTracking.h:662
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:1927
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:111
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:590
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12349
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:416
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:99
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:141
llvm::ARMTTIImpl::getIntImmCostInst
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:327
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:408
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:163
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:335
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2188
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1286
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:126
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:898
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:288
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:487
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::replaceSymbolicStrideSCEV
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
Definition: LoopAccessAnalysis.cpp:143
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:87
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:413
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1196
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:146
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1044
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:43
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:202
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:235
powi
This is blocked on not handling X *X *X powi(X, 3)(see note above). The issue is that we end up getting t
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:841
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:184
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:98
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:696
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Definition: TargetLowering.h:1132
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:92
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:799
llvm::ARMTTIImpl::getCastInstrCost
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:394
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1824
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:789
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1745
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:145
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:613
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:758
llvm::TargetTransformInfoImplBase::getMaskedMemoryOpCost
unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfoImpl.h:528
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:238
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:172
llvm::SPF_UMIN
@ SPF_UMIN
Signed minimum.
Definition: ValueTracking.h:658
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopUtils.cpp:296
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Definition: ARMISelLowering.cpp:19131
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:350
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1605
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:999
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2193
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:995
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2292
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1436
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:667
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:442
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:580
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2212
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:248
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:803
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:379
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:411
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1386
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:591
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:682
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:76
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:6916
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:144
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2151
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2536
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:112
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1272
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2007
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:99
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:159
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:770
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:134
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:633
llvm::ARMTTIImpl::getMemoryOpCost
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1373
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:631
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2178
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:640
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:196
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:800
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:122
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:281
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:63
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:19136
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:949
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:497
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46