1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
45 static cl::opt<bool> EnableMaskedLoadStores(
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
49 static cl::opt<bool> DisableLowOverheadLoops(
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
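/// For example (illustrative IR, not taken from the tests): a call such as
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 16)
/// can be rewritten as a plain aligned load through a bitcast pointer:
///   %v = load <4 x i32>, <4 x i32>* %q, align 16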
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67  InstCombiner::BuilderTy &Builder) {
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
128  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
150  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.arg_size() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
158  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnes()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
203  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
235  IC.Builder.SetInsertPoint(User);
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
251 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254  std::function<void(Instruction *, unsigned, APInt, APInt &)>
255  SimplifyAndSetOp) const {
256 
257  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
258  // opcode specifying a Top/Bottom instruction, which can change between
259  // instructions.
260  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
261  unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262  unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
264  // Only the odd or even lanes of operand 0 will be demanded, depending on
265  // whether this is a top or bottom instruction.
266  APInt DemandedElts =
267  APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268  : APInt::getHighBitsSet(2, 1));
269  SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270  // The other lanes will be defined from the inserted elements.
271  UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272  : APInt::getHighBitsSet(2, 1));
273  return None;
274  };
275 
276  switch (II.getIntrinsicID()) {
277  default:
278  break;
279  case Intrinsic::arm_mve_vcvt_narrow:
280  SimplifyNarrowInstrTopBottom(2);
281  break;
282  case Intrinsic::arm_mve_vqmovn:
283  SimplifyNarrowInstrTopBottom(4);
284  break;
285  case Intrinsic::arm_mve_vshrn:
286  SimplifyNarrowInstrTopBottom(7);
287  break;
288  }
289 
290  return None;
291 }
292 
293 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294  TTI::TargetCostKind CostKind) {
295  assert(Ty->isIntegerTy());
296 
297  unsigned Bits = Ty->getPrimitiveSizeInBits();
298  if (Bits == 0 || Imm.getActiveBits() >= 64)
299  return 4;
300 
301  int64_t SImmVal = Imm.getSExtValue();
302  uint64_t ZImmVal = Imm.getZExtValue();
303  if (!ST->isThumb()) {
304  if ((SImmVal >= 0 && SImmVal < 65536) ||
305  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307  return 1;
308  return ST->hasV6T2Ops() ? 2 : 3;
309  }
310  if (ST->isThumb2()) {
311  if ((SImmVal >= 0 && SImmVal < 65536) ||
312  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314  return 1;
315  return ST->hasV6T2Ops() ? 2 : 3;
316  }
317  // Thumb1, any i8 imm cost 1.
318  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319  return 1;
320  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321  return 2;
322  // Load from constantpool.
323  return 3;
324 }
325 
326 // Constants smaller than 256 fit in the immediate field of
327 // Thumb1 instructions so we return a zero cost and 1 otherwise.
328 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329  const APInt &Imm, Type *Ty) {
330  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331  return 0;
332 
333  return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match to an SSAT instruction
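// For example (illustrative): smin(smax(x, -128), 127) clamps x to the signed
// 8-bit range and can be selected as "ssat r0, #8, r1".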
338 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
339  Value *LHS, *RHS;
340  ConstantInt *C;
341  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
342 
343  if (InstSPF == SPF_SMAX &&
344  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
345  C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
346 
347  auto isSSatMin = [&](Value *MinInst) {
348  if (isa<SelectInst>(MinInst)) {
349  Value *MinLHS, *MinRHS;
350  ConstantInt *MinC;
351  SelectPatternFlavor MinSPF =
352  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
353  if (MinSPF == SPF_SMIN &&
354  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
355  MinC->getValue() == ((-Imm) - 1))
356  return true;
357  }
358  return false;
359  };
360 
361  if (isSSatMin(Inst->getOperand(1)) ||
362  (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
363  isSSatMin(*(++Inst->user_begin())))))
364  return true;
365  }
366  return false;
367 }
368 
369 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
370  const APInt &Imm, Type *Ty,
371  TTI::TargetCostKind CostKind,
372  Instruction *Inst) {
373  // Division by a constant can be turned into multiplication, but only if we
374  // know it's constant. So it's not so much that the immediate is cheap (it's
375  // not), but that the alternative is worse.
376  // FIXME: this is probably unneeded with GlobalISel.
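  // For example (illustrative): "udiv i32 %x, 10" can be lowered to a umull by
  // a magic constant plus shifts, so hoisting the 10 as an opaque value would
  // only make the code worse.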
377  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
378  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
379  Idx == 1)
380  return 0;
381 
382  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
383  // splitting any large offsets.
384  if (Opcode == Instruction::GetElementPtr && Idx != 0)
385  return 0;
386 
387  if (Opcode == Instruction::And) {
388  // UXTB/UXTH
389  if (Imm == 255 || Imm == 65535)
390  return 0;
391  // Conversion to BIC is free, and means we can use ~Imm instead.
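  // For example (illustrative): "and r0, r1, #0xffffff00" is not encodable as
  // an AND immediate but is as "bic r0, r1, #0xff".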
392  return std::min(getIntImmCost(Imm, Ty, CostKind),
393  getIntImmCost(~Imm, Ty, CostKind));
394  }
395 
396  if (Opcode == Instruction::Add)
397  // Conversion to SUB is free, and means we can use -Imm instead.
398  return std::min(getIntImmCost(Imm, Ty, CostKind),
399  getIntImmCost(-Imm, Ty, CostKind));
400 
401  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
402  Ty->getIntegerBitWidth() == 32) {
403  int64_t NegImm = -Imm.getSExtValue();
404  if (ST->isThumb2() && NegImm < 1<<12)
405  // icmp X, #-C -> cmn X, #C
406  return 0;
407  if (ST->isThumb() && NegImm < 1<<8)
408  // icmp X, #-C -> adds X, #C
409  return 0;
410  }
411 
412  // xor a, -1 can always be folded to MVN
413  if (Opcode == Instruction::Xor && Imm.isAllOnes())
414  return 0;
415 
416  // Ensures negative constant of min(max()) or max(min()) patterns that
417  // match to SSAT instructions don't get hoisted
418  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
419  Ty->getIntegerBitWidth() <= 32) {
420  if (isSSATMinMaxPattern(Inst, Imm) ||
421  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
422  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
423  return 0;
424  }
425 
426  // We can convert <= -1 to < 0, which is generally quite cheap.
427  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
428  ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
429  if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
430  return std::min(getIntImmCost(Imm, Ty, CostKind),
431  getIntImmCost(Imm + 1, Ty, CostKind));
432  }
433 
434  return getIntImmCost(Imm, Ty, CostKind);
435 }
436 
437 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
438  TTI::TargetCostKind CostKind,
439  const Instruction *I) {
440  if (CostKind == TTI::TCK_RecipThroughput &&
441  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
442  // FIXME: The vectorizer is highly sensitive to the cost of these
443  // instructions, which suggests that it may be using the costs incorrectly.
444  // But, for now, just make them free to avoid performance regressions for
445  // vector targets.
446  return 0;
447  }
448  return BaseT::getCFInstrCost(Opcode, CostKind, I);
449 }
450 
451 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
452  Type *Src,
453  TTI::CastContextHint CCH,
454  TTI::TargetCostKind CostKind,
455  const Instruction *I) {
456  int ISD = TLI->InstructionOpcodeToISD(Opcode);
457  assert(ISD && "Invalid opcode");
458 
459  // TODO: Allow non-throughput costs that aren't binary.
460  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
461  if (CostKind != TTI::TCK_RecipThroughput)
462  return Cost == 0 ? 0 : 1;
463  return Cost;
464  };
465  auto IsLegalFPType = [this](EVT VT) {
466  EVT EltVT = VT.getScalarType();
467  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
468  (EltVT == MVT::f64 && ST->hasFP64()) ||
469  (EltVT == MVT::f16 && ST->hasFullFP16());
470  };
471 
472  EVT SrcTy = TLI->getValueType(DL, Src);
473  EVT DstTy = TLI->getValueType(DL, Dst);
474 
475  if (!SrcTy.isSimple() || !DstTy.isSimple())
476  return AdjustCost(
477  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
478 
479  // Extending masked load/Truncating masked stores is expensive because we
480  // currently don't split them. This means that we'll likely end up
481  // loading/storing each element individually (hence the high cost).
482  if ((ST->hasMVEIntegerOps() &&
483  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
484  Opcode == Instruction::SExt)) ||
485  (ST->hasMVEFloatOps() &&
486  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
487  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
488  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
489  return 2 * DstTy.getVectorNumElements() *
490  ST->getMVEVectorCostFactor(CostKind);
491 
492  // The extend of other kinds of load is free
493  if (CCH == TTI::CastContextHint::Normal ||
494  CCH == TTI::CastContextHint::Masked) {
495  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
508  };
509  if (const auto *Entry = ConvertCostTableLookup(
510  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
511  return AdjustCost(Entry->Cost);
512 
513  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
520  // The following extend from a legal type to an illegal type, so need to
521  // split the load. This introduced an extra load operation, but the
522  // extend is still "free".
529  };
530  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
531  if (const auto *Entry =
532  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
533  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
534  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
535  }
536 
537  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
538  // FPExtends are similar but also require the VCVT instructions.
541  };
542  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
543  if (const auto *Entry =
544  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
545  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
546  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
547  }
548 
549  // The truncate of a store is free. This is the mirror of extends above.
550  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
558  };
559  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
560  if (const auto *Entry =
561  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
562  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
563  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
564  }
565 
566  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
569  };
570  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
571  if (const auto *Entry =
572  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
573  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
574  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
575  }
576  }
577 
578  // NEON vector operations that can extend their inputs.
579  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
580  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
581  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
582  // vaddl
583  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
584  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
585  // vsubl
586  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
587  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
588  // vmull
589  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
590  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
591  // vshll
592  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
593  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
594  };
595 
596  auto *User = cast<Instruction>(*I->user_begin());
597  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
598  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
599  DstTy.getSimpleVT(),
600  SrcTy.getSimpleVT())) {
601  return AdjustCost(Entry->Cost);
602  }
603  }
604 
605  // Single to/from double precision conversions.
606  if (Src->isVectorTy() && ST->hasNEON() &&
607  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
608  DstTy.getScalarType() == MVT::f32) ||
609  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
610  DstTy.getScalarType() == MVT::f64))) {
611  static const CostTblEntry NEONFltDblTbl[] = {
612  // Vector fptrunc/fpext conversions.
615  {ISD::FP_EXTEND, MVT::v4f32, 4}};
616 
617  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
618  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
619  return AdjustCost(LT.first * Entry->Cost);
620  }
621 
622  // Some arithmetic, load and store operations have specific instructions
623  // to cast up/down their types automatically at no extra cost.
624  // TODO: Get these tables to know at least what the related operations are.
625  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
632 
633  // The number of vmovl instructions for the extension.
652 
653  // Operations that we legalize using splitting.
656 
657  // Vector float <-> i32 conversions.
660 
681 
688 
689  // Vector double <-> i32 conversions.
692 
699 
706  };
707 
708  if (SrcTy.isVector() && ST->hasNEON()) {
709  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
710  DstTy.getSimpleVT(),
711  SrcTy.getSimpleVT()))
712  return AdjustCost(Entry->Cost);
713  }
714 
715  // Scalar float to integer conversions.
716  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
737  };
738  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
739  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
740  DstTy.getSimpleVT(),
741  SrcTy.getSimpleVT()))
742  return AdjustCost(Entry->Cost);
743  }
744 
745  // Scalar integer to float conversions.
746  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
767  };
768 
769  if (SrcTy.isInteger() && ST->hasNEON()) {
770  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
771  ISD, DstTy.getSimpleVT(),
772  SrcTy.getSimpleVT()))
773  return AdjustCost(Entry->Cost);
774  }
775 
776  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
777  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
778  // are linearised so take more.
779  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
792  };
793 
794  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
795  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
796  ISD, DstTy.getSimpleVT(),
797  SrcTy.getSimpleVT()))
798  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
799  }
800 
801  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
802  // As general rule, fp converts that were not matched above are scalarized
803  // and cost 1 vcvt for each lane, so long as the instruction is available.
804  // If not it will become a series of function calls.
805  const InstructionCost CallCost =
806  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
807  int Lanes = 1;
808  if (SrcTy.isFixedLengthVector())
809  Lanes = SrcTy.getVectorNumElements();
810 
811  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
812  return Lanes;
813  else
814  return Lanes * CallCost;
815  }
816 
817  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
818  SrcTy.isFixedLengthVector()) {
819  // Treat a truncate with larger than legal source (128bits for MVE) as
820  // expensive, 2 instructions per lane.
821  if ((SrcTy.getScalarType() == MVT::i8 ||
822  SrcTy.getScalarType() == MVT::i16 ||
823  SrcTy.getScalarType() == MVT::i32) &&
824  SrcTy.getSizeInBits() > 128 &&
825  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
826  return SrcTy.getVectorNumElements() * 2;
827  }
828 
829  // Scalar integer conversion costs.
830  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
831  // i16 -> i64 requires two dependent operations.
833 
834  // Truncates on i64 are assumed to be free.
837  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
839  };
840 
841  if (SrcTy.isInteger()) {
842  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
843  DstTy.getSimpleVT(),
844  SrcTy.getSimpleVT()))
845  return AdjustCost(Entry->Cost);
846  }
847 
848  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
849  ? ST->getMVEVectorCostFactor(CostKind)
850  : 1;
851  return AdjustCost(
852  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
853 }
854 
855 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
856  unsigned Index) {
857  // Penalize inserting into an D-subregister. We end up with a three times
858  // lower estimated throughput on swift.
859  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
860  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
861  return 3;
862 
863  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
864  Opcode == Instruction::ExtractElement)) {
865  // Cross-class copies are expensive on many microarchitectures,
866  // so assume they are expensive by default.
867  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
868  return 3;
869 
870  // Even if it's not a cross class copy, this likely leads to mixing
871  // of NEON and VFP code and should be therefore penalized.
872  if (ValTy->isVectorTy() &&
873  ValTy->getScalarSizeInBits() <= 32)
874  return std::max<InstructionCost>(
875  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
876  }
877 
878  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
879  Opcode == Instruction::ExtractElement)) {
880  // Integer cross-lane moves are more expensive than float, which can
881  // sometimes just be vmovs. Integers involve being passed to GPR registers,
882  // causing more of a delay.
883  std::pair<InstructionCost, MVT> LT =
884  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
885  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
886  }
887 
888  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
889 }
890 
891 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
892  Type *CondTy,
893  CmpInst::Predicate VecPred,
894  TTI::TargetCostKind CostKind,
895  const Instruction *I) {
896  int ISD = TLI->InstructionOpcodeToISD(Opcode);
897 
898  // Thumb scalar code size cost for select.
899  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
900  ST->isThumb() && !ValTy->isVectorTy()) {
901  // Assume expensive structs.
902  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
903  return TTI::TCC_Expensive;
904 
905  // Select costs can vary because they:
906  // - may require one or more conditional mov (including an IT),
907  // - can't operate directly on immediates,
908  // - require live flags, which we can't copy around easily.
909  InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
910 
911  // Possible IT instruction for Thumb2, or more for Thumb1.
912  ++Cost;
913 
914  // i1 values may need rematerialising by using mov immediates and/or
915  // flag setting instructions.
916  if (ValTy->isIntegerTy(1))
917  ++Cost;
918 
919  return Cost;
920  }
921 
922  // If this is a vector min/max/abs, use the cost of that intrinsic directly
923  // instead. Hopefully when min/max intrinsics are more prevalent this code
924  // will not be needed.
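  // For example (illustrative): "%c = icmp slt %x, %y; select %c, %x, %y" is
  // costed as a single smin intrinsic rather than a compare plus a select.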
925  const Instruction *Sel = I;
926  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
927  Sel->hasOneUse())
928  Sel = cast<Instruction>(Sel->user_back());
929  if (Sel && ValTy->isVectorTy() &&
930  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
931  const Value *LHS, *RHS;
932  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
933  unsigned IID = 0;
934  switch (SPF) {
935  case SPF_ABS:
936  IID = Intrinsic::abs;
937  break;
938  case SPF_SMIN:
939  IID = Intrinsic::smin;
940  break;
941  case SPF_SMAX:
942  IID = Intrinsic::smax;
943  break;
944  case SPF_UMIN:
945  IID = Intrinsic::umin;
946  break;
947  case SPF_UMAX:
948  IID = Intrinsic::umax;
949  break;
950  case SPF_FMINNUM:
951  IID = Intrinsic::minnum;
952  break;
953  case SPF_FMAXNUM:
954  IID = Intrinsic::maxnum;
955  break;
956  default:
957  break;
958  }
959  if (IID) {
960  // The ICmp is free, the select gets the cost of the min/max/etc
961  if (Sel != I)
962  return 0;
963  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
964  return getIntrinsicInstrCost(CostAttrs, CostKind);
965  }
966  }
967 
968  // On NEON a vector select gets lowered to vbsl.
969  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
970  // Lowering of some vector selects is currently far from perfect.
971  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
972  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
973  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
975  };
976 
977  EVT SelCondTy = TLI->getValueType(DL, CondTy);
978  EVT SelValTy = TLI->getValueType(DL, ValTy);
979  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
980  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
981  SelCondTy.getSimpleVT(),
982  SelValTy.getSimpleVT()))
983  return Entry->Cost;
984  }
985 
986  std::pair<InstructionCost, MVT> LT =
987  TLI->getTypeLegalizationCost(DL, ValTy);
988  return LT.first;
989  }
990 
991  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
992  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
993  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
994  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
995  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
996  if (!VecCondTy)
997  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
998 
999  // If we don't have mve.fp any fp operations will need to be scalarized.
1000  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1001  // One scalarization insert, one scalarization extract and the cost of the
1002  // fcmps.
1003  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1004  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1005  VecValTy->getNumElements() *
1006  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1007  VecCondTy->getScalarType(), VecPred, CostKind,
1008  I);
1009  }
1010 
1011  std::pair<InstructionCost, MVT> LT =
1012  TLI->getTypeLegalizationCost(DL, ValTy);
1013  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1014  // There are two types - the input that specifies the type of the compare
1015  // and the output vXi1 type. Because we don't know how the output will be
1016  // split, we may need an expensive shuffle to get two in sync. This has the
1017  // effect of making larger than legal compares (v8i32 for example)
1018  // expensive.
1019  if (LT.second.getVectorNumElements() > 2) {
1020  if (LT.first > 1)
1021  return LT.first * BaseCost +
1022  BaseT::getScalarizationOverhead(VecCondTy, true, false);
1023  return BaseCost;
1024  }
1025  }
1026 
1027  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1028  // for "multiple beats" potentially needed by MVE instructions.
1029  int BaseCost = 1;
1030  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1031  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1032 
1033  return BaseCost *
1034  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1035 }
1036 
1037 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1038  ScalarEvolution *SE,
1039  const SCEV *Ptr) {
1040  // Address computations in vectorized code with non-consecutive addresses will
1041  // likely result in more instructions compared to scalar code where the
1042  // computation can more often be merged into the index mode. The resulting
1043  // extra micro-ops can significantly decrease throughput.
1044  unsigned NumVectorInstToHideOverhead = 10;
1045  int MaxMergeDistance = 64;
1046 
1047  if (ST->hasNEON()) {
1048  if (Ty->isVectorTy() && SE &&
1049  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1050  return NumVectorInstToHideOverhead;
1051 
1052  // In many cases the address computation is not merged into the instruction
1053  // addressing mode.
1054  return 1;
1055  }
1056  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1057 }
1058 
1059 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1060  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1061  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1062  // optimized, else LSR may block tail-predication.
1063  switch (II->getIntrinsicID()) {
1064  case Intrinsic::arm_mve_vctp8:
1065  case Intrinsic::arm_mve_vctp16:
1066  case Intrinsic::arm_mve_vctp32:
1067  case Intrinsic::arm_mve_vctp64:
1068  return true;
1069  default:
1070  break;
1071  }
1072  }
1073  return false;
1074 }
1075 
1076 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1077  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1078  return false;
1079 
1080  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1081  // Don't support v2i1 yet.
1082  if (VecTy->getNumElements() == 2)
1083  return false;
1084 
1085  // We don't support extending fp types.
1086  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1087  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1088  return false;
1089  }
1090 
1091  unsigned EltWidth = DataTy->getScalarSizeInBits();
1092  return (EltWidth == 32 && Alignment >= 4) ||
1093  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1094 }
1095 
1096 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1097  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1098  return false;
1099 
1100  // This method is called in 2 places:
1101  // - from the vectorizer with a scalar type, in which case we need to get
1102  // this as good as we can with the limited info we have (and rely on the cost
1103  // model for the rest).
1104  // - from the masked intrinsic lowering pass with the actual vector type.
1105  // For MVE, we have a custom lowering pass that will already have custom
1106  // legalised any gathers that we can to MVE intrinsics, and want to expand all
1107  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1108  // are here, we know we want to expand.
1109  if (isa<VectorType>(Ty))
1110  return false;
1111 
1112  unsigned EltWidth = Ty->getScalarSizeInBits();
1113  return ((EltWidth == 32 && Alignment >= 4) ||
1114  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1115 }
1116 
1117 /// Given a memcpy/memset/memmove instruction, return the number of memory
1118 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1119 /// call is used.
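/// For example (illustrative): a memcpy of 16 constant bytes with word
/// alignment may be lowered to four i32 loads and four i32 stores, which with
/// the Factor of 2 used below reports 4 * 2 = 8 memory operations.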
1120 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1121  MemOp MOp;
1122  unsigned DstAddrSpace = ~0u;
1123  unsigned SrcAddrSpace = ~0u;
1124  const Function *F = I->getParent()->getParent();
1125 
1126  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1127  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1128  // If 'size' is not a constant, a library call will be generated.
1129  if (!C)
1130  return -1;
1131 
1132  const unsigned Size = C->getValue().getZExtValue();
1133  const Align DstAlign = *MC->getDestAlign();
1134  const Align SrcAlign = *MC->getSourceAlign();
1135 
1136  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1137  /*IsVolatile*/ false);
1138  DstAddrSpace = MC->getDestAddressSpace();
1139  SrcAddrSpace = MC->getSourceAddressSpace();
1140  }
1141  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1142  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1143  // If 'size' is not a constant, a library call will be generated.
1144  if (!C)
1145  return -1;
1146 
1147  const unsigned Size = C->getValue().getZExtValue();
1148  const Align DstAlign = *MS->getDestAlign();
1149 
1150  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1151  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1152  DstAddrSpace = MS->getDestAddressSpace();
1153  }
1154  else
1155  llvm_unreachable("Expected a memcpy/move or memset!");
1156 
1157  unsigned Limit, Factor = 2;
1158  switch(I->getIntrinsicID()) {
1159  case Intrinsic::memcpy:
1160  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1161  break;
1162  case Intrinsic::memmove:
1163  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1164  break;
1165  case Intrinsic::memset:
1166  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1167  Factor = 1;
1168  break;
1169  default:
1170  llvm_unreachable("Expected a memcpy/move or memset!");
1171  }
1172 
1173  // MemOps will be populated with a list of data types that need to be
1174  // loaded and stored. That's why we multiply the number of elements by 2 to
1175  // get the cost for this memcpy.
1176  std::vector<EVT> MemOps;
1177  if (getTLI()->findOptimalMemOpLowering(
1178  MemOps, Limit, MOp, DstAddrSpace,
1179  SrcAddrSpace, F->getAttributes()))
1180  return MemOps.size() * Factor;
1181 
1182  // If we can't find an optimal memop lowering, return the default cost
1183  return -1;
1184 }
1185 
1186 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1187  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1188 
1189  // To model the cost of a library call, we assume 1 for the call, and
1190  // 3 for the argument setup.
1191  if (NumOps == -1)
1192  return 4;
1193  return NumOps;
1194 }
1195 
1196 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1197  VectorType *Tp, ArrayRef<int> Mask,
1198  int Index, VectorType *SubTp) {
1199  Kind = improveShuffleKindFromMask(Kind, Mask);
1200  if (ST->hasNEON()) {
1201  if (Kind == TTI::SK_Broadcast) {
1202  static const CostTblEntry NEONDupTbl[] = {
1203  // VDUP handles these cases.
1210 
1215 
1216  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1217  if (const auto *Entry =
1218  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1219  return LT.first * Entry->Cost;
1220  }
1221  if (Kind == TTI::SK_Reverse) {
1222  static const CostTblEntry NEONShuffleTbl[] = {
1223  // Reverse shuffle cost one instruction if we are shuffling within a
1224  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1231 
1236 
1237  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1238  if (const auto *Entry =
1239  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1240  return LT.first * Entry->Cost;
1241  }
1242  if (Kind == TTI::SK_Select) {
1243  static const CostTblEntry NEONSelShuffleTbl[] = {
1244  // Select shuffle cost table for ARM. Cost is the number of
1245  // instructions
1246  // required to create the shuffled vector.
1247 
1252 
1256 
1258 
1260 
1261  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1262  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1263  ISD::VECTOR_SHUFFLE, LT.second))
1264  return LT.first * Entry->Cost;
1265  }
1266  }
1267  if (ST->hasMVEIntegerOps()) {
1268  if (Kind == TTI::SK_Broadcast) {
1269  static const CostTblEntry MVEDupTbl[] = {
1270  // VDUP handles these cases.
1276 
1277  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1278  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1279  LT.second))
1280  return LT.first * Entry->Cost *
1281  ST->getMVEVectorCostFactor(CostKind);
1282  }
1283 
1284  if (!Mask.empty()) {
1285  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1286  if (Mask.size() <= LT.second.getVectorNumElements() &&
1287  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1288  isVREVMask(Mask, LT.second, 64)))
1289  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1290  }
1291  }
1292 
1293  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1294  ? ST->getMVEVectorCostFactor(CostKind)
1295  : 1;
1296  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1297 }
1298 
1299 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1300  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1301  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1302  TTI::OperandValueProperties Opd1PropInfo,
1303  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1304  const Instruction *CxtI) {
1305  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1306  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1307  // Make operations on i1 relatively expensive as this often involves
1308  // combining predicates. AND and XOR should be easier to handle with IT
1309  // blocks.
1310  switch (ISDOpcode) {
1311  default:
1312  break;
1313  case ISD::AND:
1314  case ISD::XOR:
1315  return 2;
1316  case ISD::OR:
1317  return 3;
1318  }
1319  }
1320 
1321  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1322 
1323  if (ST->hasNEON()) {
1324  const unsigned FunctionCallDivCost = 20;
1325  const unsigned ReciprocalDivCost = 10;
1326  static const CostTblEntry CostTbl[] = {
1327  // Division.
1328  // These costs are somewhat random. Choose a cost of 20 to indicate that
1329  // vectorizing division (added function call) is going to be very expensive.
1330  // Double registers types.
1331  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1332  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1333  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1334  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1335  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1336  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1337  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1338  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1339  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1340  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1341  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1342  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1343  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1344  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1345  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1346  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1347  // Quad register types.
1348  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1349  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1350  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1351  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1352  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1353  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1354  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1355  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1356  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1357  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1358  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1359  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1360  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1361  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1362  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1363  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1364  // Multiplication.
1365  };
1366 
1367  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1368  return LT.first * Entry->Cost;
1369 
1370  InstructionCost Cost = BaseT::getArithmeticInstrCost(
1371  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1372 
1373  // This is somewhat of a hack. The problem that we are facing is that SROA
1374  // creates a sequence of shift, and, or instructions to construct values.
1375  // These sequences are recognized by the ISel and have zero-cost. Not so for
1376  // the vectorized code. Because we have support for v2i64 but not i64 those
1377  // sequences look particularly beneficial to vectorize.
1378  // To work around this we increase the cost of v2i64 operations to make them
1379  // seem less beneficial.
1380  if (LT.second == MVT::v2i64 &&
1382  Cost += 4;
1383 
1384  return Cost;
1385  }
1386 
1387  // If this operation is a shift on arm/thumb2, it might well be folded into
1388  // the following instruction, hence having a cost of 0.
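  // For example (illustrative): "add r0, r1, r2, lsl #2" folds the shift into
  // the add, so a separate shl feeding only that add costs nothing.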
1389  auto LooksLikeAFreeShift = [&]() {
1390  if (ST->isThumb1Only() || Ty->isVectorTy())
1391  return false;
1392 
1393  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1394  return false;
1396  return false;
1397 
1398  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1399  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1400  case Instruction::Add:
1401  case Instruction::Sub:
1402  case Instruction::And:
1403  case Instruction::Xor:
1404  case Instruction::Or:
1405  case Instruction::ICmp:
1406  return true;
1407  default:
1408  return false;
1409  }
1410  };
1411  if (LooksLikeAFreeShift())
1412  return 0;
1413 
1414  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1415  // for "multiple beats" potentially needed by MVE instructions.
1416  int BaseCost = 1;
1417  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1418  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1419 
1420  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1421  // without treating floats as more expensive than scalars or increasing the
1422  // costs for custom operations. The result is also multiplied by the
1423  // MVEVectorCostFactor where appropriate.
1424  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1425  return LT.first * BaseCost;
1426 
1427  // Else this is expand, assume that we need to scalarize this op.
1428  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1429  unsigned Num = VTy->getNumElements();
1430  InstructionCost Cost =
1431  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1432  // Return the cost of multiple scalar invocation plus the cost of
1433  // inserting and extracting the values.
1434  SmallVector<Type *> Tys(Args.size(), Ty);
1435  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1436  }
1437 
1438  return BaseCost;
1439 }
1440 
1441 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1442  MaybeAlign Alignment,
1443  unsigned AddressSpace,
1444  TTI::TargetCostKind CostKind,
1445  const Instruction *I) {
1446  // TODO: Handle other cost kinds.
1447  if (CostKind != TTI::TCK_RecipThroughput)
1448  return 1;
1449 
1450  // Type legalization can't handle structs
1451  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1452  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1453  CostKind);
1454 
1455  if (ST->hasNEON() && Src->isVectorTy() &&
1456  (Alignment && *Alignment != Align(16)) &&
1457  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1458  // Unaligned loads/stores are extremely inefficient.
1459  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1460  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1461  return LT.first * 4;
1462  }
1463 
1464  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1465  // Same for stores.
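  // For example (illustrative): a load of <4 x half> whose only use is an
  // fpext to <4 x float> can be selected as a single widening vector load
  // rather than a load plus a separate convert.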
1466  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1467  ((Opcode == Instruction::Load && I->hasOneUse() &&
1468  isa<FPExtInst>(*I->user_begin())) ||
1469  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1470  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1471  Type *DstTy =
1472  Opcode == Instruction::Load
1473  ? (*I->user_begin())->getType()
1474  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1475  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1476  DstTy->getScalarType()->isFloatTy())
1477  return ST->getMVEVectorCostFactor(CostKind);
1478  }
1479 
1480  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1481  ? ST->getMVEVectorCostFactor(CostKind)
1482  : 1;
1483  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484  CostKind, I);
1485 }
1486 
1487 InstructionCost
1488 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1489  unsigned AddressSpace,
1490  TTI::TargetCostKind CostKind) {
1491  if (ST->hasMVEIntegerOps()) {
1492  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1493  return ST->getMVEVectorCostFactor(CostKind);
1494  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1495  return ST->getMVEVectorCostFactor(CostKind);
1496  }
1497  if (!isa<FixedVectorType>(Src))
1498  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1499  CostKind);
1500  // Scalar cost, which is currently very high due to the inefficiency of the
1501  // generated code.
1502  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1503 }
1504 
1505 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1506  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1507  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1508  bool UseMaskForCond, bool UseMaskForGaps) {
1509  assert(Factor >= 2 && "Invalid interleave factor");
1510  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1511 
1512  // vldN/vstN doesn't support vector types of i64/f64 element.
1513  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1514 
1515  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1516  !UseMaskForCond && !UseMaskForGaps) {
1517  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1518  auto *SubVecTy =
1519  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1520 
1521  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1522  // Accesses having vector types that are a multiple of 128 bits can be
1523  // matched to more than one vldN/vstN instruction.
1524  int BaseCost =
1525  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1526  if (NumElts % Factor == 0 &&
1527  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1528  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1529 
1530  // Some smaller than legal interleaved patterns are cheap as we can make
1531  // use of the vmovn or vrev patterns to interleave a standard load. This is
1532  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1533  // promoted differently). The cost of 2 here is then a load and vrev or
1534  // vmovn.
1535  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1536  VecTy->isIntOrIntVectorTy() &&
1537  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1538  return 2 * BaseCost;
1539  }
1540 
1541  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1542  Alignment, AddressSpace, CostKind,
1543  UseMaskForCond, UseMaskForGaps);
1544 }
1545 
1546 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1547  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1548  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1549  using namespace PatternMatch;
1550  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1551  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1552  Alignment, CostKind, I);
1553 
1554  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1555  auto *VTy = cast<FixedVectorType>(DataTy);
1556 
1557  // TODO: Splitting, once we do that.
1558 
1559  unsigned NumElems = VTy->getNumElements();
1560  unsigned EltSize = VTy->getScalarSizeInBits();
1561  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1562 
1563  // For now, it is assumed that for the MVE gather instructions the loads are
1564  // all effectively serialised. This means the cost is the scalar cost
1565  // multiplied by the number of elements being loaded. This is possibly very
1566  // conservative, but even so we still end up vectorising loops because the
1567  // cost per iteration for many loops is lower than for scalar loops.
1568  InstructionCost VectorCost =
1569  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1570  // The scalarization cost should be a lot higher. We use the number of vector
1571  // elements plus the scalarization overhead.
1572  InstructionCost ScalarCost =
1573  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1574  BaseT::getScalarizationOverhead(VTy, false, true);
1575 
1576  if (EltSize < 8 || Alignment < EltSize / 8)
1577  return ScalarCost;
1578 
1579  unsigned ExtSize = EltSize;
1580  // Check whether there's a single user that asks for an extended type
1581  if (I != nullptr) {
1582  // Depending on the caller of this function, a gather instruction will
1583  // either have opcode Instruction::Load or be a call to the masked_gather
1584  // intrinsic
1585  if ((I->getOpcode() == Instruction::Load ||
1586  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1587  I->hasOneUse()) {
1588  const User *Us = *I->users().begin();
1589  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1590  // only allow valid type combinations
1591  unsigned TypeSize =
1592  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1593  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1594  (TypeSize == 16 && EltSize == 8)) &&
1595  TypeSize * NumElems == 128) {
1596  ExtSize = TypeSize;
1597  }
1598  }
1599  }
1600  // Check whether the input data needs to be truncated
1601  TruncInst *T;
1602  if ((I->getOpcode() == Instruction::Store ||
1603  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1604  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1605  // Only allow valid type combinations
1606  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1607  if (((EltSize == 16 && TypeSize == 32) ||
1608  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1609  TypeSize * NumElems == 128)
1610  ExtSize = TypeSize;
1611  }
1612  }
1613 
1614  if (ExtSize * NumElems != 128 || NumElems < 4)
1615  return ScalarCost;
1616 
1617  // Any (aligned) i32 gather will not need to be scalarised.
1618  if (ExtSize == 32)
1619  return VectorCost;
1620  // For smaller types, we need to ensure that the gep's inputs are correctly
1621  // extended from a small enough value. Other sizes (including i64) are
1622  // scalarized for now.
1623  if (ExtSize != 8 && ExtSize != 16)
1624  return ScalarCost;
1625 
1626  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1627  Ptr = BC->getOperand(0);
1628  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1629  if (GEP->getNumOperands() != 2)
1630  return ScalarCost;
1631  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1632  // Scale needs to be correct (which is only relevant for i16s).
1633  if (Scale != 1 && Scale * 8 != ExtSize)
1634  return ScalarCost;
1635  // And we need to zext (not sext) the indexes from a small enough type.
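  // For example (illustrative): a GEP whose vector index is
  // "zext <8 x i8> %offs to <8 x i32>" is known to keep every offset within
  // the narrow range the gather's offset form can address.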
1636  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1637  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1638  return VectorCost;
1639  }
1640  return ScalarCost;
1641  }
1642  return ScalarCost;
1643 }
1644 
1645 InstructionCost
1646 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1647  Optional<FastMathFlags> FMF,
1648  TTI::TargetCostKind CostKind) {
1649  if (TTI::requiresOrderedReduction(FMF))
1650  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1651 
1652  EVT ValVT = TLI->getValueType(DL, ValTy);
1653  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1654  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1655  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1656 
1657  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1658 
1659  static const CostTblEntry CostTblAdd[]{
1660  {ISD::ADD, MVT::v16i8, 1},
1661  {ISD::ADD, MVT::v8i16, 1},
1662  {ISD::ADD, MVT::v4i32, 1},
1663  };
1664  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1665  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1666 
1667  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1668 }
1669 
1670 InstructionCost
1671 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1672  Type *ResTy, VectorType *ValTy,
1673  TTI::TargetCostKind CostKind) {
1674  EVT ValVT = TLI->getValueType(DL, ValTy);
1675  EVT ResVT = TLI->getValueType(DL, ResTy);
1676 
1677  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1678  std::pair<InstructionCost, MVT> LT =
1679  TLI->getTypeLegalizationCost(DL, ValTy);
1680 
1681  // The legal cases are:
1682  // VADDV u/s 8/16/32
1683  // VMLAV u/s 8/16/32
1684  // VADDLV u/s 32
1685  // VMLALV u/s 16/32
1686  // Codegen currently cannot always handle larger than legal vectors very
1687  // well, especially for predicated reductions where the mask needs to be
1688  // split, so restrict to 128bit or smaller input types.
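  // For example (illustrative): an add reduction of v8i16 into an i32 result
  // maps onto VADDV.s16, and a multiply-accumulate of two v4i32 vectors into
  // an i64 maps onto VMLALV.s32.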
1689  unsigned RevVTSize = ResVT.getSizeInBits();
1690  if (ValVT.getSizeInBits() <= 128 &&
1691  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1692  (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1693  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1694  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1695  }
1696 
1697  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1698  CostKind);
1699 }
1700 
1701 InstructionCost
1702 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1703  TTI::TargetCostKind CostKind) {
1704  switch (ICA.getID()) {
1705  case Intrinsic::get_active_lane_mask:
1706  // Currently we make a somewhat optimistic assumption that
1707  // active_lane_mask's are always free. In reality it may be freely folded
1708  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1709  // of add/icmp code. We may need to improve this in the future, but being
1710  // able to detect if it is free or not involves looking at a lot of other
1711  // code. We currently assume that the vectorizer inserted these, and knew
1712  // what it was doing in adding one.
1713  if (ST->hasMVEIntegerOps())
1714  return 0;
1715  break;
1716  case Intrinsic::sadd_sat:
1717  case Intrinsic::ssub_sat:
1718  case Intrinsic::uadd_sat:
1719  case Intrinsic::usub_sat: {
1720  if (!ST->hasMVEIntegerOps())
1721  break;
1722  Type *VT = ICA.getReturnType();
1723 
1724  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1725  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1726  LT.second == MVT::v16i8) {
1727  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1728  // need to extend the type, as it uses shr(qadd(shl, shl)).
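  // For example (illustrative): a v4i16 saturating add is promoted to v4i32,
  // so it needs the shl/shl, vqadd, shr sequence and is costed as 4
  // instructions rather than 1.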
1729  unsigned Instrs =
1730  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1731  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1732  }
1733  break;
1734  }
1735  case Intrinsic::abs:
1736  case Intrinsic::smin:
1737  case Intrinsic::smax:
1738  case Intrinsic::umin:
1739  case Intrinsic::umax: {
1740  if (!ST->hasMVEIntegerOps())
1741  break;
1742  Type *VT = ICA.getReturnType();
1743 
1744  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1745  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1746  LT.second == MVT::v16i8)
1747  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1748  break;
1749  }
1750  case Intrinsic::minnum:
1751  case Intrinsic::maxnum: {
1752  if (!ST->hasMVEFloatOps())
1753  break;
1754  Type *VT = ICA.getReturnType();
1755  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1756  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1757  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1758  break;
1759  }
1760  }
1761 
1762  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1763 }
1764 
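As a concrete illustration of the saturating-arithmetic case handled above: a v8i16 llvm.sadd.sat legalizes directly to an MVE VQADD, so no widening shifts are needed and Instrs stays at 1. A hedged sketch of the query, with the helper name being an assumption:

// Sketch: cost of llvm.sadd.sat on a type that maps straight onto VQADD.S16.
static InstructionCost querySAddSatCost(ARMTTIImpl &TTI, LLVMContext &Ctx) {
  Type *VecTy = FixedVectorType::get(Type::getInt16Ty(Ctx), 8); // v8i16
  IntrinsicCostAttributes Attrs(Intrinsic::sadd_sat, VecTy, {VecTy, VecTy});
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}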
1765 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1766  if (!F->isIntrinsic())
1767  return BaseT::isLoweredToCall(F);
1768 
1769  // Assume all Arm-specific intrinsics map to an instruction.
1770  if (F->getName().startswith("llvm.arm"))
1771  return false;
1772 
1773  switch (F->getIntrinsicID()) {
1774  default: break;
1775  case Intrinsic::powi:
1776  case Intrinsic::sin:
1777  case Intrinsic::cos:
1778  case Intrinsic::pow:
1779  case Intrinsic::log:
1780  case Intrinsic::log10:
1781  case Intrinsic::log2:
1782  case Intrinsic::exp:
1783  case Intrinsic::exp2:
1784  return true;
1785  case Intrinsic::sqrt:
1786  case Intrinsic::fabs:
1787  case Intrinsic::copysign:
1788  case Intrinsic::floor:
1789  case Intrinsic::ceil:
1790  case Intrinsic::trunc:
1791  case Intrinsic::rint:
1792  case Intrinsic::nearbyint:
1793  case Intrinsic::round:
1794  case Intrinsic::canonicalize:
1795  case Intrinsic::lround:
1796  case Intrinsic::llround:
1797  case Intrinsic::lrint:
1798  case Intrinsic::llrint:
1799  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1800  return true;
1801  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1802  return true;
1803  // Some operations can be handled by vector instructions, and we assume
1804  // unsupported vectors will be expanded into supported scalar ones.
1805  // TODO Handle scalar operations properly.
1806  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1807  case Intrinsic::masked_store:
1808  case Intrinsic::masked_load:
1809  case Intrinsic::masked_gather:
1810  case Intrinsic::masked_scatter:
1811  return !ST->hasMVEIntegerOps();
1812  case Intrinsic::sadd_with_overflow:
1813  case Intrinsic::uadd_with_overflow:
1814  case Intrinsic::ssub_with_overflow:
1815  case Intrinsic::usub_with_overflow:
1816  case Intrinsic::sadd_sat:
1817  case Intrinsic::uadd_sat:
1818  case Intrinsic::ssub_sat:
1819  case Intrinsic::usub_sat:
1820  return false;
1821  }
1822 
1823  return BaseT::isLoweredToCall(F);
1824 }
1825 
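For example, on a subtarget without double-precision hardware the check above reports a double sqrt as a call, while a float sqrt maps onto VSQRT. A sketch of such a query (the function name is illustrative, and llvm/IR/Module.h is assumed to be available in addition to this file's includes):

// Sketch: does llvm.sqrt.f64 end up as a library/runtime call here?
static bool sqrtF64NeedsCall(ARMTTIImpl &TTI, Module &M) {
  LLVMContext &Ctx = M.getContext();
  Function *SqrtF64 =
      Intrinsic::getDeclaration(&M, Intrinsic::sqrt, {Type::getDoubleTy(Ctx)});
  return TTI.isLoweredToCall(SqrtF64); // true when the subtarget lacks FP64
}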
1826 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1827  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1828  EVT VT = TLI->getValueType(DL, I.getType(), true);
1829  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1830  return true;
1831 
1832  // Check if an intrinsic will be lowered to a call and assume that any
1833  // other CallInst will generate a bl.
1834  if (auto *Call = dyn_cast<CallInst>(&I)) {
1835  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1836  switch(II->getIntrinsicID()) {
1837  case Intrinsic::memcpy:
1838  case Intrinsic::memset:
1839  case Intrinsic::memmove:
1840  return getNumMemOps(II) == -1;
1841  default:
1842  if (const Function *F = Call->getCalledFunction())
1843  return isLoweredToCall(F);
1844  }
1845  }
1846  return true;
1847  }
1848 
1849  // FPv5 provides conversions between integer, double-precision,
1850  // single-precision, and half-precision formats.
1851  switch (I.getOpcode()) {
1852  default:
1853  break;
1854  case Instruction::FPToSI:
1855  case Instruction::FPToUI:
1856  case Instruction::SIToFP:
1857  case Instruction::UIToFP:
1858  case Instruction::FPTrunc:
1859  case Instruction::FPExt:
1860  return !ST->hasFPARMv8Base();
1861  }
1862 
1863  // FIXME: Unfortunately the approach of checking the Operation Action does
1864  // not catch all cases of Legalization that use library calls. Our
1865  // Legalization step categorizes some transformations into library calls as
1866  // Custom, Expand or even Legal when doing type legalization. So for now
1867  // we have to special-case, for instance, the SDIV of 64-bit integers and the
1868  // use of floating point emulation.
1869  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1870  switch (ISD) {
1871  default:
1872  break;
1873  case ISD::SDIV:
1874  case ISD::UDIV:
1875  case ISD::SREM:
1876  case ISD::UREM:
1877  case ISD::SDIVREM:
1878  case ISD::UDIVREM:
1879  return true;
1880  }
1881  }
1882 
1883  // Assume all other non-float operations are supported.
1884  if (!VT.isFloatingPoint())
1885  return false;
1886 
1887  // We'll need a library call to handle most floats when using soft-float.
1888  if (TLI->useSoftFloat()) {
1889  switch (I.getOpcode()) {
1890  default:
1891  return true;
1892  case Instruction::Alloca:
1893  case Instruction::Load:
1894  case Instruction::Store:
1895  case Instruction::Select:
1896  case Instruction::PHI:
1897  return false;
1898  }
1899  }
1900 
1901  // We'll need a libcall to perform double precision operations on a single
1902  // precision only FPU.
1903  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1904  return true;
1905 
1906  // Likewise for half precision arithmetic.
1907  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1908  return true;
1909 
1910  return false;
1911 }
1912 
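In particular, 64-bit integer division has no ARM instruction and is lowered to a runtime routine such as __aeabi_ldivmod, so the SDIV/UDIV cases above report it as a call and any loop containing it will not become a hardware loop. A plain C++ illustration of that shape (not code from this file):

#include <cstdint>
// Sketch: the division below compiles to a libcall on 32-bit ARM targets.
int64_t div64(int64_t Num, int64_t Den) { return Num / Den; }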
1913 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1914  AssumptionCache &AC,
1915  TargetLibraryInfo *LibInfo,
1916  HardwareLoopInfo &HWLoopInfo) {
1917  // Low-overhead branches are only supported in the 'low-overhead branch'
1918  // extension of v8.1-m.
1919  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1920  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1921  return false;
1922  }
1923 
1924  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1925  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1926  return false;
1927  }
1928 
1929  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1930  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1931  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1932  return false;
1933  }
1934 
1935  const SCEV *TripCountSCEV =
1936  SE.getAddExpr(BackedgeTakenCount,
1937  SE.getOne(BackedgeTakenCount->getType()));
1938 
1939  // We need to store the trip count in LR, a 32-bit register.
1940  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1941  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1942  return false;
1943  }
1944 
1945  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1946  // point in generating a hardware loop if that's going to happen.
1947 
1948  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1949  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1950  switch (Call->getIntrinsicID()) {
1951  default:
1952  break;
1953  case Intrinsic::start_loop_iterations:
1954  case Intrinsic::test_start_loop_iterations:
1955  case Intrinsic::loop_decrement:
1956  case Intrinsic::loop_decrement_reg:
1957  return true;
1958  }
1959  }
1960  return false;
1961  };
1962 
1963  // Scan the instructions to see if there's any that we know will turn into a
1964  // call or if this loop is already a low-overhead loop or will become a tail
1965  // predicated loop.
1966  bool IsTailPredLoop = false;
1967  auto ScanLoop = [&](Loop *L) {
1968  for (auto *BB : L->getBlocks()) {
1969  for (auto &I : *BB) {
1970  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1971  isa<InlineAsm>(I)) {
1972  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1973  return false;
1974  }
1975  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1976  IsTailPredLoop |=
1977  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1978  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1979  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1980  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1981  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1982  }
1983  }
1984  return true;
1985  };
1986 
1987  // Visit inner loops.
1988  for (auto Inner : *L)
1989  if (!ScanLoop(Inner))
1990  return false;
1991 
1992  if (!ScanLoop(L))
1993  return false;
1994 
1995  // TODO: Check whether the trip count calculation is expensive. If L is the
1996  // inner loop but we know it has a low trip count, calculating that trip
1997  // count (in the parent loop) may be detrimental.
1998 
1999  LLVMContext &C = L->getHeader()->getContext();
2000  HWLoopInfo.CounterInReg = true;
2001  HWLoopInfo.IsNestingLegal = false;
2002  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2003  HWLoopInfo.CountType = Type::getInt32Ty(C);
2004  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2005  return true;
2006 }
2007 
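A client pass would normally combine this profitability hook with the structural check carried by HardwareLoopInfo itself, in the same way preferPredicateOverEpilogue() does further down. A minimal sketch, with the helper name being an assumption:

// Sketch: decide whether to commit to a low-overhead loop for L.
static bool wantsLowOverheadLoop(ARMTTIImpl &TTI, Loop *L, ScalarEvolution &SE,
                                 AssumptionCache &AC, TargetLibraryInfo *TLI,
                                 LoopInfo &LI, DominatorTree &DT) {
  HardwareLoopInfo HWLoopInfo(L);
  if (!TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo))
    return false;
  // Shape checks (preheader, latch, exits) live in HardwareLoopInfo.
  return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}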
2008 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2009  // We don't allow icmps, and because we only look at single-block loops,
2010  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2011  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2012  return false;
2013  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2014  // not currently canonical, but soon will be. Code without them uses icmp, and
2015  // so is not tail predicated as per the condition above. In order to get the
2016  // same performance we treat min and max the same as an icmp for tailpred
2017  // purposes for the moment (we often rely on non-tailpred and higher VF's to
2018  // pick more optimal instructions like VQDMULH. They need to be recognized
2019  // directly by the vectorizer).
2020  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2021  if ((II->getIntrinsicID() == Intrinsic::smin ||
2022  II->getIntrinsicID() == Intrinsic::smax ||
2023  II->getIntrinsicID() == Intrinsic::umin ||
2024  II->getIntrinsicID() == Intrinsic::umax) &&
2025  ++ICmpCount > 1)
2026  return false;
2027 
2028  if (isa<FCmpInst>(&I))
2029  return false;
2030 
2031  // We could allow extending/narrowing FP loads/stores, but codegen is
2032  // too inefficient so reject this for now.
2033  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2034  return false;
2035 
2036  // Extends have to be extending-loads
2037  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2038  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2039  return false;
2040 
2041  // Truncs have to be narrowing-stores
2042  if (isa<TruncInst>(&I) )
2043  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2044  return false;
2045 
2046  return true;
2047 }
2048 
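In source terms, the per-instruction rules above accept loops whose only widening is folded into the loads and whose only narrowing feeds the store, with a single backedge compare. A plain C++ sketch of such a loop (illustrative only, not code from this file):

#include <cstdint>
// Sketch: the multiply needs extra precision, so the vectorizer's extends are
// fed directly by the loads and its truncate feeds the store, which is exactly
// the shape canTailPredicateInstruction() accepts.
void scale_q8(uint8_t *Dst, const uint8_t *A, const uint8_t *B, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = static_cast<uint8_t>((A[I] * B[I]) >> 8);
}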
2049 // To set up a tail-predicated loop, we need to know the total number of
2050 // elements processed by that loop. Thus, we need to determine the element
2051 // size and:
2052 // 1) it should be uniform for all operations in the vector loop, so we
2053 // e.g. don't want any widening/narrowing operations.
2054 // 2) it should be smaller than i64s because we don't have vector operations
2055 // that work on i64s.
2056 // 3) we don't want elements to be reversed or shuffled, to make sure the
2057 // tail-predication masks/predicates the right lanes.
2058 //
2059 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2060  const DataLayout &DL,
2061  const LoopAccessInfo *LAI) {
2062  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2063 
2064  // If there are live-out values, it is probably a reduction. We can predicate
2065  // most reduction operations freely under MVE using a combination of
2066  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2067  // floating point and integer reductions, but don't check for operators
2068  // specifically here. If the value ends up not being a reduction (and so the
2069  // vectorizer cannot tailfold the loop), we should fall back to standard
2070  // vectorization automatically.
2071  SmallVector<Instruction *, 8> LiveOuts;
2072  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2073  bool ReductionsDisabled =
2074  EnableTailPredication == TailPredication::EnabledNoReductions ||
2075  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2076 
2077  for (auto *I : LiveOuts) {
2078  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2079  !I->getType()->isHalfTy()) {
2080  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2081  "live-out value\n");
2082  return false;
2083  }
2084  if (ReductionsDisabled) {
2085  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2086  return false;
2087  }
2088  }
2089 
2090  // Next, check that all instructions can be tail-predicated.
2091  PredicatedScalarEvolution PSE = LAI->getPSE();
2092  SmallVector<Instruction *, 16> LoadStores;
2093  int ICmpCount = 0;
2094 
2095  for (BasicBlock *BB : L->blocks()) {
2096  for (Instruction &I : BB->instructionsWithoutDebug()) {
2097  if (isa<PHINode>(&I))
2098  continue;
2099  if (!canTailPredicateInstruction(I, ICmpCount)) {
2100  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2101  return false;
2102  }
2103 
2104  Type *T = I.getType();
2105  if (T->isPointerTy())
2106  T = T->getPointerElementType();
2107 
2108  if (T->getScalarSizeInBits() > 32) {
2109  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2110  return false;
2111  }
2112  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2113  Value *Ptr = getLoadStorePointerOperand(&I);
2114  Type *AccessTy = getLoadStoreType(&I);
2115  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2116  if (NextStride == 1) {
2117  // TODO: for now only allow consecutive strides of 1. We could support
2118  // other strides as long as it is uniform, but let's keep it simple
2119  // for now.
2120  continue;
2121  } else if (NextStride == -1 ||
2122  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2123  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2124  LLVM_DEBUG(dbgs()
2125  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2126  "be tail-predicated\n.");
2127  return false;
2128  // TODO: don't tail predicate if there is a reversed load?
2129  } else if (EnableMaskedGatherScatters) {
2130  // Gather/scatters do allow loading from arbitrary strides, at
2131  // least if they are loop invariant.
2132  // TODO: Loop variant strides should in theory work, too, but
2133  // this requires further testing.
2134  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2135  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2136  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2137  if (PSE.getSE()->isLoopInvariant(Step, L))
2138  continue;
2139  }
2140  }
2141  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2142  "tail-predicate\n.");
2143  return false;
2144  }
2145  }
2146  }
2147 
2148  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2149  return true;
2150 }
2151 
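By contrast, a fixed stride of 2 or 4 is normally vectorized as a VLD2/VLD4 interleaving access, which the loop-level check above rejects because the deinterleaved lanes cannot be predicated individually. A plain C++ sketch of a loop refused for this reason (illustrative only, not code from this file):

#include <cstdint>
// Sketch: the stride-2 loads below become a vld2-style interleaving access.
void sum_pairs(int16_t *Dst, const int16_t *Src, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[2 * I] + Src[2 * I + 1]; // consecutive stride of 2
}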
2152 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2153  ScalarEvolution &SE,
2154  AssumptionCache &AC,
2155  TargetLibraryInfo *TLI,
2156  DominatorTree *DT,
2157  const LoopAccessInfo *LAI) {
2158  if (!EnableTailPredication) {
2159  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2160  return false;
2161  }
2162 
2163  // Creating a predicated vector loop is the first step for generating a
2164  // tail-predicated hardware loop, for which we need the MVE masked
2165  // load/stores instructions:
2166  if (!ST->hasMVEIntegerOps())
2167  return false;
2168 
2169  // For now, restrict this to single block loops.
2170  if (L->getNumBlocks() > 1) {
2171  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2172  "loop.\n");
2173  return false;
2174  }
2175 
2176  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2177 
2178  HardwareLoopInfo HWLoopInfo(L);
2179  if (!HWLoopInfo.canAnalyze(*LI)) {
2180  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2181  "analyzable.\n");
2182  return false;
2183  }
2184 
2185  // This checks if we have the low-overhead branch architecture
2186  // extension, and if we will create a hardware-loop:
2187  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2188  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2189  "profitable.\n");
2190  return false;
2191  }
2192 
2193  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2194  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2195  "a candidate.\n");
2196  return false;
2197  }
2198 
2199  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2200 }
2201 
2202 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2203  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2204  return false;
2205 
2206  // Intrinsic @llvm.get.active.lane.mask is supported.
2207  // It is used in the MVETailPredication pass, which requires the number of
2208  // elements processed by this vector loop to setup the tail-predicated
2209  // loop.
2210  return true;
2211 }
2212 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2213  TTI::UnrollingPreferences &UP,
2214  OptimizationRemarkEmitter *ORE) {
2215  // Enable Upper bound unrolling universally, not dependent upon the
2216  // conditions below.
2217  UP.UpperBound = true;
2218 
2219  // Only currently enable these preferences for M-Class cores.
2220  if (!ST->isMClass())
2221  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2222 
2223  // Disable loop unrolling for Oz and Os.
2224  UP.OptSizeThreshold = 0;
2225  UP.PartialOptSizeThreshold = 0;
2226  if (L->getHeader()->getParent()->hasOptSize())
2227  return;
2228 
2229  SmallVector<BasicBlock*, 4> ExitingBlocks;
2230  L->getExitingBlocks(ExitingBlocks);
2231  LLVM_DEBUG(dbgs() << "Loop has:\n"
2232  << "Blocks: " << L->getNumBlocks() << "\n"
2233  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2234 
2235  // Only allow another exit other than the latch. This acts as an early exit
2236  // as it mirrors the profitability calculation of the runtime unroller.
2237  if (ExitingBlocks.size() > 2)
2238  return;
2239 
2240  // Limit the CFG of the loop body for targets with a branch predictor.
2241  // Allowing 4 blocks permits if-then-else diamonds in the body.
2242  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2243  return;
2244 
2245  // Don't unroll vectorized loops, including the remainder loop
2246  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2247  return;
2248 
2249  // Scan the loop: don't unroll loops with calls as this could prevent
2250  // inlining.
2251  InstructionCost Cost = 0;
2252  for (auto *BB : L->getBlocks()) {
2253  for (auto &I : *BB) {
2254  // Don't unroll vectorised loops. MVE does not benefit from it as much as
2255  // scalar code.
2256  if (I.getType()->isVectorTy())
2257  return;
2258 
2259  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2260  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2261  if (!isLoweredToCall(F))
2262  continue;
2263  }
2264  return;
2265  }
2266 
2267  SmallVector<const Value*, 4> Operands(I.operand_values());
2268  Cost +=
2269  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2270 
2271  }
2272 
2273  // On v6m cores, there are very few registers available. We can easily end up
2274  // spilling and reloading more registers in an unrolled loop. Look at the
2275  // number of LCSSA phis as a rough measure of how many registers will need to
2276  // be live out of the loop, reducing the default unroll count if more than 1
2277  // value is needed. In the long run, all of this should be learnt by a
2278  // machine.
2279  unsigned UnrollCount = 4;
2280  if (ST->isThumb1Only()) {
2281  unsigned ExitingValues = 0;
2282  SmallVector<BasicBlock *, 4> ExitBlocks;
2283  L->getExitBlocks(ExitBlocks);
2284  for (auto *Exit : ExitBlocks) {
2285  // Count the number of LCSSA phis. Exclude values coming from GEP's as
2286  // only the last is expected to be needed for address operands.
2287  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2288  return PH.getNumOperands() != 1 ||
2289  !isa<GetElementPtrInst>(PH.getOperand(0));
2290  });
2291  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2292  }
2293  if (ExitingValues)
2294  UnrollCount /= ExitingValues;
2295  if (UnrollCount <= 1)
2296  return;
2297  }
2298 
2299  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2300  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2301 
2302  UP.Partial = true;
2303  UP.Runtime = true;
2304  UP.UnrollRemainder = true;
2305  UP.DefaultUnrollRuntimeCount = UnrollCount;
2306  UP.UnrollAndJam = true;
2307  UP.UnrollAndJamInnerLoopThreshold = 60;
2308 
2309  // Force-unrolling small loops can be very useful because of the
2310  // branch-taken cost of the backedge.
2311  if (Cost < 12)
2312  UP.Force = true;
2313 }
2314 
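The preferences set above mainly target small scalar loops: runtime unrolling by the default count of 4, with Force enabled when the body costs less than 12, to amortize the backedge branch on M-class cores. A plain C++ sketch of the kind of loop this benefits (illustrative only, not code from this file):

// Sketch: a tiny call-free scalar loop; the backedge branch dominates its
// cost, so runtime unrolling by 4 pays off.
int sum_ints(const int *Data, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += Data[I];
  return S;
}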
2315 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2316  TTI::PeelingPreferences &PP) {
2317  BaseT::getPeelingPreferences(L, SE, PP);
2318 }
2319 
2320 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2321  TTI::ReductionFlags Flags) const {
2322  if (!ST->hasMVEIntegerOps())
2323  return false;
2324 
2325  unsigned ScalarBits = Ty->getScalarSizeInBits();
2326  switch (Opcode) {
2327  case Instruction::Add:
2328  return ScalarBits <= 64;
2329  default:
2330  return false;
2331  }
2332 }
2333 
2334 bool ARMTTIImpl::preferPredicatedReductionSelect(
2335  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2336  if (!ST->hasMVEIntegerOps())
2337  return false;
2338  return true;
2339 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1636
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:461
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12508
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:866
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:39
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:489
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:212
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:689
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:563
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:265
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1355
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:62
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1505
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:638
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:104
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:721
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:370
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:425
T
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:709
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:942
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:664
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1061
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:645
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1196
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:729
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:308
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2147
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:665
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1474
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1046
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:151
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:757
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:508
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:501
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:785
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:178
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:215
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:621
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:112
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1546
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5373
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:100
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:748
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:437
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:485
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:131
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1052
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1410
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:539
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:839
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:690
llvm::Optional
Definition: APInt.h:33
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:637
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1847
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:751
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6240
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:685
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1671
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1055
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2212
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2128
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1233
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:864
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1076
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1913
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1177
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:106
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:673
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1340
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:317
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:493
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:647
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:679
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:729
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1114
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:369
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:863
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1069
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1597
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2032
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:224
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:497
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:727
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:309
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2145
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:735
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:456
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:226
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:328
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2232
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1583
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:503
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:145
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:191
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:110
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2059
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:347
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1765
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:739
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1460
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:108
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:618
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2202
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:686
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:700
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:307
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1066
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:896
llvm::None
const NoneType None
Definition: None.h:23
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:933
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:687
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:119
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
NOTE: This is soft-deprecated. Please use isAllOnes() instead.
Definition: APInt.h:356
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:282
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1307
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1120
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1059
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2152
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:118
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4111
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1112
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:190
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:865
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1441
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1671
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:341
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:817
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:956
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1186
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1299
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:368
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
uint64_t
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:371
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:786
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1037
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4782
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:885
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:820
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:432
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:890
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1702
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:515
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:141
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:80
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:714
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:693
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1607
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:124
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1488
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:744
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2320
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:851
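As an aside, a minimal sketch of how an overloaded intrinsic call is emitted through this interface; the builder, operand values, and the choice of llvm.umin are illustrative assumptions, not code from this file.

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // Emit %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b).
  // B is an existing IRBuilder<>; A and Bv are assumed i32 values.
  static Value *emitUMin(IRBuilder<> &B, Value *A, Value *Bv) {
    return B.CreateIntrinsic(Intrinsic::umin, {B.getInt32Ty()}, {A, Bv},
                             /*FMFSource=*/nullptr, "umin");
  }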
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1215
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:882
isSSATMinMaxPattern
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:338
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
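For orientation, a minimal sketch of how m_Value and m_Constant compose inside a match() call; isAddOfConstant is a hypothetical helper, not part of this file.

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace PatternMatch;

  // True if V is an 'add' whose second operand is any Constant;
  // m_Value(X) binds the first operand, m_Constant() matches and ignores.
  static bool isAddOfConstant(Value *V) {
    Value *X;
    return match(V, m_Add(m_Value(X), m_Constant()));
  }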
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2118
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1083
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:155
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:666
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1096
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:678
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:216
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:891
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:468
trunc
README-FPStack.txt note: the FP stackifier should handle simple permutations to reduce the number of shuffles (e.g. around trunc).
Definition: README-FPStack.txt:63
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:691
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1767
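A small sketch of the usual query pattern when building cost tables; the TLI pointer and the use of FAdd are assumptions for illustration.

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Map the IR-level opcode onto its SelectionDAG counterpart,
  // e.g. Instruction::FAdd -> ISD::FADD.
  static int mapToISD(const TargetLoweringBase *TLI) {
    return TLI->InstructionOpcodeToISD(Instruction::FAdd);
  }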
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:643
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:565
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:692
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2008
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:661
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12776
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
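The typical shape of a target instCombineIntrinsic-style fold that relies on this routine, sketched under the assumption of a hypothetical simplifyMyIntrinsic helper.

  #include "llvm/ADT/Optional.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/Transforms/InstCombine/InstCombiner.h"
  using namespace llvm;

  Value *simplifyMyIntrinsic(IntrinsicInst &II); // hypothetical helper

  static Optional<Instruction *> tryFold(InstCombiner &IC, IntrinsicInst &II) {
    // If the call folds to a simpler value, RAUW it through the combiner
    // (keeping the worklist consistent) and return the now-dead call.
    if (Value *V = simplifyMyIntrinsic(II))
      return IC.replaceInstUsesWith(II, V);
    return None;
  }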
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:145
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2133
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1296
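A self-contained illustration of the IEEE minNum semantics mentioned above (a quiet NaN operand is ignored in favour of the numeric one); the values are arbitrary.

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  static APFloat minnumExample() {
    APFloat A(1.5f);
    APFloat B = APFloat::getQNaN(APFloat::IEEEsingle());
    return minnum(A, B); // yields 1.5, not NaN
  }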
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:130
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:2020
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:296
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:499
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1341
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:196
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:293
ISDOpcodes.h
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
llvm::TypeSize
Definition: TypeSize.h:416
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1199
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
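A brief illustration of the usual guard; the fold it would protect is hypothetical. Value::hasOneUse() is the common shorthand for the N == 1 case.

  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Only attempt a rewriting fold when exactly one user exists, so no
  // other use would be left referring to the original instruction.
  static bool canFoldSingleUse(const Instruction &I) {
    return I.hasNUses(1);
  }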
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
powi
README note: blocked on not handling X*X*X as powi(X, 3).
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:842
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:597
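A minimal example of broadcasting a narrow pattern with getSplat; the concrete widths are illustrative.

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  static APInt splatExample() {
    APInt Byte(/*numBits=*/8, /*val=*/0x01);
    // Broadcast the 8-bit pattern across 32 bits -> 0x01010101.
    return APInt::getSplat(/*NewLen=*/32, Byte);
  }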
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:740
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
Definition: TargetLowering.h:1150
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:96
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:853
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:833
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1206
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1826
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:149
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:451
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:657
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:777
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1343
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:872
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1355
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:688
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1088
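Sketch of a typical query against loop metadata; the wrapper function is an assumption for illustration, while "llvm.loop.unroll.disable" is an existing loop attribute name.

  #include "llvm/Analysis/LoopInfo.h"
  using namespace llvm;

  // True when unrolling has been explicitly disabled on L via metadata.
  static bool unrollDisabled(const Loop *L) {
    return getBooleanLoopAttribute(L, "llvm.loop.unroll.disable");
  }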
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
Definition: ARMISelLowering.cpp:20976
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1434
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2315
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1155
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1109
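Sketch of splatting a scalar into a fixed-width vector through an existing builder; the element count and names are illustrative.

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Produce a <4 x Ty> value with every lane equal to Scalar.
  static Value *splat4(IRBuilder<> &B, Value *Scalar) {
    return B.CreateVectorSplat(/*NumElts=*/4, Scalar, "splat");
  }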
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:377
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5328
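Illustrative use: the helper returns null for anything that is not a load or store, so it doubles as a cheap type check (the wrapper name is an assumption).

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // The address an instruction accesses, or nullptr if it is neither a
  // load nor a store.
  static const Value *accessedPointer(const Instruction &I) {
    return getLoadStorePointerOperand(&I);
  }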
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2417
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
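A small illustration: building a low-bit mask (the widths are chosen arbitrarily).

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  static APInt low12Mask() {
    // 32-bit value with the low 12 bits set: 0x00000FFF.
    return APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/12);
  }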
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:454
BB
ARM README note on common register-allocation spilling sequences (str/ldr/sxth/mla) and merging them so that the store may become dead.
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2334
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:855
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:818
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1417
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:587
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:726
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
Definition: ScalarEvolution.cpp:7494
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2179
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2755
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2213
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:103
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:165
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:814
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1646
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:213
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:140
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:646
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:644
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2123
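A tiny example contrasting the signed and unsigned comparisons above (smax vs. umax); the values are arbitrary.

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  static void signedVsUnsignedMax() {
    APInt A(8, 0x80), B(8, 0x7f);     // 0x80 is -128 signed, 128 unsigned.
    APInt S = APIntOps::smax(A, B);   // 0x7f: 127 > -128 when signed.
    APInt U = APIntOps::umax(A, B);   // 0x80: 128 > 127 when unsigned.
    (void)S; (void)U;
  }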
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:662
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:200
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:815
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:166
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:289
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:20981
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:503
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46