1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/IntrinsicInst.h"
24 #include "llvm/IR/Intrinsics.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
37 #include <algorithm>
38 #include <cassert>
39 #include <cstdint>
40 #include <utility>
41 
42 using namespace llvm;
43 
44 #define DEBUG_TYPE "armtti"
45 
47  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
48  cl::desc("Enable the generation of masked loads and stores"));
49 
51  "disable-arm-loloops", cl::Hidden, cl::init(false),
52  cl::desc("Disable the generation of low-overhead loops"));
53 
54 static cl::opt<bool>
55  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
56  cl::desc("Enable the generation of WLS loops"));
57 
59 extern cl::opt<TailPredication::Mode> EnableTailPredication;
60 
61 extern cl::opt<bool> EnableMaskedGatherScatters;
62 
63 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
64 /// Convert a vector load intrinsic into a simple llvm load instruction.
65 /// This is beneficial when the underlying object being addressed comes
66 /// from a constant, since we get constant-folding for free.
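// For example (illustrative sketch, not from the original source):
//   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
// becomes a plain load that later passes can constant-fold:
//   %v = load <4 x i32>, ptr %p, align 4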
67 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
68  InstCombiner::BuilderTy &Builder) {
69  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
70 
71  if (!IntrAlign)
72  return nullptr;
73 
74  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
75  ? MemAlign
76  : IntrAlign->getLimitedValue();
77 
78  if (!isPowerOf2_32(Alignment))
79  return nullptr;
80 
81  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
82  PointerType::get(II.getType(), 0));
83  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
84 }
85 
86 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
87  const Function *Callee) const {
88  const TargetMachine &TM = getTLI()->getTargetMachine();
89  const FeatureBitset &CallerBits =
90  TM.getSubtargetImpl(*Caller)->getFeatureBits();
91  const FeatureBitset &CalleeBits =
92  TM.getSubtargetImpl(*Callee)->getFeatureBits();
93 
94  // To inline a callee, all features not in the allowed list must match exactly.
95  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
96  (CalleeBits & ~InlineFeaturesAllowed);
97  // For features in the allowed list, the callee's features must be a subset
98  // of the caller's.
99  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
100  (CalleeBits & InlineFeaturesAllowed);
101  return MatchExact && MatchSubset;
102 }
103 
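// Illustrative sketch of the distinction below (editor's note, not from the
// original source): a post-indexed access is "ldr r0, [r1], #4" (load, then
// bump the pointer), while a pre-indexed access is "ldr r0, [r1, #4]!"
// (bump the pointer, then load).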
104 TTI::AddressingModeKind
105 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
106  ScalarEvolution *SE) const {
107  if (ST->hasMVEIntegerOps())
108  return TTI::AMK_PostIndexed;
109 
110  if (L->getHeader()->getParent()->hasOptSize())
111  return TTI::AMK_None;
112 
113  if (ST->isMClass() && ST->isThumb2() &&
114  L->getNumBlocks() == 1)
115  return TTI::AMK_PreIndexed;
116 
117  return TTI::AMK_None;
118 }
119 
120 Optional<Instruction *>
121 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
122  using namespace PatternMatch;
123  Intrinsic::ID IID = II.getIntrinsicID();
124  switch (IID) {
125  default:
126  break;
127  case Intrinsic::arm_neon_vld1: {
129  Align MemAlign = getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(),
130  &II, &IC.getAssumptionCache(), &IC.getDominatorTree());
131  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
132  return IC.replaceInstUsesWith(II, V);
133  }
134  break;
135  }
136 
137  case Intrinsic::arm_neon_vld2:
138  case Intrinsic::arm_neon_vld3:
139  case Intrinsic::arm_neon_vld4:
140  case Intrinsic::arm_neon_vld2lane:
141  case Intrinsic::arm_neon_vld3lane:
142  case Intrinsic::arm_neon_vld4lane:
143  case Intrinsic::arm_neon_vst1:
144  case Intrinsic::arm_neon_vst2:
145  case Intrinsic::arm_neon_vst3:
146  case Intrinsic::arm_neon_vst4:
147  case Intrinsic::arm_neon_vst2lane:
148  case Intrinsic::arm_neon_vst3lane:
149  case Intrinsic::arm_neon_vst4lane: {
150  Align MemAlign =
151  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
152  &IC.getAssumptionCache(), &IC.getDominatorTree());
153  unsigned AlignArg = II.arg_size() - 1;
154  Value *AlignArgOp = II.getArgOperand(AlignArg);
155  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
156  if (Align && *Align < MemAlign) {
157  return IC.replaceOperand(
158  II, AlignArg,
159  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
160  false));
161  }
162  break;
163  }
164 
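  // Illustrative example of the round-trip fold handled below (editor's
  // sketch): converting a predicate to an integer and straight back, e.g.
  //   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
  //   %q = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)
  // can simply be replaced by %p.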
165  case Intrinsic::arm_mve_pred_i2v: {
166  Value *Arg = II.getArgOperand(0);
167  Value *ArgArg;
168  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
169  PatternMatch::m_Value(ArgArg))) &&
170  II.getType() == ArgArg->getType()) {
171  return IC.replaceInstUsesWith(II, ArgArg);
172  }
173  Constant *XorMask;
174  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
175  PatternMatch::m_Value(ArgArg)),
176  PatternMatch::m_Constant(XorMask))) &&
177  II.getType() == ArgArg->getType()) {
178  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
179  if (CI->getValue().trunc(16).isAllOnes()) {
180  auto TrueVector = IC.Builder.CreateVectorSplat(
181  cast<FixedVectorType>(II.getType())->getNumElements(),
182  IC.Builder.getTrue());
183  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
184  }
185  }
186  }
187  KnownBits ScalarKnown(32);
188  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
189  ScalarKnown, 0)) {
190  return &II;
191  }
192  break;
193  }
194  case Intrinsic::arm_mve_pred_v2i: {
195  Value *Arg = II.getArgOperand(0);
196  Value *ArgArg;
197  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
198  PatternMatch::m_Value(ArgArg)))) {
199  return IC.replaceInstUsesWith(II, ArgArg);
200  }
201  if (!II.getMetadata(LLVMContext::MD_range)) {
202  Type *IntTy32 = Type::getInt32Ty(II.getContext());
203  Metadata *M[] = {
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
205  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
206  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
207  return &II;
208  }
209  break;
210  }
211  case Intrinsic::arm_mve_vadc:
212  case Intrinsic::arm_mve_vadc_predicated: {
213  unsigned CarryOp =
214  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
215  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
216  "Bad type for intrinsic!");
217 
218  KnownBits CarryKnown(32);
219  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
220  CarryKnown)) {
221  return &II;
222  }
223  break;
224  }
225  case Intrinsic::arm_mve_vmldava: {
226  Instruction *I = cast<Instruction>(&II);
227  if (I->hasOneUse()) {
228  auto *User = cast<Instruction>(*I->user_begin());
229  Value *OpZ;
230  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
231  match(I->getOperand(3), m_Zero())) {
232  Value *OpX = I->getOperand(4);
233  Value *OpY = I->getOperand(5);
234  Type *OpTy = OpX->getType();
235 
236  IC.Builder.SetInsertPoint(User);
237  Value *V =
238  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
239  {I->getOperand(0), I->getOperand(1),
240  I->getOperand(2), OpZ, OpX, OpY});
241 
242  IC.replaceInstUsesWith(*User, V);
243  return IC.eraseInstFromFunction(*User);
244  }
245  }
246  return None;
247  }
248  }
249  return None;
250 }
251 
252 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
253  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
254  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
255  std::function<void(Instruction *, unsigned, APInt, APInt &)>
256  SimplifyAndSetOp) const {
257 
258  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
259  // index of the operand that selects a Top/Bottom instruction, which can
260  // change between intrinsics.
261  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
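  // For example (illustrative): a "bottom" narrowing op (IsTop == 0)
  // overwrites the even lanes of operand 0 with its results, so only the odd
  // lanes of operand 0 are demanded; a "top" op is the other way around.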
262  unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
263  unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
264 
265  // Only the odd or even lanes of operand 0 will be demanded, depending
266  // on whether this is a top or bottom instruction.
267  APInt DemandedElts =
268  APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
269  : APInt::getHighBitsSet(2, 1));
270  SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
271  // The other lanes will be defined from the inserted elements.
272  UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
273  : APInt::getHighBitsSet(2, 1));
274  return None;
275  };
276 
277  switch (II.getIntrinsicID()) {
278  default:
279  break;
280  case Intrinsic::arm_mve_vcvt_narrow:
281  SimplifyNarrowInstrTopBottom(2);
282  break;
283  case Intrinsic::arm_mve_vqmovn:
284  SimplifyNarrowInstrTopBottom(4);
285  break;
286  case Intrinsic::arm_mve_vshrn:
287  SimplifyNarrowInstrTopBottom(7);
288  break;
289  }
290 
291  return None;
292 }
293 
294 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
295  TTI::TargetCostKind CostKind) {
296  assert(Ty->isIntegerTy());
297 
298  unsigned Bits = Ty->getPrimitiveSizeInBits();
299  if (Bits == 0 || Imm.getActiveBits() >= 64)
300  return 4;
301 
302  int64_t SImmVal = Imm.getSExtValue();
303  uint64_t ZImmVal = Imm.getZExtValue();
304  if (!ST->isThumb()) {
305  if ((SImmVal >= 0 && SImmVal < 65536) ||
306  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
307  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
308  return 1;
309  return ST->hasV6T2Ops() ? 2 : 3;
310  }
311  if (ST->isThumb2()) {
312  if ((SImmVal >= 0 && SImmVal < 65536) ||
313  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
314  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
315  return 1;
316  return ST->hasV6T2Ops() ? 2 : 3;
317  }
318  // Thumb1: any i8 immediate costs 1.
319  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
320  return 1;
321  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
322  return 2;
323  // Load from constantpool.
324  return 3;
325 }
326 
327 // Constants smaller than 256 fit in the immediate field of
328 // Thumb1 instructions, so we return a cost of zero for them and 1 otherwise.
329 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
330  const APInt &Imm, Type *Ty) {
331  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
332  return 0;
333 
334  return 1;
335 }
336 
337 // Checks whether Inst is part of a min(max()) or max(min()) pattern
338 // that will be matched to an SSAT instruction. Returns the instruction being
339 // saturated, or null if no saturation pattern was found.
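// For example (illustrative): smin(smax(x, -128), 127) clamps x to the range
// of an i8 and can be selected as "ssat r0, #8, r1"; the value returned here
// would be x, the instruction being saturated.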
340 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
341  Value *LHS, *RHS;
342  ConstantInt *C;
343  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
344 
345  if (InstSPF == SPF_SMAX &&
346  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
347  C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
348 
349  auto isSSatMin = [&](Value *MinInst) {
350  if (isa<SelectInst>(MinInst)) {
351  Value *MinLHS, *MinRHS;
352  ConstantInt *MinC;
353  SelectPatternFlavor MinSPF =
354  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
355  if (MinSPF == SPF_SMIN &&
356  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
357  MinC->getValue() == ((-Imm) - 1))
358  return true;
359  }
360  return false;
361  };
362 
363  if (isSSatMin(Inst->getOperand(1)))
364  return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
365  if (Inst->hasNUses(2) &&
366  (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
367  return Inst->getOperand(1);
368  }
369  return nullptr;
370 }
371 
372 // Look for an FP saturation pattern, where the instruction can be simplified
373 // to an fptosi.sat: max(min(fptosi)). The constant in this case is always free.
374 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
375  if (Imm.getBitWidth() != 64 ||
376  Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
377  return false;
378  Value *FP = isSSATMinMaxPattern(Inst, Imm);
379  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
380  FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
381  if (!FP)
382  return false;
383  return isa<FPToSIInst>(FP);
384 }
385 
386 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
387  const APInt &Imm, Type *Ty,
388  TTI::TargetCostKind CostKind,
389  Instruction *Inst) {
390  // Division by a constant can be turned into multiplication, but only if we
391  // know it's constant. So it's not so much that the immediate is cheap (it's
392  // not), but that the alternative is worse.
393  // FIXME: this is probably unneeded with GlobalISel.
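  // For example (illustrative): "udiv i32 %x, 10" is usually lowered to a
  // umull by a magic constant plus shifts rather than an actual divide, so
  // hoisting the constant 10 out as a plain value would defeat that.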
394  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
395  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
396  Idx == 1)
397  return 0;
398 
399  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
400  // splitting any large offsets.
401  if (Opcode == Instruction::GetElementPtr && Idx != 0)
402  return 0;
403 
404  if (Opcode == Instruction::And) {
405  // UXTB/UXTH
406  if (Imm == 255 || Imm == 65535)
407  return 0;
408  // Conversion to BIC is free, and means we can use ~Imm instead.
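  // For example (illustrative): "and r0, r0, #0xffffff00" can instead be
  // encoded as "bic r0, r0, #0xff", so whichever of Imm/~Imm is cheaper wins.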
409  return std::min(getIntImmCost(Imm, Ty, CostKind),
410  getIntImmCost(~Imm, Ty, CostKind));
411  }
412 
413  if (Opcode == Instruction::Add)
414  // Conversion to SUB is free, and means we can use -Imm instead.
415  return std::min(getIntImmCost(Imm, Ty, CostKind),
416  getIntImmCost(-Imm, Ty, CostKind));
417 
418  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
419  Ty->getIntegerBitWidth() == 32) {
420  int64_t NegImm = -Imm.getSExtValue();
421  if (ST->isThumb2() && NegImm < 1<<12)
422  // icmp X, #-C -> cmn X, #C
423  return 0;
424  if (ST->isThumb() && NegImm < 1<<8)
425  // icmp X, #-C -> adds X, #C
426  return 0;
427  }
428 
429  // xor a, -1 can always be folded to MVN
430  if (Opcode == Instruction::Xor && Imm.isAllOnes())
431  return 0;
432 
433  // Ensure that negative constants of min(max()) or max(min()) patterns
434  // that match SSAT instructions don't get hoisted.
435  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
436  Ty->getIntegerBitWidth() <= 32) {
437  if (isSSATMinMaxPattern(Inst, Imm) ||
438  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
439  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
440  return 0;
441  }
442 
443  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
444  return 0;
445 
446  // We can convert <= -1 to < 0, which is generally quite cheap.
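  // For example (illustrative): "icmp sle i32 %x, -1" is equivalent to
  // "icmp slt i32 %x, 0", and comparing against zero is usually free.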
447  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
448  ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
449  if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
450  return std::min(getIntImmCost(Imm, Ty, CostKind),
451  getIntImmCost(Imm + 1, Ty, CostKind));
452  }
453 
454  return getIntImmCost(Imm, Ty, CostKind);
455 }
456 
457 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
458  TTI::TargetCostKind CostKind,
459  const Instruction *I) {
460  if (CostKind == TTI::TCK_RecipThroughput &&
461  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
462  // FIXME: The vectorizer is highly sensitive to the cost of these
463  // instructions, which suggests that it may be using the costs incorrectly.
464  // But, for now, just make them free to avoid performance regressions for
465  // vector targets.
466  return 0;
467  }
468  return BaseT::getCFInstrCost(Opcode, CostKind, I);
469 }
470 
471 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
472  Type *Src,
473  TTI::CastContextHint CCH,
474  TTI::TargetCostKind CostKind,
475  const Instruction *I) {
476  int ISD = TLI->InstructionOpcodeToISD(Opcode);
477  assert(ISD && "Invalid opcode");
478 
479  // TODO: Allow non-throughput costs that aren't binary.
480  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
481  if (CostKind != TTI::TCK_RecipThroughput)
482  return Cost == 0 ? 0 : 1;
483  return Cost;
484  };
485  auto IsLegalFPType = [this](EVT VT) {
486  EVT EltVT = VT.getScalarType();
487  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
488  (EltVT == MVT::f64 && ST->hasFP64()) ||
489  (EltVT == MVT::f16 && ST->hasFullFP16());
490  };
491 
492  EVT SrcTy = TLI->getValueType(DL, Src);
493  EVT DstTy = TLI->getValueType(DL, Dst);
494 
495  if (!SrcTy.isSimple() || !DstTy.isSimple())
496  return AdjustCost(
497  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
498 
499  // Extending masked loads and truncating masked stores are expensive because
500  // we currently don't split them. This means that we'll likely end up
501  // loading/storing each element individually (hence the high cost).
502  if ((ST->hasMVEIntegerOps() &&
503  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
504  Opcode == Instruction::SExt)) ||
505  (ST->hasMVEFloatOps() &&
506  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
507  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
508  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
509  return 2 * DstTy.getVectorNumElements() *
510  ST->getMVEVectorCostFactor(CostKind);
511 
512  // The extend of other kinds of load is free
513  if (CCH == TTI::CastContextHint::Normal ||
514  CCH == TTI::CastContextHint::Masked) {
515  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
528  };
529  if (const auto *Entry = ConvertCostTableLookup(
530  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
531  return AdjustCost(Entry->Cost);
532 
533  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
540  // The following extend from a legal type to an illegal type, so we need to
541  // split the load. This introduces an extra load operation, but the
542  // extend is still "free".
549  };
550  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
551  if (const auto *Entry =
552  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
553  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
554  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
555  }
556 
557  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
558  // FPExtends are similar but also require the VCVT instructions.
561  };
562  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
563  if (const auto *Entry =
564  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
565  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
566  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
567  }
568 
569  // The truncate of a store is free. This is the mirror of extends above.
570  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
578  };
579  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
580  if (const auto *Entry =
581  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
582  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
583  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
584  }
585 
586  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
589  };
590  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
591  if (const auto *Entry =
592  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
593  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
594  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
595  }
596  }
597 
598  // NEON vector operations that can extend their inputs.
599  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
600  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
601  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
602  // vaddl
603  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
604  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
605  // vsubl
606  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
607  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
608  // vmull
609  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
610  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
611  // vshll
612  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
613  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
614  };
615 
616  auto *User = cast<Instruction>(*I->user_begin());
617  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
618  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
619  DstTy.getSimpleVT(),
620  SrcTy.getSimpleVT())) {
621  return AdjustCost(Entry->Cost);
622  }
623  }
624 
625  // Single to/from double precision conversions.
626  if (Src->isVectorTy() && ST->hasNEON() &&
627  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
628  DstTy.getScalarType() == MVT::f32) ||
629  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
630  DstTy.getScalarType() == MVT::f64))) {
631  static const CostTblEntry NEONFltDblTbl[] = {
632  // Vector fptrunc/fpext conversions.
635  {ISD::FP_EXTEND, MVT::v4f32, 4}};
636 
637  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
638  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
639  return AdjustCost(LT.first * Entry->Cost);
640  }
641 
642  // Some arithmetic, load and store operations have specific instructions
643  // to cast up/down their types automatically at no extra cost.
644  // TODO: Get these tables to know at least what the related operations are.
645  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
652 
653  // The number of vmovl instructions for the extension.
672 
673  // Operations that we legalize using splitting.
676 
677  // Vector float <-> i32 conversions.
680 
701 
708 
709  // Vector double <-> i32 conversions.
712 
719 
726  };
727 
728  if (SrcTy.isVector() && ST->hasNEON()) {
729  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
730  DstTy.getSimpleVT(),
731  SrcTy.getSimpleVT()))
732  return AdjustCost(Entry->Cost);
733  }
734 
735  // Scalar float to integer conversions.
736  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
757  };
758  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
759  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
760  DstTy.getSimpleVT(),
761  SrcTy.getSimpleVT()))
762  return AdjustCost(Entry->Cost);
763  }
764 
765  // Scalar integer to float conversions.
766  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
787  };
788 
789  if (SrcTy.isInteger() && ST->hasNEON()) {
790  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
791  ISD, DstTy.getSimpleVT(),
792  SrcTy.getSimpleVT()))
793  return AdjustCost(Entry->Cost);
794  }
795 
796  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
797  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
798  // are linearised so take more.
799  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
812  };
813 
814  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
815  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
816  ISD, DstTy.getSimpleVT(),
817  SrcTy.getSimpleVT()))
818  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
819  }
820 
821  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
822  // As a general rule, FP converts that were not matched above are scalarized
823  // and cost one vcvt per lane, so long as the instruction is available.
824  // If not, they will become a series of function calls.
825  const InstructionCost CallCost =
826  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
827  int Lanes = 1;
828  if (SrcTy.isFixedLengthVector())
829  Lanes = SrcTy.getVectorNumElements();
830 
831  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
832  return Lanes;
833  else
834  return Lanes * CallCost;
835  }
836 
837  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
838  SrcTy.isFixedLengthVector()) {
839  // Treat a truncate with a larger than legal source (128 bits for MVE) as
840  // expensive, 2 instructions per lane.
841  if ((SrcTy.getScalarType() == MVT::i8 ||
842  SrcTy.getScalarType() == MVT::i16 ||
843  SrcTy.getScalarType() == MVT::i32) &&
844  SrcTy.getSizeInBits() > 128 &&
845  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
846  return SrcTy.getVectorNumElements() * 2;
847  }
848 
849  // Scalar integer conversion costs.
850  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
851  // i16 -> i64 requires two dependent operations.
853 
854  // Truncates on i64 are assumed to be free.
857  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
859  };
860 
861  if (SrcTy.isInteger()) {
862  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
863  DstTy.getSimpleVT(),
864  SrcTy.getSimpleVT()))
865  return AdjustCost(Entry->Cost);
866  }
867 
868  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
869  ? ST->getMVEVectorCostFactor(CostKind)
870  : 1;
871  return AdjustCost(
872  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
873 }
874 
875 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
876  unsigned Index) {
877  // Penalize inserting into a D-subregister. We end up with a three times
878  // lower estimated throughput on Swift.
879  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
880  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
881  return 3;
882 
883  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
884  Opcode == Instruction::ExtractElement)) {
885  // Cross-class copies are expensive on many microarchitectures,
886  // so assume they are expensive by default.
887  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
888  return 3;
889 
890  // Even if it's not a cross class copy, this likely leads to mixing
891  // of NEON and VFP code and should be therefore penalized.
892  if (ValTy->isVectorTy() &&
893  ValTy->getScalarSizeInBits() <= 32)
894  return std::max<InstructionCost>(
895  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
896  }
897 
898  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
899  Opcode == Instruction::ExtractElement)) {
900  // Integer cross-lane moves are more expensive than float, which can
901  // sometimes just be vmovs. Integer moves involve being passed through GPR
902  // registers, causing more of a delay.
903  std::pair<InstructionCost, MVT> LT =
905  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
906  }
907 
908  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
909 }
910 
911 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
912  Type *CondTy,
913  CmpInst::Predicate VecPred,
914  TTI::TargetCostKind CostKind,
915  const Instruction *I) {
916  int ISD = TLI->InstructionOpcodeToISD(Opcode);
917 
918  // Thumb scalar code size cost for select.
919  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
920  ST->isThumb() && !ValTy->isVectorTy()) {
921  // Assume expensive structs.
922  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
923  return TTI::TCC_Expensive;
924 
925  // Select costs can vary because they:
926  // - may require one or more conditional mov (including an IT),
927  // - can't operate directly on immediates,
928  // - require live flags, which we can't copy around easily.
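  // For example (illustrative), a scalar i32 select on Thumb2 is typically
  // lowered to something like "cmp ...; it ne; movne rd, rn", i.e. at least
  // one instruction beyond the compare itself.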
929  InstructionCost Cost = 0;
930 
931  // Possible IT instruction for Thumb2, or more for Thumb1.
932  ++Cost;
933 
934  // i1 values may need rematerialising by using mov immediates and/or
935  // flag setting instructions.
936  if (ValTy->isIntegerTy(1))
937  ++Cost;
938 
939  return Cost;
940  }
941 
942  // If this is a vector min/max/abs, use the cost of that intrinsic directly
943  // instead. Hopefully when min/max intrinsics are more prevalent this code
944  // will not be needed.
945  const Instruction *Sel = I;
946  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
947  Sel->hasOneUse())
948  Sel = cast<Instruction>(Sel->user_back());
949  if (Sel && ValTy->isVectorTy() &&
950  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
951  const Value *LHS, *RHS;
952  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
953  unsigned IID = 0;
954  switch (SPF) {
955  case SPF_ABS:
956  IID = Intrinsic::abs;
957  break;
958  case SPF_SMIN:
959  IID = Intrinsic::smin;
960  break;
961  case SPF_SMAX:
962  IID = Intrinsic::smax;
963  break;
964  case SPF_UMIN:
965  IID = Intrinsic::umin;
966  break;
967  case SPF_UMAX:
968  IID = Intrinsic::umax;
969  break;
970  case SPF_FMINNUM:
971  IID = Intrinsic::minnum;
972  break;
973  case SPF_FMAXNUM:
974  IID = Intrinsic::maxnum;
975  break;
976  default:
977  break;
978  }
979  if (IID) {
980  // The ICmp is free, the select gets the cost of the min/max/etc
981  if (Sel != I)
982  return 0;
983  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
984  return getIntrinsicInstrCost(CostAttrs, CostKind);
985  }
986  }
987 
988  // On NEON a vector select gets lowered to vbsl.
989  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
990  // Lowering of some vector selects is currently far from perfect.
991  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
992  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
993  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
995  };
996 
997  EVT SelCondTy = TLI->getValueType(DL, CondTy);
998  EVT SelValTy = TLI->getValueType(DL, ValTy);
999  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1000  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1001  SelCondTy.getSimpleVT(),
1002  SelValTy.getSimpleVT()))
1003  return Entry->Cost;
1004  }
1005 
1006  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1007  return LT.first;
1008  }
1009 
1010  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1011  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1012  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1013  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1014  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1015  if (!VecCondTy)
1016  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1017 
1018  // If we don't have mve.fp, any fp operations will need to be scalarized.
1019  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1020  // One scalarization insert, one scalarization extract and the cost of the
1021  // fcmps.
1022  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1023  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1024  VecValTy->getNumElements() *
1025  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1026  VecCondTy->getScalarType(), VecPred, CostKind,
1027  I);
1028  }
1029 
1030  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1031  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1032  // There are two types - the input that specifies the type of the compare
1033  // and the output vXi1 type. Because we don't know how the output will be
1034  // split, we may need an expensive shuffle to get two in sync. This has the
1035  // effect of making larger than legal compares (v8i32 for example)
1036  // expensive.
1037  if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1038  if (LT.first > 1)
1039  return LT.first * BaseCost +
1040  BaseT::getScalarizationOverhead(VecCondTy, true, false);
1041  return BaseCost;
1042  }
1043  }
1044 
1045  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1046  // for "multiple beats" potentially needed by MVE instructions.
1047  int BaseCost = 1;
1048  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1049  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1050 
1051  return BaseCost *
1052  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1053 }
1054 
1055 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1056  ScalarEvolution *SE,
1057  const SCEV *Ptr) {
1058  // Address computations in vectorized code with non-consecutive addresses will
1059  // likely result in more instructions compared to scalar code where the
1060  // computation can more often be merged into the index mode. The resulting
1061  // extra micro-ops can significantly decrease throughput.
1062  unsigned NumVectorInstToHideOverhead = 10;
1063  int MaxMergeDistance = 64;
1064 
1065  if (ST->hasNEON()) {
1066  if (Ty->isVectorTy() && SE &&
1067  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1068  return NumVectorInstToHideOverhead;
1069 
1070  // In many cases the address computation is not merged into the instruction
1071  // addressing mode.
1072  return 1;
1073  }
1074  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1075 }
1076 
1077 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1078  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1079  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1080  // optimized, else LSR may block tail-predication.
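  // Illustrative example (editor's sketch): a VCTP produces the loop
  // predicate for tail predication, e.g.
  //   %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %remaining)
  // and keeping it intact lets the backend form a tail-predicated loop.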
1081  switch (II->getIntrinsicID()) {
1082  case Intrinsic::arm_mve_vctp8:
1083  case Intrinsic::arm_mve_vctp16:
1084  case Intrinsic::arm_mve_vctp32:
1085  case Intrinsic::arm_mve_vctp64:
1086  return true;
1087  default:
1088  break;
1089  }
1090  }
1091  return false;
1092 }
1093 
1094 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1095  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1096  return false;
1097 
1098  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1099  // Don't support v2i1 yet.
1100  if (VecTy->getNumElements() == 2)
1101  return false;
1102 
1103  // We don't support extending fp types.
1104  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1105  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1106  return false;
1107  }
1108 
1109  unsigned EltWidth = DataTy->getScalarSizeInBits();
1110  return (EltWidth == 32 && Alignment >= 4) ||
1111  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1112 }
1113 
1114 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1115  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1116  return false;
1117 
1118  unsigned EltWidth = Ty->getScalarSizeInBits();
1119  return ((EltWidth == 32 && Alignment >= 4) ||
1120  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1121 }
1122 
1123 /// Given a memcpy/memset/memmove instruction, return the number of memory
1124 /// operations performed, found by querying findOptimalMemOpLowering. Returns
1125 /// -1 if a library call is used instead.
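// For example (illustrative): a memcpy of 16 bytes with 4-byte-aligned
// operands may be lowered as four i32 loads plus four i32 stores; in that
// case findOptimalMemOpLowering returns four types and, with Factor == 2,
// this function reports 8 memory operations.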
1126 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1127  MemOp MOp;
1128  unsigned DstAddrSpace = ~0u;
1129  unsigned SrcAddrSpace = ~0u;
1130  const Function *F = I->getParent()->getParent();
1131 
1132  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1133  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1134  // If 'size' is not a constant, a library call will be generated.
1135  if (!C)
1136  return -1;
1137 
1138  const unsigned Size = C->getValue().getZExtValue();
1139  const Align DstAlign = *MC->getDestAlign();
1140  const Align SrcAlign = *MC->getSourceAlign();
1141 
1142  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1143  /*IsVolatile*/ false);
1144  DstAddrSpace = MC->getDestAddressSpace();
1145  SrcAddrSpace = MC->getSourceAddressSpace();
1146  }
1147  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1148  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1149  // If 'size' is not a constant, a library call will be generated.
1150  if (!C)
1151  return -1;
1152 
1153  const unsigned Size = C->getValue().getZExtValue();
1154  const Align DstAlign = *MS->getDestAlign();
1155 
1156  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1157  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1158  DstAddrSpace = MS->getDestAddressSpace();
1159  }
1160  else
1161  llvm_unreachable("Expected a memcpy/move or memset!");
1162 
1163  unsigned Limit, Factor = 2;
1164  switch(I->getIntrinsicID()) {
1165  case Intrinsic::memcpy:
1166  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1167  break;
1168  case Intrinsic::memmove:
1169  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1170  break;
1171  case Intrinsic::memset:
1172  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1173  Factor = 1;
1174  break;
1175  default:
1176  llvm_unreachable("Expected a memcpy/move or memset!");
1177  }
1178 
1179  // MemOps will be populated with a list of data types that need to be
1180  // loaded and stored. That's why we multiply the number of elements by 2 to
1181  // get the cost for this memcpy.
1182  std::vector<EVT> MemOps;
1183  if (getTLI()->findOptimalMemOpLowering(
1184  MemOps, Limit, MOp, DstAddrSpace,
1185  SrcAddrSpace, F->getAttributes()))
1186  return MemOps.size() * Factor;
1187 
1188  // If we can't find an optimal memop lowering, return the default cost
1189  return -1;
1190 }
1191 
1192 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1193  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1194 
1195  // To model the cost of a library call, we assume 1 for the call, and
1196  // 3 for the argument setup.
1197  if (NumOps == -1)
1198  return 4;
1199  return NumOps;
1200 }
1201 
1202 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1203  VectorType *Tp, ArrayRef<int> Mask,
1204  TTI::TargetCostKind CostKind,
1205  int Index, VectorType *SubTp,
1206  ArrayRef<const Value *> Args) {
1207  Kind = improveShuffleKindFromMask(Kind, Mask);
1208  if (ST->hasNEON()) {
1209  if (Kind == TTI::SK_Broadcast) {
1210  static const CostTblEntry NEONDupTbl[] = {
1211  // VDUP handles these cases.
1218 
1223 
1224  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1225  if (const auto *Entry =
1226  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1227  return LT.first * Entry->Cost;
1228  }
1229  if (Kind == TTI::SK_Reverse) {
1230  static const CostTblEntry NEONShuffleTbl[] = {
1231  // Reverse shuffle cost one instruction if we are shuffling within a
1232  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1239 
1244 
1245  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1246  if (const auto *Entry =
1247  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1248  return LT.first * Entry->Cost;
1249  }
1250  if (Kind == TTI::SK_Select) {
1251  static const CostTblEntry NEONSelShuffleTbl[] = {
1252  // Select shuffle cost table for ARM. Cost is the number of
1253  // instructions required to create the shuffled vector.
1254 
1255 
1260 
1264 
1266 
1268 
1269  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1270  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1271  ISD::VECTOR_SHUFFLE, LT.second))
1272  return LT.first * Entry->Cost;
1273  }
1274  }
1275  if (ST->hasMVEIntegerOps()) {
1276  if (Kind == TTI::SK_Broadcast) {
1277  static const CostTblEntry MVEDupTbl[] = {
1278  // VDUP handles these cases.
1284 
1285  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1286  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1287  LT.second))
1288  return LT.first * Entry->Cost *
1289  ST->getMVEVectorCostFactor(CostKind);
1290  }
1291 
1292  if (!Mask.empty()) {
1293  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1294  if (LT.second.isVector() &&
1295  Mask.size() <= LT.second.getVectorNumElements() &&
1296  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1297  isVREVMask(Mask, LT.second, 64)))
1298  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1299  }
1300  }
1301 
1302  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1303  ? ST->getMVEVectorCostFactor(CostKind)
1304  : 1;
1305  return BaseCost *
1306  BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1307 }
1308 
1309 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1310  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1311  TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1312  ArrayRef<const Value *> Args,
1313  const Instruction *CxtI) {
1314  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1315  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1316  // Make operations on i1 relatively expensive as this often involves
1317  // combining predicates. AND and XOR should be easier to handle with IT
1318  // blocks.
1319  switch (ISDOpcode) {
1320  default:
1321  break;
1322  case ISD::AND:
1323  case ISD::XOR:
1324  return 2;
1325  case ISD::OR:
1326  return 3;
1327  }
1328  }
1329 
1330  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1331 
1332  if (ST->hasNEON()) {
1333  const unsigned FunctionCallDivCost = 20;
1334  const unsigned ReciprocalDivCost = 10;
1335  static const CostTblEntry CostTbl[] = {
1336  // Division.
1337  // These costs are somewhat random. Choose a cost of 20 to indicate that
1338  // vectorizing division (added function call) is going to be very expensive.
1339  // Double register types.
1340  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1341  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1342  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1343  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1344  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1345  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1346  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1347  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1348  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1349  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1350  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1351  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1352  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1353  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1354  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1355  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1356  // Quad register types.
1357  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1358  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1359  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1360  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1361  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1362  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1363  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1364  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1365  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1366  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1367  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1368  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1369  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1370  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1371  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1372  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1373  // Multiplication.
1374  };
1375 
1376  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1377  return LT.first * Entry->Cost;
1378 
1380  Opcode, Ty, CostKind, Op1Info, Op2Info);
1381 
1382  // This is somewhat of a hack. The problem that we are facing is that SROA
1383  // creates a sequence of shift, and, or instructions to construct values.
1384  // These sequences are recognized by the ISel and have zero-cost. Not so for
1385  // the vectorized code. Because we have support for v2i64 but not i64 those
1386  // sequences look particularly beneficial to vectorize.
1387  // To work around this we increase the cost of v2i64 operations to make them
1388  // seem less beneficial.
1389  if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1390  Cost += 4;
1391 
1392  return Cost;
1393  }
1394 
1395  // If this operation is a shift on arm/thumb2, it might well be folded into
1396  // the following instruction, hence having a cost of 0.
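  // For example (illustrative): "add r0, r1, r2, lsl #2" folds the shift into
  // the add's flexible second operand, so the shift itself costs nothing.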
1397  auto LooksLikeAFreeShift = [&]() {
1398  if (ST->isThumb1Only() || Ty->isVectorTy())
1399  return false;
1400 
1401  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1402  return false;
1403  if (!Op2Info.isUniform() || !Op2Info.isConstant())
1404  return false;
1405 
1406  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1407  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1408  case Instruction::Add:
1409  case Instruction::Sub:
1410  case Instruction::And:
1411  case Instruction::Xor:
1412  case Instruction::Or:
1413  case Instruction::ICmp:
1414  return true;
1415  default:
1416  return false;
1417  }
1418  };
1419  if (LooksLikeAFreeShift())
1420  return 0;
1421 
1422  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1423  // for "multiple beats" potentially needed by MVE instructions.
1424  int BaseCost = 1;
1425  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1426  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1427 
1428  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1429  // without treating floats as more expensive than scalars or increasing the
1430  // costs for custom operations. The result is also multiplied by the
1431  // MVEVectorCostFactor where appropriate.
1432  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1433  return LT.first * BaseCost;
1434 
1435  // Else this is expand, assume that we need to scalarize this op.
1436  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1437  unsigned Num = VTy->getNumElements();
1438  InstructionCost Cost =
1439  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1440  // Return the cost of multiple scalar invocations plus the cost of
1441  // inserting and extracting the values.
1442  SmallVector<Type *> Tys(Args.size(), Ty);
1443  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1444  }
1445 
1446  return BaseCost;
1447 }
1448 
1449 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1450  MaybeAlign Alignment,
1451  unsigned AddressSpace,
1452  TTI::TargetCostKind CostKind,
1453  TTI::OperandValueInfo OpInfo,
1454  const Instruction *I) {
1455  // TODO: Handle other cost kinds.
1456  if (CostKind != TTI::TCK_RecipThroughput)
1457  return 1;
1458 
1459  // Type legalization can't handle structs
1460  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1461  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1462  CostKind);
1463 
1464  if (ST->hasNEON() && Src->isVectorTy() &&
1465  (Alignment && *Alignment != Align(16)) &&
1466  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1467  // Unaligned loads/stores are extremely inefficient.
1468  // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1469  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1470  return LT.first * 4;
1471  }
1472 
1473  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1474  // Same for stores.
1475  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1476  ((Opcode == Instruction::Load && I->hasOneUse() &&
1477  isa<FPExtInst>(*I->user_begin())) ||
1478  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1479  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1480  Type *DstTy =
1481  Opcode == Instruction::Load
1482  ? (*I->user_begin())->getType()
1483  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1484  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1485  DstTy->getScalarType()->isFloatTy())
1486  return ST->getMVEVectorCostFactor(CostKind);
1487  }
1488 
1489  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1491  : 1;
1492  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1493  CostKind, OpInfo, I);
1494 }
1495 
1496 InstructionCost
1497 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1498  unsigned AddressSpace,
1499  TTI::TargetCostKind CostKind) {
1500  if (ST->hasMVEIntegerOps()) {
1501  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1502  return ST->getMVEVectorCostFactor(CostKind);
1503  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1504  return ST->getMVEVectorCostFactor(CostKind);
1505  }
1506  if (!isa<FixedVectorType>(Src))
1507  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1508  CostKind);
1509  // Scalar cost, which is currently very high due to the inefficiency of the
1510  // generated code.
1511  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1512 }
1513 
1514 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1515  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1516  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1517  bool UseMaskForCond, bool UseMaskForGaps) {
1518  assert(Factor >= 2 && "Invalid interleave factor");
1519  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1520 
1521  // vldN/vstN doesn't support vector types of i64/f64 element.
1522  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1523 
1524  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1525  !UseMaskForCond && !UseMaskForGaps) {
1526  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1527  auto *SubVecTy =
1528  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1529 
1530  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1531  // Accesses having vector types that are a multiple of 128 bits can be
1532  // matched to more than one vldN/vstN instruction.
1533  int BaseCost =
1534  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1535  if (NumElts % Factor == 0 &&
1536  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1537  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1538 
1539  // Some smaller than legal interleaved patterns are cheap as we can make
1540  // use of the vmovn or vrev patterns to interleave a standard load. This is
1541  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1542  // promoted differently). The cost of 2 here is then a load and vrev or
1543  // vmovn.
1544  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1545  VecTy->isIntOrIntVectorTy() &&
1546  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1547  return 2 * BaseCost;
1548  }
1549 
1550  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1551  Alignment, AddressSpace, CostKind,
1552  UseMaskForCond, UseMaskForGaps);
1553 }
1554 
1555 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1556  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1557  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1558  using namespace PatternMatch;
1559  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1560  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1561  Alignment, CostKind, I);
1562 
1563  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1564  auto *VTy = cast<FixedVectorType>(DataTy);
1565 
1566  // TODO: Splitting, once we do that.
1567 
1568  unsigned NumElems = VTy->getNumElements();
1569  unsigned EltSize = VTy->getScalarSizeInBits();
1570  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1571 
1572  // For now, it is assumed that for the MVE gather instructions the loads are
1573  // all effectively serialised. This means the cost is the scalar cost
1574  // multiplied by the number of elements being loaded. This is possibly very
1575  // conservative, but even so we still end up vectorising loops because the
1576  // cost per iteration for many loops is lower than for scalar loops.
1577  InstructionCost VectorCost =
1578  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1579  // The scalarization cost should be a lot higher. We use the number of vector
1580  // elements plus the scalarization overhead.
1581  InstructionCost ScalarCost =
1582  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1583  BaseT::getScalarizationOverhead(VTy, false, true);
1584 
1585  if (EltSize < 8 || Alignment < EltSize / 8)
1586  return ScalarCost;
1587 
1588  unsigned ExtSize = EltSize;
1589  // Check whether there's a single user that asks for an extended type
1590  if (I != nullptr) {
1591  // Depending on the caller of this function, a gather instruction will
1592  // either have opcode Instruction::Load or be a call to the masked_gather
1593  // intrinsic.
1594  if ((I->getOpcode() == Instruction::Load ||
1595  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1596  I->hasOneUse()) {
1597  const User *Us = *I->users().begin();
1598  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1599  // only allow valid type combinations
1600  unsigned TypeSize =
1601  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1602  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1603  (TypeSize == 16 && EltSize == 8)) &&
1604  TypeSize * NumElems == 128) {
1605  ExtSize = TypeSize;
1606  }
1607  }
1608  }
1609  // Check whether the input data needs to be truncated
1610  TruncInst *T;
1611  if ((I->getOpcode() == Instruction::Store ||
1612  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1613  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1614  // Only allow valid type combinations
1615  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1616  if (((EltSize == 16 && TypeSize == 32) ||
1617  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1618  TypeSize * NumElems == 128)
1619  ExtSize = TypeSize;
1620  }
1621  }
1622 
1623  if (ExtSize * NumElems != 128 || NumElems < 4)
1624  return ScalarCost;
1625 
1626  // Any (aligned) i32 gather will not need to be scalarised.
1627  if (ExtSize == 32)
1628  return VectorCost;
1629  // For smaller types, we need to ensure that the gep's inputs are correctly
1630  // extended from a small enough value. Other sizes (including i64) are
1631  // scalarized for now.
1632  if (ExtSize != 8 && ExtSize != 16)
1633  return ScalarCost;
1634 
1635  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1636  Ptr = BC->getOperand(0);
1637  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1638  if (GEP->getNumOperands() != 2)
1639  return ScalarCost;
1640  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1641  // Scale needs to be correct (which is only relevant for i16s).
1642  if (Scale != 1 && Scale * 8 != ExtSize)
1643  return ScalarCost;
1644  // And we need to zext (not sext) the indexes from a small enough type.
1645  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1646  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1647  return VectorCost;
1648  }
1649  return ScalarCost;
1650  }
1651  return ScalarCost;
1652 }
1653 
1654 InstructionCost
1655 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1656  Optional<FastMathFlags> FMF,
1657  TTI::TargetCostKind CostKind) {
1658  if (TTI::requiresOrderedReduction(FMF))
1659  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1660 
1661  EVT ValVT = TLI->getValueType(DL, ValTy);
1662  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1663  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1664  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1665 
1666  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1667 
1668  static const CostTblEntry CostTblAdd[]{
1669  {ISD::ADD, MVT::v16i8, 1},
1670  {ISD::ADD, MVT::v8i16, 1},
1671  {ISD::ADD, MVT::v4i32, 1},
1672  };
1673  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1674  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1675 
1676  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1677 }
1678 
1679 InstructionCost ARMTTIImpl::getExtendedReductionCost(
1680  unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1681  Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1682  EVT ValVT = TLI->getValueType(DL, ValTy);
1683  EVT ResVT = TLI->getValueType(DL, ResTy);
1684 
1685  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1686 
1687  switch (ISD) {
1688  case ISD::ADD:
1689  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1690  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1691 
1692  // The legal cases are:
1693  // VADDV u/s 8/16/32
1694  // VADDLV u/s 32
1695  // Codegen currently cannot always handle larger than legal vectors very
1696  // well, especially for predicated reductions where the mask needs to be
1697  // split, so restrict to 128-bit or smaller input types.
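  // For example (illustrative): "vaddv.u8 r0, q0" reduces all sixteen i8
  // lanes of q0 into a 32-bit scalar in a single instruction.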
1698  unsigned RevVTSize = ResVT.getSizeInBits();
1699  if (ValVT.getSizeInBits() <= 128 &&
1700  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1701  (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1702  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1703  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1704  }
1705  break;
1706  default:
1707  break;
1708  }
1709  return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1710  CostKind);
1711 }
1712 
1713 InstructionCost
1714 ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1715  VectorType *ValTy,
1716  TTI::TargetCostKind CostKind) {
1717  EVT ValVT = TLI->getValueType(DL, ValTy);
1718  EVT ResVT = TLI->getValueType(DL, ResTy);
1719 
1720  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1721  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1722 
1723  // The legal cases are:
1724  // VMLAV u/s 8/16/32
1725  // VMLALV u/s 16/32
1726  // Codegen currently cannot always handle larger than legal vectors very
1727  // well, especially for predicated reductions where the mask needs to be
1728  // split, so restrict to 128-bit or smaller input types.
1729  unsigned RevVTSize = ResVT.getSizeInBits();
1730  if (ValVT.getSizeInBits() <= 128 &&
1731  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1732  (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1733  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1734  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1735  }
1736 
1737  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1738 }
1739 
1740 InstructionCost
1741 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1742  TTI::TargetCostKind CostKind) {
1743  switch (ICA.getID()) {
1744  case Intrinsic::get_active_lane_mask:
1745  // Currently we make a somewhat optimistic assumption that
1746  // active_lane_masks are always free. In reality they may be freely folded
1747  // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1748  // of add/icmp code. We may need to improve this in the future, but being
1749  // able to detect if it is free or not involves looking at a lot of other
1750  // code. We currently assume that the vectorizer inserted these, and knew
1751  // what it was doing in adding one.
1752  if (ST->hasMVEIntegerOps())
1753  return 0;
1754  break;
1755  case Intrinsic::sadd_sat:
1756  case Intrinsic::ssub_sat:
1757  case Intrinsic::uadd_sat:
1758  case Intrinsic::usub_sat: {
1759  if (!ST->hasMVEIntegerOps())
1760  break;
1761  Type *VT = ICA.getReturnType();
1762 
1763  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1764  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1765  LT.second == MVT::v16i8) {
1766  // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1767  // need to extend the type, as it uses shr(qadd(shl, shl)).
1768  unsigned Instrs =
1769  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1770  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1771  }
1772  break;
1773  }
1774  case Intrinsic::abs:
1775  case Intrinsic::smin:
1776  case Intrinsic::smax:
1777  case Intrinsic::umin:
1778  case Intrinsic::umax: {
1779  if (!ST->hasMVEIntegerOps())
1780  break;
1781  Type *VT = ICA.getReturnType();
1782 
1783  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1784  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1785  LT.second == MVT::v16i8)
1786  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1787  break;
1788  }
1789  case Intrinsic::minnum:
1790  case Intrinsic::maxnum: {
1791  if (!ST->hasMVEFloatOps())
1792  break;
1793  Type *VT = ICA.getReturnType();
1794  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1795  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1796  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1797  break;
1798  }
1799  case Intrinsic::fptosi_sat:
1800  case Intrinsic::fptoui_sat: {
1801  if (ICA.getArgTypes().empty())
1802  break;
1803  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1804  auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1805  EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1806   // Check for the legal types, with the correct subtarget features.
1807  if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1808  (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1809  (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1810  return LT.first;
1811 
1812  // Equally for MVE vector types
1813  if (ST->hasMVEFloatOps() &&
1814  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1815  LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1816  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1817 
1818  // Otherwise we use a legal convert followed by a min+max
1819  if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1820  (ST->hasFP64() && LT.second == MVT::f64) ||
1821  (ST->hasFullFP16() && LT.second == MVT::f16) ||
1822  (ST->hasMVEFloatOps() &&
1823  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1824  LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1825  Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1826  LT.second.getScalarSizeInBits());
1827       InstructionCost Cost =
1828           LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1829  IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1830  : Intrinsic::umin,
1831  LegalTy, {LegalTy, LegalTy});
1832  Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1833  IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1834  : Intrinsic::umax,
1835  LegalTy, {LegalTy, LegalTy});
1836  Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1837  return LT.first * Cost;
1838  }
1839  break;
1840  }
1841  }
1842 
1843   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1844 }
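// Illustrative sketch, not part of the upstream file: for the saturating
// add/sub intrinsics costed above, the result is one VQADD/VQSUB when the
// legalized element width already matches the IR element width, and four
// instructions (shl, shl, vqadd, shr) when the value must first be widened.
// As a standalone formula with hypothetical names:
unsigned sketchMVESatAddSubInstrCount(unsigned LegalEltBits, unsigned IREltBits) {
  return LegalEltBits == IREltBits ? 1 : 4; // shr(qadd(shl, shl)) when widened
}
// The returned instruction count is then multiplied by the MVE cost factor and
// by the number of legalized vectors (LT.first) in the function above.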
1845 
1846 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1847   if (!F->isIntrinsic())
1848  return BaseT::isLoweredToCall(F);
1849 
1850  // Assume all Arm-specific intrinsics map to an instruction.
1851  if (F->getName().startswith("llvm.arm"))
1852  return false;
1853 
1854  switch (F->getIntrinsicID()) {
1855  default: break;
1856  case Intrinsic::powi:
1857  case Intrinsic::sin:
1858  case Intrinsic::cos:
1859  case Intrinsic::pow:
1860  case Intrinsic::log:
1861  case Intrinsic::log10:
1862  case Intrinsic::log2:
1863  case Intrinsic::exp:
1864  case Intrinsic::exp2:
1865  return true;
1866  case Intrinsic::sqrt:
1867  case Intrinsic::fabs:
1868  case Intrinsic::copysign:
1869  case Intrinsic::floor:
1870  case Intrinsic::ceil:
1871  case Intrinsic::trunc:
1872  case Intrinsic::rint:
1873  case Intrinsic::nearbyint:
1874  case Intrinsic::round:
1875  case Intrinsic::canonicalize:
1876  case Intrinsic::lround:
1877  case Intrinsic::llround:
1878  case Intrinsic::lrint:
1879  case Intrinsic::llrint:
1880  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1881  return true;
1882  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1883  return true;
1884   // Some operations can be handled by vector instructions; assume that
1885   // unsupported vectors will be expanded into supported scalar ones.
1886   // TODO: Handle scalar operations properly.
1887  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1888  case Intrinsic::masked_store:
1889  case Intrinsic::masked_load:
1890  case Intrinsic::masked_gather:
1891  case Intrinsic::masked_scatter:
1892  return !ST->hasMVEIntegerOps();
1893  case Intrinsic::sadd_with_overflow:
1894  case Intrinsic::uadd_with_overflow:
1895  case Intrinsic::ssub_with_overflow:
1896  case Intrinsic::usub_with_overflow:
1897  case Intrinsic::sadd_sat:
1898  case Intrinsic::uadd_sat:
1899  case Intrinsic::ssub_sat:
1900  case Intrinsic::usub_sat:
1901  return false;
1902  }
1903 
1904  return BaseT::isLoweredToCall(F);
1905 }
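// Illustrative sketch, not part of the upstream file: for the sqrt/fabs/
// rounding group of intrinsics above, whether the call stays inline depends
// only on which scalar FP features are available. With hypothetical names
// (HasAnyFPBase standing in for hasFPARMv8Base() || hasVFP2Base()):
bool sketchFPIntrinsicNeedsCall(unsigned EltBits, bool HasFP64,
                                bool HasFullFP16, bool HasAnyFPBase) {
  if (EltBits == 64 && !HasFP64)
    return true;             // double requires the FP64 extension
  if (EltBits == 16 && !HasFullFP16)
    return true;             // half requires full FP16 support
  return !HasAnyFPBase;      // otherwise any VFP2/FP-ARMv8 base suffices
}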
1906 
1907 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1908   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1909  EVT VT = TLI->getValueType(DL, I.getType(), true);
1910  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1911  return true;
1912 
1913  // Check if an intrinsic will be lowered to a call and assume that any
1914  // other CallInst will generate a bl.
1915  if (auto *Call = dyn_cast<CallInst>(&I)) {
1916  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1917  switch(II->getIntrinsicID()) {
1918  case Intrinsic::memcpy:
1919  case Intrinsic::memset:
1920  case Intrinsic::memmove:
1921  return getNumMemOps(II) == -1;
1922  default:
1923  if (const Function *F = Call->getCalledFunction())
1924  return isLoweredToCall(F);
1925  }
1926  }
1927  return true;
1928  }
1929 
1930  // FPv5 provides conversions between integer, double-precision,
1931  // single-precision, and half-precision formats.
1932  switch (I.getOpcode()) {
1933  default:
1934  break;
1935  case Instruction::FPToSI:
1936  case Instruction::FPToUI:
1937  case Instruction::SIToFP:
1938  case Instruction::UIToFP:
1939  case Instruction::FPTrunc:
1940  case Instruction::FPExt:
1941  return !ST->hasFPARMv8Base();
1942  }
1943 
1944  // FIXME: Unfortunately the approach of checking the Operation Action does
1945  // not catch all cases of Legalization that use library calls. Our
1946  // Legalization step categorizes some transformations into library calls as
1947  // Custom, Expand or even Legal when doing type legalization. So for now
1948  // we have to special case for instance the SDIV of 64bit integers and the
1949  // use of floating point emulation.
1950  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1951  switch (ISD) {
1952  default:
1953  break;
1954  case ISD::SDIV:
1955  case ISD::UDIV:
1956  case ISD::SREM:
1957  case ISD::UREM:
1958  case ISD::SDIVREM:
1959  case ISD::UDIVREM:
1960  return true;
1961  }
1962  }
1963 
1964  // Assume all other non-float operations are supported.
1965  if (!VT.isFloatingPoint())
1966  return false;
1967 
1968  // We'll need a library call to handle most floats when using soft.
1969  if (TLI->useSoftFloat()) {
1970  switch (I.getOpcode()) {
1971  default:
1972  return true;
1973  case Instruction::Alloca:
1974  case Instruction::Load:
1975  case Instruction::Store:
1976  case Instruction::Select:
1977  case Instruction::PHI:
1978  return false;
1979  }
1980  }
1981 
1982  // We'll need a libcall to perform double precision operations on a single
1983  // precision only FPU.
1984  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1985  return true;
1986 
1987  // Likewise for half precision arithmetic.
1988  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1989  return true;
1990 
1991  return false;
1992 }
1993 
1994 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1995                                           AssumptionCache &AC,
1996  TargetLibraryInfo *LibInfo,
1997  HardwareLoopInfo &HWLoopInfo) {
1998  // Low-overhead branches are only supported in the 'low-overhead branch'
1999  // extension of v8.1-m.
2000  if (!ST->hasLOB() || DisableLowOverheadLoops) {
2001  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2002  return false;
2003  }
2004 
2005   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2006     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2007  return false;
2008  }
2009 
2010  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2011  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2012  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2013  return false;
2014  }
2015 
2016  const SCEV *TripCountSCEV =
2017  SE.getAddExpr(BackedgeTakenCount,
2018  SE.getOne(BackedgeTakenCount->getType()));
2019 
2020  // We need to store the trip count in LR, a 32-bit register.
2021  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2022  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2023  return false;
2024  }
2025 
2026  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2027  // point in generating a hardware loop if that's going to happen.
2028 
2029  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2030  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2031  switch (Call->getIntrinsicID()) {
2032  default:
2033  break;
2034  case Intrinsic::start_loop_iterations:
2035  case Intrinsic::test_start_loop_iterations:
2036  case Intrinsic::loop_decrement:
2037  case Intrinsic::loop_decrement_reg:
2038  return true;
2039  }
2040  }
2041  return false;
2042  };
2043 
2044  // Scan the instructions to see if there's any that we know will turn into a
2045  // call or if this loop is already a low-overhead loop or will become a tail
2046  // predicated loop.
2047  bool IsTailPredLoop = false;
2048  auto ScanLoop = [&](Loop *L) {
2049  for (auto *BB : L->getBlocks()) {
2050  for (auto &I : *BB) {
2051  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2052  isa<InlineAsm>(I)) {
2053  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2054  return false;
2055  }
2056  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2057  IsTailPredLoop |=
2058  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2059  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2060  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2061  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2062  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2063  }
2064  }
2065  return true;
2066  };
2067 
2068  // Visit inner loops.
2069  for (auto *Inner : *L)
2070  if (!ScanLoop(Inner))
2071  return false;
2072 
2073  if (!ScanLoop(L))
2074  return false;
2075 
2076  // TODO: Check whether the trip count calculation is expensive. If L is the
2077  // inner loop but we know it has a low trip count, calculating that trip
2078  // count (in the parent loop) may be detrimental.
2079 
2080  LLVMContext &C = L->getHeader()->getContext();
2081  HWLoopInfo.CounterInReg = true;
2082  HWLoopInfo.IsNestingLegal = false;
2083  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2084  HWLoopInfo.CountType = Type::getInt32Ty(C);
2085  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2086  return true;
2087 }
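// Illustrative sketch, not part of the upstream file: the count that the
// low-overhead loop keeps in LR is the backedge-taken count plus one, and the
// code above gives up when the unsigned range of that expression is wider than
// 32 bits, since LR is a 32-bit register. With hypothetical names:
unsigned long long sketchLoLoopTripCount(unsigned long long BackedgeTakenCount) {
  return BackedgeTakenCount + 1;  // the value materialized into LR
}
bool sketchTripCountFitsInLR(unsigned TripCountRangeBits) {
  return TripCountRangeBits <= 32; // wider trip counts disable the HW loop
}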
2088 
2089 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2090   // We don't allow icmps, and because we only look at single-block loops,
2091   // we simply count the icmps, i.e. there should be only one for the backedge.
2092  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2093  return false;
2094   // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
2095   // are not currently canonical, but soon will be. Code without them uses
2096   // icmp, and so is not tail-predicated as per the condition above. In order
2097   // to get the same performance we treat min and max the same as an icmp for
2098   // tailpred purposes for the moment (we often rely on non-tailpred and
2099   // higher VFs to pick more optimal instructions like VQDMULH. They need to
2100   // be recognized directly by the vectorizer).
2101  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2102  if ((II->getIntrinsicID() == Intrinsic::smin ||
2103  II->getIntrinsicID() == Intrinsic::smax ||
2104  II->getIntrinsicID() == Intrinsic::umin ||
2105  II->getIntrinsicID() == Intrinsic::umax) &&
2106  ++ICmpCount > 1)
2107  return false;
2108 
2109  if (isa<FCmpInst>(&I))
2110  return false;
2111 
2112  // We could allow extending/narrowing FP loads/stores, but codegen is
2113  // too inefficient so reject this for now.
2114  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2115  return false;
2116 
2117  // Extends have to be extending-loads
2118  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2119  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2120  return false;
2121 
2122  // Truncs have to be narrowing-stores
2123  if (isa<TruncInst>(&I) )
2124  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2125  return false;
2126 
2127  return true;
2128 }
2129 
2130 // To set up a tail-predicated loop, we need to know the total number of
2131 // elements processed by that loop. Thus, we need to determine the element
2132 // size and:
2133 // 1) it should be uniform for all operations in the vector loop, so we
2134 // e.g. don't want any widening/narrowing operations.
2135 // 2) it should be smaller than i64s because we don't have vector operations
2136 // that work on i64s.
2137 // 3) we don't want elements to be reversed or shuffled, to make sure the
2138 // tail-predication masks/predicates the right lanes.
2139 //
2140 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2141                                  const DataLayout &DL,
2142  const LoopAccessInfo *LAI) {
2143  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2144 
2145  // If there are live-out values, it is probably a reduction. We can predicate
2146  // most reduction operations freely under MVE using a combination of
2147  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2148  // floating point and integer reductions, but don't check for operators
2149  // specifically here. If the value ends up not being a reduction (and so the
2150  // vectorizer cannot tailfold the loop), we should fall back to standard
2151  // vectorization automatically.
2152   SmallVector<Instruction *, 8> LiveOuts;
2153   LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2154   bool ReductionsDisabled =
2155       EnableTailPredication == TailPredication::EnabledNoReductions ||
2156       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2157 
2158  for (auto *I : LiveOuts) {
2159  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2160  !I->getType()->isHalfTy()) {
2161  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2162  "live-out value\n");
2163  return false;
2164  }
2165  if (ReductionsDisabled) {
2166  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2167  return false;
2168  }
2169  }
2170 
2171  // Next, check that all instructions can be tail-predicated.
2172  PredicatedScalarEvolution PSE = LAI->getPSE();
2173  SmallVector<Instruction *, 16> LoadStores;
2174  int ICmpCount = 0;
2175 
2176  for (BasicBlock *BB : L->blocks()) {
2177  for (Instruction &I : BB->instructionsWithoutDebug()) {
2178  if (isa<PHINode>(&I))
2179  continue;
2180  if (!canTailPredicateInstruction(I, ICmpCount)) {
2181  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2182  return false;
2183  }
2184 
2185  Type *T = I.getType();
2186  if (T->getScalarSizeInBits() > 32) {
2187  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2188  return false;
2189  }
2190  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2191         Value *Ptr = getLoadStorePointerOperand(&I);
2192         Type *AccessTy = getLoadStoreType(&I);
2193  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2194  if (NextStride == 1) {
2195  // TODO: for now only allow consecutive strides of 1. We could support
2196  // other strides as long as it is uniform, but let's keep it simple
2197  // for now.
2198  continue;
2199  } else if (NextStride == -1 ||
2200  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2201  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2202         LLVM_DEBUG(dbgs()
2203                    << "Reversed or interleaved stride found, vld2/vst2/vld4 "
2204                       "accesses can't be tail-predicated.\n");
2205  return false;
2206  // TODO: don't tail predicate if there is a reversed load?
2207  } else if (EnableMaskedGatherScatters) {
2208  // Gather/scatters do allow loading from arbitrary strides, at
2209  // least if they are loop invariant.
2210  // TODO: Loop variant strides should in theory work, too, but
2211  // this requires further testing.
2212  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2213  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2214  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2215  if (PSE.getSE()->isLoopInvariant(Step, L))
2216  continue;
2217  }
2218  }
2219       LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2220                            "tail-predicate.\n");
2221  return false;
2222  }
2223  }
2224  }
2225 
2226  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2227  return true;
2228 }
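// Illustrative sketch, not part of the upstream file: the stride handling
// above allows unit strides, rejects reversed (-1) and interleaved (2 or 4)
// accesses outright, and lets any other stride through only when masked
// gather/scatter is enabled and the stride is loop-invariant. As a standalone
// classification with hypothetical names:
enum class SketchStrideKind { TailPredicable, Rejected, NeedsGatherScatter };
SketchStrideKind sketchClassifyStride(long long Stride, unsigned MaxInterleave) {
  if (Stride == 1)
    return SketchStrideKind::TailPredicable;   // plain contiguous access
  if (Stride == -1 || (Stride == 2 && MaxInterleave >= 2) ||
      (Stride == 4 && MaxInterleave >= 4))
    return SketchStrideKind::Rejected;         // reversed or vld2/vld4 shapes
  return SketchStrideKind::NeedsGatherScatter; // loop-invariant strides only
}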
2229 
2230 bool ARMTTIImpl::preferPredicateOverEpilogue(
2231     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
2232     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
2233     InterleavedAccessInfo *IAI) {
2234  if (!EnableTailPredication) {
2235  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2236  return false;
2237  }
2238 
2239  // Creating a predicated vector loop is the first step for generating a
2240  // tail-predicated hardware loop, for which we need the MVE masked
2241  // load/stores instructions:
2242  if (!ST->hasMVEIntegerOps())
2243  return false;
2244 
2245  // For now, restrict this to single block loops.
2246  if (L->getNumBlocks() > 1) {
2247  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2248  "loop.\n");
2249  return false;
2250  }
2251 
2252  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2253 
2254  HardwareLoopInfo HWLoopInfo(L);
2255  if (!HWLoopInfo.canAnalyze(*LI)) {
2256  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2257  "analyzable.\n");
2258  return false;
2259  }
2260 
2261  // This checks if we have the low-overhead branch architecture
2262  // extension, and if we will create a hardware-loop:
2263  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2264  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2265  "profitable.\n");
2266  return false;
2267  }
2268 
2269  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2270  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2271  "a candidate.\n");
2272  return false;
2273  }
2274 
2275  return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
2276 }
2277 
2278 PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
2279   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2280  return PredicationStyle::None;
2281 
2282  // Intrinsic @llvm.get.active.lane.mask is supported.
2283  // It is used in the MVETailPredication pass, which requires the number of
2284  // elements processed by this vector loop to setup the tail-predicated
2285  // loop.
2286  return PredicationStyle::Data;
2287 }
2288 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2289                                          TTI::UnrollingPreferences &UP,
2290                                          OptimizationRemarkEmitter *ORE) {
2291   // Enable upper-bound unrolling universally, not dependent upon the
2292   // conditions below.
2293  UP.UpperBound = true;
2294 
2295  // Only currently enable these preferences for M-Class cores.
2296  if (!ST->isMClass())
2297  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2298 
2299  // Disable loop unrolling for Oz and Os.
2300  UP.OptSizeThreshold = 0;
2301  UP.PartialOptSizeThreshold = 0;
2302  if (L->getHeader()->getParent()->hasOptSize())
2303  return;
2304 
2305  SmallVector<BasicBlock*, 4> ExitingBlocks;
2306  L->getExitingBlocks(ExitingBlocks);
2307  LLVM_DEBUG(dbgs() << "Loop has:\n"
2308  << "Blocks: " << L->getNumBlocks() << "\n"
2309  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2310 
2311   // Only allow one exit other than the latch. This acts as an early exit,
2312   // mirroring the profitability calculation of the runtime unroller.
2313  if (ExitingBlocks.size() > 2)
2314  return;
2315 
2316  // Limit the CFG of the loop body for targets with a branch predictor.
2317  // Allowing 4 blocks permits if-then-else diamonds in the body.
2318  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2319  return;
2320 
2321  // Don't unroll vectorized loops, including the remainder loop
2322  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2323  return;
2324 
2325  // Scan the loop: don't unroll loops with calls as this could prevent
2326  // inlining.
2327  InstructionCost Cost = 0;
2328  for (auto *BB : L->getBlocks()) {
2329  for (auto &I : *BB) {
2330       // Don't unroll vectorised loops. MVE does not benefit from unrolling
2331       // as much as scalar code does.
2332  if (I.getType()->isVectorTy())
2333  return;
2334 
2335  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2336  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2337  if (!isLoweredToCall(F))
2338  continue;
2339  }
2340  return;
2341  }
2342 
2343  SmallVector<const Value*, 4> Operands(I.operand_values());
2344       Cost += getInstructionCost(&I, Operands,
2345                                  TargetTransformInfo::TCK_SizeAndLatency);
2346     }
2347  }
2348 
2349   // On v6m cores, there are very few registers available. We can easily end
2350   // up spilling and reloading more registers in an unrolled loop. Look at the
2351   // number of LCSSA phis as a rough measure of how many registers will need
2352   // to be live out of the loop, reducing the default unroll count if more
2353   // than one value is needed. In the long run, all of this should be learnt
2354   // by a machine.
2355  unsigned UnrollCount = 4;
2356  if (ST->isThumb1Only()) {
2357  unsigned ExitingValues = 0;
2358  SmallVector<BasicBlock *, 4> ExitBlocks;
2359  L->getExitBlocks(ExitBlocks);
2360  for (auto *Exit : ExitBlocks) {
2361       // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2362       // only the last is expected to be needed for address operands.
2363  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2364  return PH.getNumOperands() != 1 ||
2365  !isa<GetElementPtrInst>(PH.getOperand(0));
2366  });
2367  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2368  }
2369  if (ExitingValues)
2370  UnrollCount /= ExitingValues;
2371  if (UnrollCount <= 1)
2372  return;
2373  }
2374 
2375  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2376  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2377 
2378  UP.Partial = true;
2379  UP.Runtime = true;
2380  UP.UnrollRemainder = true;
2381   UP.DefaultUnrollRuntimeCount = UnrollCount;
2382   UP.UnrollAndJam = true;
2383   UP.UnrollAndJamInnerLoopThreshold = 60;
2384 
2385   // Force-unrolling small loops can be very useful because of the
2386   // branch-taken cost of the backedge.
2387  if (Cost < 12)
2388  UP.Force = true;
2389 }
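// Illustrative sketch, not part of the upstream file: on Thumb1-only cores the
// default runtime unroll count of 4 is divided by the number of live-out
// values, so that unrolling does not create extra spills, and unrolling is
// abandoned once that quotient reaches 1 or less. With hypothetical names:
unsigned sketchThumb1UnrollCount(unsigned ExitingValues) {
  unsigned Count = 4;            // default runtime unroll count
  if (ExitingValues)
    Count /= ExitingValues;      // fewer copies when more values are live out
  return Count <= 1 ? 0 : Count; // 0 means "do not unroll"
}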
2390 
2391 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2392                                        TTI::PeelingPreferences &PP) {
2393   BaseT::getPeelingPreferences(L, SE, PP);
2394 }
2395 
2396 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2397  TTI::ReductionFlags Flags) const {
2398  if (!ST->hasMVEIntegerOps())
2399  return false;
2400 
2401  unsigned ScalarBits = Ty->getScalarSizeInBits();
2402  switch (Opcode) {
2403  case Instruction::Add:
2404  return ScalarBits <= 64;
2405  default:
2406  return false;
2407  }
2408 }
2409 
2410 bool ARMTTIImpl::preferPredicatedReductionSelect(
2411     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2412  if (!ST->hasMVEIntegerOps())
2413  return false;
2414  return true;
2415 }
2416 
2417 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2418                                                  int64_t BaseOffset,
2419  bool HasBaseReg, int64_t Scale,
2420  unsigned AddrSpace) const {
2421   TargetLoweringBase::AddrMode AM;
2422   AM.BaseGV = BaseGV;
2423  AM.BaseOffs = BaseOffset;
2424  AM.HasBaseReg = HasBaseReg;
2425  AM.Scale = Scale;
2426  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2427  if (ST->hasFPAO())
2428  return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2429  return 0;
2430  }
2431  return -1;
2432 }
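// Illustrative sketch, not part of the upstream file: the scaling-factor cost
// above is -1 for addressing modes the target cannot encode, 0 for legal
// modes, and 1 for legal modes with a negative offset on cores with the FPAO
// feature, where positive address offsets execute faster. With hypothetical
// names:
int sketchScalingFactorCost(bool ModeIsLegal, bool HasFPAO, long long Scale) {
  if (!ModeIsLegal)
    return -1;                 // unencodable addressing mode
  if (HasFPAO && Scale < 0)
    return 1;                  // positive offsets execute faster
  return 0;
}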
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:30
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1716
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
CmpMode::FP
@ FP
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:446
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:13345
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:890
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:474
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:218
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:691
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:586
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:149
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:246
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:179
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1455
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:64
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1514
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:105
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:370
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:444
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
llvm::LoopVectorizationLegality::getLAI
const LoopAccessInfo * getLAI() const
Definition: LoopVectorizationLegality.h:368
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:711
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1132
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:635
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:86
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getInstructionCost
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:1015
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2219
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:335
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1045
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:153
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::LoopVectorizationLegality
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
Definition: LoopVectorizationLegality.h:241
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:493
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:486
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:198
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:221
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:111
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1555
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5406
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:105
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:747
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:457
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:470
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:124
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1431
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:171
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:903
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:692
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:750
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:458
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6397
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:687
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:190
llvm::TargetTransformInfo::OperandValueInfo
Definition: TargetTransformInfo.h:924
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1877
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1116
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2288
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2157
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1400
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:888
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1094
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1456
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1994
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1267
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:107
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:187
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:157
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::BasicTTIImplBase< ARMTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:789
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:150
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:524
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
llvm::ARMTTIImpl::emitGetActiveLaneMask
PredicationStyle emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2278
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:649
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1187
InlinePriorityMode::Cost
@ Cost
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:386
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:887
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1139
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1677
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2166
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:222
Intrinsics.h
llvm::TargetLoweringBase::AddrMode::HasBaseReg
bool HasBaseReg
Definition: TargetLowering.h:2579
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:482
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:58
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:308
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1449
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI)
Definition: ARMTargetTransformInfo.cpp:2230
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2284
llvm::BasicTTIImplBase< ARMTTIImpl >::getMulAccReductionCost
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2373
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:246
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:329
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:70
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1309
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2237
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1613
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:825
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:69
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:488
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:144
llvm::Instruction
Definition: Instruction.h:42
llvm::InterleavedAccessInfo
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:759
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:111
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2140
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1846
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:33
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:109
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:130
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:332
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::TargetTransformInfo::OperandValueInfo::isUniform
bool isUniform() const
Definition: TargetTransformInfo.h:931
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1137
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:969
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:100
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:540
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:87
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:689
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:120
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:271
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1322
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1126
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1077
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4442
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1105
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:210
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:889
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:86
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:133
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::BasicTTIImplBase< ARMTTIImpl >::isLegalAddressingMode
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:314
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
llvm::GlobalValue
Definition: GlobalValue.h:44
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:89
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:340
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:420
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:967
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1192
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:368
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:102
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:131
uint64_t
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:371
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:820
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1055
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:179
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:88
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4811
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:422
llvm::getPtrStride
Optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1369
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:417
isSSATMinMaxPattern
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:340
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:154
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1741
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:562
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:252
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:939
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:170
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:46
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::SPF_ABS
@ SPF_ABS
Absolute value.
Definition: ValueTracking.h:695
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1687
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:123
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1497
Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:60
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2396
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:962
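A hedged sketch of emitting an overloaded intrinsic call through this interface; the builder, operands, and the choice of llvm.umin here are assumptions for illustration:
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // Emit a call to the overloaded llvm.umin intrinsic on two i32 values.
  static Value *emitUMin(IRBuilder<> &Builder, Value *A, Value *B) {
    return Builder.CreateIntrinsic(Intrinsic::umin,
                                   {Builder.getInt32Ty()}, // overload type
                                   {A, B});
  }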
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1291
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1231
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:449
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:144
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
getType
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
Definition: M68kELFObjectWriter.cpp:48
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:105
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2147
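The signed and unsigned APInt helpers interpret the same bit pattern differently; a small illustrative sketch:
  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  static void signedVsUnsignedExample() {
    APInt A(8, 0xC8);               // 200 unsigned, -56 as a signed 8-bit value
    APInt B(8, 0x64);               // 100 in both interpretations
    APInt S = APIntOps::smin(A, B); // A (-56) under signed comparison
    APInt U = APIntOps::umax(A, B); // A (200) under unsigned comparison
    (void)S; (void)U;
  }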
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1108
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:154
ARMAddressingModes.h
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:49
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:110
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1114
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:707
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:222
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:168
llvm::TargetLoweringBase::AddrMode::BaseGV
GlobalValue * BaseGV
Definition: TargetLowering.h:2577
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:911
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:453
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:693
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1783
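Cost hooks commonly translate the IR opcode to its ISD counterpart before consulting legality or a cost table; a fragment-level sketch, assuming a TargetLoweringBase pointer TLI and an unsigned Opcode are in scope:
  int ISDOpc = TLI->InstructionOpcodeToISD(Opcode); // e.g. Instruction::Mul -> ISD::MUL
  assert(ISDOpc && "Invalid opcode");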
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:633
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:112
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:596
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:694
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2089
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:134
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:645
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13637
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:121
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:144
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:164
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:349
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:182
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2162
llvm::TargetLoweringBase::AddrMode::BaseOffs
int64_t BaseOffs
Definition: TargetLowering.h:2578
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1311
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:129
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
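A one-line sketch of requesting an arbitrary-width integer type, assuming an LLVMContext &Ctx is available:
  IntegerType *I33Ty = Type::getIntNTy(Ctx, 33); // a 33-bit integer type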
isFPSatMinMaxPattern
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:374
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
llvm::None
constexpr std::nullopt_t None
Definition: None.h:27
llvm::TargetLoweringBase::AddrMode::Scale
int64_t Scale
Definition: TargetLowering.h:2580
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:2155
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:295
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:484
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::ARMTTIImpl::getExtendedReductionCost
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1679
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1340
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:216
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:101
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:294
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:435
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1275
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:148
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:105
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:48
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:226
LoopVectorizationLegality.h
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:794
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:612
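For illustration, splatting an 8-bit pattern across a wider APInt (inside any function, with llvm/ADT/APInt.h included):
  APInt Byte(8, 0xAB);
  APInt Splat = APInt::getSplat(32, Byte); // yields the i32 bit pattern 0xABABABAB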
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::PredicationStyle
PredicationStyle
Definition: TargetTransformInfo.h:166
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:116
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:774
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
Definition: TargetLowering.h:1212
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:97
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedReductionCost
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2357
llvm::RISCVMatInt::Imm
@ Imm
Definition: RISCVMatInt.h:23
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:926
llvm::ARMTTIImpl::getScalingFactorCost
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
Definition: ARMTargetTransformInfo.cpp:2417
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:871
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1282
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1907
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:151
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:471
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:772
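These PatternMatch helpers compose; a small sketch that recognizes a value multiplied by itself (the helper name isSquare is illustrative):
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Returns true if V has the form X * X for some value X:
  // m_Value captures the first operand, m_Specific then requires
  // the second operand to be that same value.
  static bool isSquare(Value *V) {
    Value *X = nullptr;
    return match(V, m_Mul(m_Value(X), m_Specific(X)));
  }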
llvm::ARMTTIImpl::getMulAccReductionCost
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1714
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:56
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1342
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1435
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:690
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
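A hedged usage fragment, assuming a Loop *L is in scope; the metadata string shown is just a common example of such a boolean loop attribute:
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return; // the loop is already marked vectorized; nothing more to do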
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
Definition: ARMISelLowering.cpp:21326
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:67
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2391
llvm::TargetLoweringBase::AddrMode
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
Definition: TargetLowering.h:2576
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:50
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast across NumElts elements.
Definition: IRBuilder.cpp:1246
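For illustration, building a 4-lane splat of a scalar through the builder; the builder and scalar value are assumed to exist:
  // Splat a scalar Elt into a <4 x T> vector value, where T is Elt's type.
  static Value *splat4(IRBuilder<> &Builder, Value *Elt) {
    return Builder.CreateVectorSplat(4, Elt, "splat");
  }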
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:403
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5361
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2493
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
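A one-line sketch (inside any function, with llvm/ADT/APInt.h included):
  APInt LowByte = APInt::getLowBitsSet(32, 8); // the i32 value 0x000000FF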
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:47
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:439
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2410
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:875
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:421
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:413
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1497
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:639
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:88
llvm::TargetTransformInfo::OperandValueInfo::isConstant
bool isConstant() const
Definition: TargetTransformInfo.h:928
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
Definition: ScalarEvolution.cpp:8242
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:174
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2251
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:121
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2936
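A hedged sketch of constructing and inserting a binary instruction; the operands and insertion point are assumed to be valid and of matching integer type:
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Build "LHS + RHS" and insert it immediately before InsertBefore.
  static BinaryOperator *emitAdd(Value *LHS, Value *RHS,
                                 Instruction *InsertBefore) {
    return BinaryOperator::Create(Instruction::Add, LHS, RHS, "sum",
                                  InsertBefore);
  }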
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:98
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: ARMTargetTransformInfo.cpp:1202
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:135
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1297
llvm::Optional::value_or
constexpr T value_or(U &&alt) const &
Definition: Optional.h:291
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:104
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:164
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:57
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1655
llvm::PredicationStyle::None
@ None
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:219
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:139
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:636
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:634
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2152
llvm::Data
@ Data
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:332
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:199
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:68
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:21331
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:506
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46