1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/ISDOpcodes.h"
17 #include "llvm/CodeGen/ValueTypes.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/MC/SubtargetFeature.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
31 #include "llvm/Support/MachineValueType.h"
32 #include "llvm/Target/TargetMachine.h"
33 #include "llvm/Transforms/InstCombine/InstCombiner.h"
34 #include "llvm/Transforms/Utils/Local.h"
35 #include "llvm/Transforms/Utils/LoopUtils.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
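/// Illustrative example (not from the original source): a call such as
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 4)
/// becomes a plain aligned load of the same vector type through a bitcast
/// of the pointer operand.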
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67  InstCombiner::BuilderTy &Builder) {
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
128  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
150  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.arg_size() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
158  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnes()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
203  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
235  IC.Builder.SetInsertPoint(User);
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
251 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254  std::function<void(Instruction *, unsigned, APInt, APInt &)>
255  SimplifyAndSetOp) const {
256 
257  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
258  // opcode specifying a Top/Bottom instruction, which can change between
259  // instructions.
260  auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
261  unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262  unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
264  // Only the odd/even lanes of operand 0 will be demanded, depending
265  // on whether this is a top/bottom instruction.
266  APInt DemandedElts =
267  APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268  : APInt::getHighBitsSet(2, 1));
269  SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270  // The other lanes will be defined from the inserted elements.
271  UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272  : APInt::getHighBitsSet(2, 1));
273  return None;
274  };
275 
276  switch (II.getIntrinsicID()) {
277  default:
278  break;
279  case Intrinsic::arm_mve_vcvt_narrow:
280  SimplifyNarrowInstrTopBottom(2);
281  break;
282  case Intrinsic::arm_mve_vqmovn:
283  SimplifyNarrowInstrTopBottom(4);
284  break;
285  case Intrinsic::arm_mve_vshrn:
286  SimplifyNarrowInstrTopBottom(7);
287  break;
288  }
289 
290  return None;
291 }
292 
293 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294  TTI::TargetCostKind CostKind) {
295  assert(Ty->isIntegerTy());
296 
297  unsigned Bits = Ty->getPrimitiveSizeInBits();
298  if (Bits == 0 || Imm.getActiveBits() >= 64)
299  return 4;
300 
301  int64_t SImmVal = Imm.getSExtValue();
302  uint64_t ZImmVal = Imm.getZExtValue();
303  if (!ST->isThumb()) {
304  if ((SImmVal >= 0 && SImmVal < 65536) ||
305  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307  return 1;
308  return ST->hasV6T2Ops() ? 2 : 3;
309  }
310  if (ST->isThumb2()) {
311  if ((SImmVal >= 0 && SImmVal < 65536) ||
312  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314  return 1;
315  return ST->hasV6T2Ops() ? 2 : 3;
316  }
317  // Thumb1: any i8 immediate costs 1.
318  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319  return 1;
320  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321  return 2;
322  // Load from constantpool.
323  return 3;
324 }
325 
326 // Constants smaller than 256 fit in the immediate field of
327 // Thumb1 instructions, so we return a cost of zero; otherwise the cost is 1.
328 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329  const APInt &Imm, Type *Ty) {
330  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331  return 0;
332 
333  return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match to an SSAT instruction. Returns the instruction being
338 // saturated, or null if no saturation pattern was found.
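// Illustrative example (not from the original source): smax(smin(x, 127), -128)
// clamps x to the signed 8-bit range, which SSAT #8 implements directly; the
// value being saturated (x) is what gets returned.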
339 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
340  Value *LHS, *RHS;
341  ConstantInt *C;
342  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
343 
344  if (InstSPF == SPF_SMAX &&
345  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
346  C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
347 
348  auto isSSatMin = [&](Value *MinInst) {
349  if (isa<SelectInst>(MinInst)) {
350  Value *MinLHS, *MinRHS;
351  ConstantInt *MinC;
352  SelectPatternFlavor MinSPF =
353  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
354  if (MinSPF == SPF_SMIN &&
355  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
356  MinC->getValue() == ((-Imm) - 1))
357  return true;
358  }
359  return false;
360  };
361 
362  if (isSSatMin(Inst->getOperand(1)))
363  return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
364  if (Inst->hasNUses(2) &&
365  (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
366  return Inst->getOperand(1);
367  }
368  return nullptr;
369 }
370 
371 // Look for an FP saturation pattern, where the instruction can be simplified
372 // to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
373 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
374  if (Imm.getBitWidth() != 64 ||
375  Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
376  return false;
377  Value *FP = isSSATMinMaxPattern(Inst, Imm);
378  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
379  FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
380  if (!FP)
381  return false;
382  return isa<FPToSIInst>(FP);
383 }
384 
385 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
386  const APInt &Imm, Type *Ty,
387  TTI::TargetCostKind CostKind,
388  Instruction *Inst) {
389  // Division by a constant can be turned into multiplication, but only if we
390  // know it's constant. So it's not so much that the immediate is cheap (it's
391  // not), but that the alternative is worse.
392  // FIXME: this is probably unneeded with GlobalISel.
393  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
394  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
395  Idx == 1)
396  return 0;
397 
398  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
399  // splitting any large offsets.
400  if (Opcode == Instruction::GetElementPtr && Idx != 0)
401  return 0;
402 
403  if (Opcode == Instruction::And) {
404  // UXTB/UXTH
405  if (Imm == 255 || Imm == 65535)
406  return 0;
407  // Conversion to BIC is free, and means we can use ~Imm instead.
408  return std::min(getIntImmCost(Imm, Ty, CostKind),
409  getIntImmCost(~Imm, Ty, CostKind));
410  }
411 
412  if (Opcode == Instruction::Add)
413  // Conversion to SUB is free, and means we can use -Imm instead.
414  return std::min(getIntImmCost(Imm, Ty, CostKind),
415  getIntImmCost(-Imm, Ty, CostKind));
416 
417  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
418  Ty->getIntegerBitWidth() == 32) {
419  int64_t NegImm = -Imm.getSExtValue();
420  if (ST->isThumb2() && NegImm < 1<<12)
421  // icmp X, #-C -> cmn X, #C
422  return 0;
423  if (ST->isThumb() && NegImm < 1<<8)
424  // icmp X, #-C -> adds X, #C
425  return 0;
426  }
427 
428  // xor a, -1 can always be folded to MVN
429  if (Opcode == Instruction::Xor && Imm.isAllOnes())
430  return 0;
431 
432  // Ensure that negative constants of min(max()) or max(min()) patterns that
433  // match to SSAT instructions don't get hoisted.
434  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
435  Ty->getIntegerBitWidth() <= 32) {
436  if (isSSATMinMaxPattern(Inst, Imm) ||
437  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
438  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
439  return 0;
440  }
441 
442  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
443  return 0;
444 
445  // We can convert <= -1 to < 0, which is generally quite cheap.
446  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
447  ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
448  if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
449  return std::min(getIntImmCost(Imm, Ty, CostKind),
450  getIntImmCost(Imm + 1, Ty, CostKind));
451  }
452 
453  return getIntImmCost(Imm, Ty, CostKind);
454 }
455 
456 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
457  TTI::TargetCostKind CostKind,
458  const Instruction *I) {
459  if (CostKind == TTI::TCK_RecipThroughput &&
460  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
461  // FIXME: The vectorizer is highly sensitive to the cost of these
462  // instructions, which suggests that it may be using the costs incorrectly.
463  // But, for now, just make them free to avoid performance regressions for
464  // vector targets.
465  return 0;
466  }
467  return BaseT::getCFInstrCost(Opcode, CostKind, I);
468 }
469 
470 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
471  Type *Src,
472  TTI::CastContextHint CCH,
473  TTI::TargetCostKind CostKind,
474  const Instruction *I) {
475  int ISD = TLI->InstructionOpcodeToISD(Opcode);
476  assert(ISD && "Invalid opcode");
477 
478  // TODO: Allow non-throughput costs that aren't binary.
479  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
480  if (CostKind != TTI::TCK_RecipThroughput)
481  return Cost == 0 ? 0 : 1;
482  return Cost;
483  };
484  auto IsLegalFPType = [this](EVT VT) {
485  EVT EltVT = VT.getScalarType();
486  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
487  (EltVT == MVT::f64 && ST->hasFP64()) ||
488  (EltVT == MVT::f16 && ST->hasFullFP16());
489  };
490 
491  EVT SrcTy = TLI->getValueType(DL, Src);
492  EVT DstTy = TLI->getValueType(DL, Dst);
493 
494  if (!SrcTy.isSimple() || !DstTy.isSimple())
495  return AdjustCost(
496  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
497 
498  // Extending masked loads and truncating masked stores are expensive because
499  // we currently don't split them. This means that we'll likely end up
500  // loading/storing each element individually (hence the high cost).
501  if ((ST->hasMVEIntegerOps() &&
502  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
503  Opcode == Instruction::SExt)) ||
504  (ST->hasMVEFloatOps() &&
505  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
506  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
507  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
508  return 2 * DstTy.getVectorNumElements() *
509  ST->getMVEVectorCostFactor(CostKind);
510 
511  // The extension of other kinds of load is free.
512  if (CCH == TTI::CastContextHint::Normal ||
513  CCH == TTI::CastContextHint::Masked) {
514  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
527  };
528  if (const auto *Entry = ConvertCostTableLookup(
529  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
530  return AdjustCost(Entry->Cost);
531 
532  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
539  // The following extend from a legal type to an illegal type, so we need to
540  // split the load. This introduces an extra load operation, but the
541  // extend is still "free".
548  };
549  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
550  if (const auto *Entry =
551  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
552  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
553  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
554  }
555 
556  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
557  // FPExtends are similar but also require the VCVT instructions.
560  };
561  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
562  if (const auto *Entry =
563  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
564  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
565  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
566  }
567 
568  // The truncate of a store is free. This is the mirror of extends above.
569  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
577  };
578  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
579  if (const auto *Entry =
580  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
581  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
582  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
583  }
584 
585  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
588  };
589  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
590  if (const auto *Entry =
591  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
592  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
593  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
594  }
595  }
596 
597  // NEON vector operations that can extend their inputs.
598  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
599  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
600  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
601  // vaddl
602  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
603  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
604  // vsubl
605  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
606  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
607  // vmull
608  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
609  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
610  // vshll
611  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
612  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
613  };
614 
615  auto *User = cast<Instruction>(*I->user_begin());
616  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
617  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
618  DstTy.getSimpleVT(),
619  SrcTy.getSimpleVT())) {
620  return AdjustCost(Entry->Cost);
621  }
622  }
623 
624  // Single to/from double precision conversions.
625  if (Src->isVectorTy() && ST->hasNEON() &&
626  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
627  DstTy.getScalarType() == MVT::f32) ||
628  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
629  DstTy.getScalarType() == MVT::f64))) {
630  static const CostTblEntry NEONFltDblTbl[] = {
631  // Vector fptrunc/fpext conversions.
632  {ISD::FP_ROUND, MVT::v2f64, 2},
633  {ISD::FP_EXTEND, MVT::v2f32, 2},
634  {ISD::FP_EXTEND, MVT::v4f32, 4}};
635 
636  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
637  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
638  return AdjustCost(LT.first * Entry->Cost);
639  }
640 
641  // Some arithmetic, load and store operations have specific instructions
642  // to cast up/down their types automatically at no extra cost.
643  // TODO: Get these tables to know at least what the related operations are.
644  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
651 
652  // The number of vmovl instructions for the extension.
671 
672  // Operations that we legalize using splitting.
675 
676  // Vector float <-> i32 conversions.
679 
700 
707 
708  // Vector double <-> i32 conversions.
711 
718 
725  };
726 
727  if (SrcTy.isVector() && ST->hasNEON()) {
728  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
729  DstTy.getSimpleVT(),
730  SrcTy.getSimpleVT()))
731  return AdjustCost(Entry->Cost);
732  }
733 
734  // Scalar float to integer conversions.
735  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
756  };
757  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
758  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
759  DstTy.getSimpleVT(),
760  SrcTy.getSimpleVT()))
761  return AdjustCost(Entry->Cost);
762  }
763 
764  // Scalar integer to float conversions.
765  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
786  };
787 
788  if (SrcTy.isInteger() && ST->hasNEON()) {
789  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
790  ISD, DstTy.getSimpleVT(),
791  SrcTy.getSimpleVT()))
792  return AdjustCost(Entry->Cost);
793  }
794 
795  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
796  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
797  // are linearised so take more.
798  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
811  };
812 
813  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
814  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
815  ISD, DstTy.getSimpleVT(),
816  SrcTy.getSimpleVT()))
817  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
818  }
819 
820  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
821  // As a general rule, fp converts that were not matched above are scalarized
822  // and cost 1 vcvt for each lane, so long as the instruction is available.
823  // If not, it will become a series of function calls.
824  const InstructionCost CallCost =
825  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
826  int Lanes = 1;
827  if (SrcTy.isFixedLengthVector())
828  Lanes = SrcTy.getVectorNumElements();
829 
830  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
831  return Lanes;
832  else
833  return Lanes * CallCost;
834  }
835 
836  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
837  SrcTy.isFixedLengthVector()) {
838  // Treat a truncate with a larger than legal source (128 bits for MVE) as
839  // expensive, 2 instructions per lane.
840  if ((SrcTy.getScalarType() == MVT::i8 ||
841  SrcTy.getScalarType() == MVT::i16 ||
842  SrcTy.getScalarType() == MVT::i32) &&
843  SrcTy.getSizeInBits() > 128 &&
844  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
845  return SrcTy.getVectorNumElements() * 2;
846  }
847 
848  // Scalar integer conversion costs.
849  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
850  // i16 -> i64 requires two dependent operations.
851  { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
852 
853  // Truncates on i64 are assumed to be free.
854  { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
855  { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
856  { ISD::TRUNCATE, MVT::i8,  MVT::i64, 0 },
857  { ISD::TRUNCATE, MVT::i1,  MVT::i64, 0 },
858  };
859 
860  if (SrcTy.isInteger()) {
861  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
862  DstTy.getSimpleVT(),
863  SrcTy.getSimpleVT()))
864  return AdjustCost(Entry->Cost);
865  }
866 
867  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
868  ? ST->getMVEVectorCostFactor(CostKind)
869  : 1;
870  return AdjustCost(
871  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
872 }
873 
874 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
875  unsigned Index) {
876  // Penalize inserting into a D-subregister. We end up with a three times
877  // lower estimated throughput on Swift.
878  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
879  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
880  return 3;
881 
882  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
883  Opcode == Instruction::ExtractElement)) {
884  // Cross-class copies are expensive on many microarchitectures,
885  // so assume they are expensive by default.
886  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
887  return 3;
888 
889  // Even if it's not a cross class copy, this likely leads to mixing
890  // of NEON and VFP code and should be therefore penalized.
891  if (ValTy->isVectorTy() &&
892  ValTy->getScalarSizeInBits() <= 32)
893  return std::max<InstructionCost>(
894  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
895  }
896 
897  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
898  Opcode == Instruction::ExtractElement)) {
899  // Integer cross-lane moves are more expensive than float, which can
900  // sometimes just be vmovs. Integer moves involve passing through GPR
901  // registers, causing more of a delay.
902  std::pair<InstructionCost, MVT> LT =
903  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
904  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
905  }
906 
907  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
908 }
909 
910 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
911  Type *CondTy,
912  CmpInst::Predicate VecPred,
913  TTI::TargetCostKind CostKind,
914  const Instruction *I) {
915  int ISD = TLI->InstructionOpcodeToISD(Opcode);
916 
917  // Thumb scalar code size cost for select.
918  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
919  ST->isThumb() && !ValTy->isVectorTy()) {
920  // Assume expensive structs.
921  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
922  return TTI::TCC_Expensive;
923 
924  // Select costs can vary because they:
925  // - may require one or more conditional mov (including an IT),
926  // - can't operate directly on immediates,
927  // - require live flags, which we can't copy around easily.
928  InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
929 
930  // Possible IT instruction for Thumb2, or more for Thumb1.
931  ++Cost;
932 
933  // i1 values may need rematerialising by using mov immediates and/or
934  // flag setting instructions.
935  if (ValTy->isIntegerTy(1))
936  ++Cost;
937 
938  return Cost;
939  }
940 
941  // If this is a vector min/max/abs, use the cost of that intrinsic directly
942  // instead. Hopefully when min/max intrinsics are more prevalent this code
943  // will not be needed.
944  const Instruction *Sel = I;
945  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
946  Sel->hasOneUse())
947  Sel = cast<Instruction>(Sel->user_back());
948  if (Sel && ValTy->isVectorTy() &&
949  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
950  const Value *LHS, *RHS;
951  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
952  unsigned IID = 0;
953  switch (SPF) {
954  case SPF_ABS:
955  IID = Intrinsic::abs;
956  break;
957  case SPF_SMIN:
958  IID = Intrinsic::smin;
959  break;
960  case SPF_SMAX:
961  IID = Intrinsic::smax;
962  break;
963  case SPF_UMIN:
964  IID = Intrinsic::umin;
965  break;
966  case SPF_UMAX:
967  IID = Intrinsic::umax;
968  break;
969  case SPF_FMINNUM:
970  IID = Intrinsic::minnum;
971  break;
972  case SPF_FMAXNUM:
973  IID = Intrinsic::maxnum;
974  break;
975  default:
976  break;
977  }
978  if (IID) {
979  // The ICmp is free, the select gets the cost of the min/max/etc
980  if (Sel != I)
981  return 0;
982  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
983  return getIntrinsicInstrCost(CostAttrs, CostKind);
984  }
985  }
986 
987  // On NEON a vector select gets lowered to vbsl.
988  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
989  // Lowering of some vector selects is currently far from perfect.
990  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
991  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
992  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
994  };
995 
996  EVT SelCondTy = TLI->getValueType(DL, CondTy);
997  EVT SelValTy = TLI->getValueType(DL, ValTy);
998  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
999  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1000  SelCondTy.getSimpleVT(),
1001  SelValTy.getSimpleVT()))
1002  return Entry->Cost;
1003  }
1004 
1005  std::pair<InstructionCost, MVT> LT =
1006  TLI->getTypeLegalizationCost(DL, ValTy);
1007  return LT.first;
1008  }
1009 
1010  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1011  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1012  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1013  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1014  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1015  if (!VecCondTy)
1016  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1017 
1018  // If we don't have mve.fp any fp operations will need to be scalarized.
1019  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1020  // One scalarization insert, one scalarization extract, and the cost of
1021  // the fcmps.
1022  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1023  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1024  VecValTy->getNumElements() *
1025  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1026  VecCondTy->getScalarType(), VecPred, CostKind,
1027  I);
1028  }
1029 
1030  std::pair<InstructionCost, MVT> LT =
1031  TLI->getTypeLegalizationCost(DL, ValTy);
1032  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1033  // There are two types - the input that specifies the type of the compare
1034  // and the output vXi1 type. Because we don't know how the output will be
1035  // split, we may need an expensive shuffle to get two in sync. This has the
1036  // effect of making larger than legal compares (v8i32 for example)
1037  // expensive.
1038  if (LT.second.getVectorNumElements() > 2) {
1039  if (LT.first > 1)
1040  return LT.first * BaseCost +
1041  BaseT::getScalarizationOverhead(VecCondTy, true, false);
1042  return BaseCost;
1043  }
1044  }
1045 
1046  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1047  // for "multiple beats" potentially needed by MVE instructions.
1048  int BaseCost = 1;
1049  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1050  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1051 
1052  return BaseCost *
1053  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1054 }
1055 
1056 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1057  ScalarEvolution *SE,
1058  const SCEV *Ptr) {
1059  // Address computations in vectorized code with non-consecutive addresses will
1060  // likely result in more instructions compared to scalar code where the
1061  // computation can more often be merged into the index mode. The resulting
1062  // extra micro-ops can significantly decrease throughput.
1063  unsigned NumVectorInstToHideOverhead = 10;
1064  int MaxMergeDistance = 64;
1065 
1066  if (ST->hasNEON()) {
1067  if (Ty->isVectorTy() && SE &&
1068  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1069  return NumVectorInstToHideOverhead;
1070 
1071  // In many cases the address computation is not merged into the instruction
1072  // addressing mode.
1073  return 1;
1074  }
1075  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1076 }
1077 
1078 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1079  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1080  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1081  // optimized, else LSR may block tail-predication.
1082  switch (II->getIntrinsicID()) {
1083  case Intrinsic::arm_mve_vctp8:
1084  case Intrinsic::arm_mve_vctp16:
1085  case Intrinsic::arm_mve_vctp32:
1086  case Intrinsic::arm_mve_vctp64:
1087  return true;
1088  default:
1089  break;
1090  }
1091  }
1092  return false;
1093 }
1094 
1095 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1096  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1097  return false;
1098 
1099  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1100  // Don't support v2i1 yet.
1101  if (VecTy->getNumElements() == 2)
1102  return false;
1103 
1104  // We don't support extending fp types.
1105  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1106  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1107  return false;
1108  }
1109 
1110  unsigned EltWidth = DataTy->getScalarSizeInBits();
1111  return (EltWidth == 32 && Alignment >= 4) ||
1112  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1113 }
1114 
1115 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1116  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1117  return false;
1118 
1119  unsigned EltWidth = Ty->getScalarSizeInBits();
1120  return ((EltWidth == 32 && Alignment >= 4) ||
1121  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1122 }
1123 
1124 /// Given a memcpy/memset/memmove instruction, return the number of memory
1125 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1126 /// call is used.
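/// Illustrative example (not from the original source): a 16-byte memcpy that
/// findOptimalMemOpLowering splits into two i64 chunks yields MemOps.size() == 2
/// and Factor == 2 (a load and a store per chunk), so 4 is returned.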
1127 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1128  MemOp MOp;
1129  unsigned DstAddrSpace = ~0u;
1130  unsigned SrcAddrSpace = ~0u;
1131  const Function *F = I->getParent()->getParent();
1132 
1133  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1134  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1135  // If 'size' is not a constant, a library call will be generated.
1136  if (!C)
1137  return -1;
1138 
1139  const unsigned Size = C->getValue().getZExtValue();
1140  const Align DstAlign = *MC->getDestAlign();
1141  const Align SrcAlign = *MC->getSourceAlign();
1142 
1143  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1144  /*IsVolatile*/ false);
1145  DstAddrSpace = MC->getDestAddressSpace();
1146  SrcAddrSpace = MC->getSourceAddressSpace();
1147  }
1148  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1149  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1150  // If 'size' is not a constant, a library call will be generated.
1151  if (!C)
1152  return -1;
1153 
1154  const unsigned Size = C->getValue().getZExtValue();
1155  const Align DstAlign = *MS->getDestAlign();
1156 
1157  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1158  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1159  DstAddrSpace = MS->getDestAddressSpace();
1160  }
1161  else
1162  llvm_unreachable("Expected a memcpy/move or memset!");
1163 
1164  unsigned Limit, Factor = 2;
1165  switch(I->getIntrinsicID()) {
1166  case Intrinsic::memcpy:
1167  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1168  break;
1169  case Intrinsic::memmove:
1170  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1171  break;
1172  case Intrinsic::memset:
1173  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1174  Factor = 1;
1175  break;
1176  default:
1177  llvm_unreachable("Expected a memcpy/move or memset!");
1178  }
1179 
1180  // MemOps will be populated with a list of data types that need to be
1181  // loaded and stored. That's why we multiply the number of elements by 2 to
1182  // get the cost for this memcpy.
1183  std::vector<EVT> MemOps;
1184  if (getTLI()->findOptimalMemOpLowering(
1185  MemOps, Limit, MOp, DstAddrSpace,
1186  SrcAddrSpace, F->getAttributes()))
1187  return MemOps.size() * Factor;
1188 
1189  // If we can't find an optimal memop lowering, return the default cost
1190  return -1;
1191 }
1192 
1193 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1194  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1195 
1196  // To model the cost of a library call, we assume 1 for the call, and
1197  // 3 for the argument setup.
1198  if (NumOps == -1)
1199  return 4;
1200  return NumOps;
1201 }
1202 
1203 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1204  VectorType *Tp, ArrayRef<int> Mask,
1205  int Index, VectorType *SubTp,
1206  ArrayRef<const Value *> Args) {
1207  Kind = improveShuffleKindFromMask(Kind, Mask);
1208  if (ST->hasNEON()) {
1209  if (Kind == TTI::SK_Broadcast) {
1210  static const CostTblEntry NEONDupTbl[] = {
1211  // VDUP handles these cases.
1218 
1223 
1224  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1225  if (const auto *Entry =
1226  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1227  return LT.first * Entry->Cost;
1228  }
1229  if (Kind == TTI::SK_Reverse) {
1230  static const CostTblEntry NEONShuffleTbl[] = {
1231  // Reverse shuffle cost one instruction if we are shuffling within a
1232  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1239 
1244 
1245  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1246  if (const auto *Entry =
1247  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1248  return LT.first * Entry->Cost;
1249  }
1250  if (Kind == TTI::SK_Select) {
1251  static const CostTblEntry NEONSelShuffleTbl[] = {
1252  // Select shuffle cost table for ARM. Cost is the number of
1253  // instructions
1254  // required to create the shuffled vector.
1255 
1260 
1264 
1266 
1268 
1269  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1270  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1271  ISD::VECTOR_SHUFFLE, LT.second))
1272  return LT.first * Entry->Cost;
1273  }
1274  }
1275  if (ST->hasMVEIntegerOps()) {
1276  if (Kind == TTI::SK_Broadcast) {
1277  static const CostTblEntry MVEDupTbl[] = {
1278  // VDUP handles these cases.
1284 
1285  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1286  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1287  LT.second))
1288  return LT.first * Entry->Cost *
1289  ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1290  }
1291 
1292  if (!Mask.empty()) {
1293  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1294  if (Mask.size() <= LT.second.getVectorNumElements() &&
1295  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1296  isVREVMask(Mask, LT.second, 64)))
1297  return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1298  }
1299  }
1300 
1301  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1302  ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1303  : 1;
1304  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1305 }
1306 
1307 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1308  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1309  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1310  TTI::OperandValueProperties Opd1PropInfo,
1311  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1312  const Instruction *CxtI) {
1313  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1314  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1315  // Make operations on i1 relatively expensive as this often involves
1316  // combining predicates. AND and XOR should be easier to handle with IT
1317  // blocks.
1318  switch (ISDOpcode) {
1319  default:
1320  break;
1321  case ISD::AND:
1322  case ISD::XOR:
1323  return 2;
1324  case ISD::OR:
1325  return 3;
1326  }
1327  }
1328 
1329  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1330 
1331  if (ST->hasNEON()) {
1332  const unsigned FunctionCallDivCost = 20;
1333  const unsigned ReciprocalDivCost = 10;
1334  static const CostTblEntry CostTbl[] = {
1335  // Division.
1336  // These costs are somewhat random. Choose a cost of 20 to indicate that
1337  // vectorizing division (added function call) is going to be very expensive.
1338  // Double registers types.
1339  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1340  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1341  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1342  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1343  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1344  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1345  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1346  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1347  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1348  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1349  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1350  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1351  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1352  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1353  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1354  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1355  // Quad register types.
1356  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1357  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1358  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1359  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1360  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1361  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1362  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1363  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1364  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1365  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1366  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1367  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1368  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1369  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1370  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1371  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1372  // Multiplication.
1373  };
1374 
1375  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1376  return LT.first * Entry->Cost;
1377 
1378  InstructionCost Cost = BaseT::getArithmeticInstrCost(
1379  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1380 
1381  // This is somewhat of a hack. The problem that we are facing is that SROA
1382  // creates a sequence of shift, and, or instructions to construct values.
1383  // These sequences are recognized by the ISel and have zero-cost. Not so for
1384  // the vectorized code. Because we have support for v2i64 but not i64 those
1385  // sequences look particularly beneficial to vectorize.
1386  // To work around this we increase the cost of v2i64 operations to make them
1387  // seem less beneficial.
1388  if (LT.second == MVT::v2i64 &&
1389  Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1390  Cost += 4;
1391 
1392  return Cost;
1393  }
1394 
1395  // If this operation is a shift on arm/thumb2, it might well be folded into
1396  // the following instruction, hence having a cost of 0.
1397  auto LooksLikeAFreeShift = [&]() {
1398  if (ST->isThumb1Only() || Ty->isVectorTy())
1399  return false;
1400 
1401  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1402  return false;
1403  if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1404  return false;
1405 
1406  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1407  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1408  case Instruction::Add:
1409  case Instruction::Sub:
1410  case Instruction::And:
1411  case Instruction::Xor:
1412  case Instruction::Or:
1413  case Instruction::ICmp:
1414  return true;
1415  default:
1416  return false;
1417  }
1418  };
1419  if (LooksLikeAFreeShift())
1420  return 0;
1421 
1422  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1423  // for "multiple beats" potentially needed by MVE instructions.
1424  int BaseCost = 1;
1425  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1426  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1427 
1428  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1429  // without treating floats as more expensive than scalars or increasing the
1430  // costs for custom operations. The result is also multiplied by the
1431  // MVEVectorCostFactor where appropriate.
1432  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1433  return LT.first * BaseCost;
1434 
1435  // Else this is expand, assume that we need to scalarize this op.
1436  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1437  unsigned Num = VTy->getNumElements();
1438  InstructionCost Cost =
1439  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1440  // Return the cost of multiple scalar invocations plus the cost of
1441  // inserting and extracting the values.
1442  SmallVector<Type *> Tys(Args.size(), Ty);
1443  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1444  }
1445 
1446  return BaseCost;
1447 }
1448 
1449 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1450  MaybeAlign Alignment,
1451  unsigned AddressSpace,
1452  TTI::TargetCostKind CostKind,
1453  const Instruction *I) {
1454  // TODO: Handle other cost kinds.
1455  if (CostKind != TTI::TCK_RecipThroughput)
1456  return 1;
1457 
1458  // Type legalization can't handle structs
1459  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1460  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1461  CostKind);
1462 
1463  if (ST->hasNEON() && Src->isVectorTy() &&
1464  (Alignment && *Alignment != Align(16)) &&
1465  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1466  // Unaligned loads/stores are extremely inefficient.
1467  // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1468  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1469  return LT.first * 4;
1470  }
1471 
1472  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1473  // Same for stores.
1474  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1475  ((Opcode == Instruction::Load && I->hasOneUse() &&
1476  isa<FPExtInst>(*I->user_begin())) ||
1477  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1478  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1479  Type *DstTy =
1480  Opcode == Instruction::Load
1481  ? (*I->user_begin())->getType()
1482  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1483  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1484  DstTy->getScalarType()->isFloatTy())
1485  return ST->getMVEVectorCostFactor(CostKind);
1486  }
1487 
1488  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1489  ? ST->getMVEVectorCostFactor(CostKind)
1490  : 1;
1491  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1492  CostKind, I);
1493 }
1494 
1495 InstructionCost
1496 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1497  unsigned AddressSpace,
1498  TTI::TargetCostKind CostKind) {
1499  if (ST->hasMVEIntegerOps()) {
1500  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1501  return ST->getMVEVectorCostFactor(CostKind);
1502  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1503  return ST->getMVEVectorCostFactor(CostKind);
1504  }
1505  if (!isa<FixedVectorType>(Src))
1506  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1507  CostKind);
1508  // Scalar cost, which is currently very high due to the inefficiency of the
1509  // generated code.
1510  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1511 }
1512 
1513 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1514  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1515  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1516  bool UseMaskForCond, bool UseMaskForGaps) {
1517  assert(Factor >= 2 && "Invalid interleave factor");
1518  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1519 
1520  // vldN/vstN doesn't support vector types of i64/f64 element.
1521  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1522 
1523  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1524  !UseMaskForCond && !UseMaskForGaps) {
1525  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1526  auto *SubVecTy =
1527  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1528 
1529  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1530  // Accesses having vector types that are a multiple of 128 bits can be
1531  // matched to more than one vldN/vstN instruction.
1532  int BaseCost =
1533  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1534  if (NumElts % Factor == 0 &&
1535  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1536  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1537 
1538  // Some smaller than legal interleaved patterns are cheap as we can make
1539  // use of the vmovn or vrev patterns to interleave a standard load. This is
1540  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1541  // promoted differently). The cost of 2 here is then a load and vrev or
1542  // vmovn.
1543  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1544  VecTy->isIntOrIntVectorTy() &&
1545  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1546  return 2 * BaseCost;
1547  }
1548 
1549  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1550  Alignment, AddressSpace, CostKind,
1551  UseMaskForCond, UseMaskForGaps);
1552 }
1553 
1554 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1555  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1556  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1557  using namespace PatternMatch;
1558  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1559  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1560  Alignment, CostKind, I);
1561 
1562  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1563  auto *VTy = cast<FixedVectorType>(DataTy);
1564 
1565  // TODO: Splitting, once we do that.
1566 
1567  unsigned NumElems = VTy->getNumElements();
1568  unsigned EltSize = VTy->getScalarSizeInBits();
1569  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1570 
1571  // For now, it is assumed that for the MVE gather instructions the loads are
1572  // all effectively serialised. This means the cost is the scalar cost
1573  // multiplied by the number of elements being loaded. This is possibly very
1574  // conservative, but even so we still end up vectorising loops because the
1575  // cost per iteration for many loops is lower than for scalar loops.
1576  InstructionCost VectorCost =
1577  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1578  // The scalarization cost should be a lot higher. We use the number of vector
1579  // elements plus the scalarization overhead.
1580  InstructionCost ScalarCost =
1581  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1582  BaseT::getScalarizationOverhead(VTy, false, true);
1583 
1584  if (EltSize < 8 || Alignment < EltSize / 8)
1585  return ScalarCost;
1586 
1587  unsigned ExtSize = EltSize;
1588  // Check whether there's a single user that asks for an extended type
1589  if (I != nullptr) {
1590  // Depending on the caller of this function, a gather instruction will
1591  // either have opcode Instruction::Load or be a call to the masked_gather
1592  // intrinsic.
1593  if ((I->getOpcode() == Instruction::Load ||
1594  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1595  I->hasOneUse()) {
1596  const User *Us = *I->users().begin();
1597  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1598  // only allow valid type combinations
1599  unsigned TypeSize =
1600  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1601  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1602  (TypeSize == 16 && EltSize == 8)) &&
1603  TypeSize * NumElems == 128) {
1604  ExtSize = TypeSize;
1605  }
1606  }
1607  }
1608  // Check whether the input data needs to be truncated
1609  TruncInst *T;
1610  if ((I->getOpcode() == Instruction::Store ||
1611  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1612  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1613  // Only allow valid type combinations
1614  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1615  if (((EltSize == 16 && TypeSize == 32) ||
1616  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1617  TypeSize * NumElems == 128)
1618  ExtSize = TypeSize;
1619  }
1620  }
1621 
1622  if (ExtSize * NumElems != 128 || NumElems < 4)
1623  return ScalarCost;
1624 
1625  // Any (aligned) i32 gather will not need to be scalarised.
1626  if (ExtSize == 32)
1627  return VectorCost;
1628  // For smaller types, we need to ensure that the gep's inputs are correctly
1629  // extended from a small enough value. Other sizes (including i64) are
1630  // scalarized for now.
1631  if (ExtSize != 8 && ExtSize != 16)
1632  return ScalarCost;
1633 
1634  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1635  Ptr = BC->getOperand(0);
1636  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1637  if (GEP->getNumOperands() != 2)
1638  return ScalarCost;
1639  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1640  // Scale needs to be correct (which is only relevant for i16s).
1641  if (Scale != 1 && Scale * 8 != ExtSize)
1642  return ScalarCost;
1643  // And we need to zext (not sext) the indexes from a small enough type.
1644  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1645  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1646  return VectorCost;
1647  }
1648  return ScalarCost;
1649  }
1650  return ScalarCost;
1651 }
1652 
1653 InstructionCost
1654 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1655  Optional<FastMathFlags> FMF,
1656  TTI::TargetCostKind CostKind) {
1657  if (TTI::requiresOrderedReduction(FMF))
1658  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1659 
1660  EVT ValVT = TLI->getValueType(DL, ValTy);
1661  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1662  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1663  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1664 
1665  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1666 
1667  static const CostTblEntry CostTblAdd[]{
1668  {ISD::ADD, MVT::v16i8, 1},
1669  {ISD::ADD, MVT::v8i16, 1},
1670  {ISD::ADD, MVT::v4i32, 1},
1671  };
1672  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1673  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1674 
1675  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1676 }
1677 
1678 InstructionCost
1679 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1680  Type *ResTy, VectorType *ValTy,
1681  TTI::TargetCostKind CostKind) {
1682  EVT ValVT = TLI->getValueType(DL, ValTy);
1683  EVT ResVT = TLI->getValueType(DL, ResTy);
1684 
1685  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1686  std::pair<InstructionCost, MVT> LT =
1687  TLI->getTypeLegalizationCost(DL, ValTy);
1688 
1689  // The legal cases are:
1690  // VADDV u/s 8/16/32
1691  // VMLAV u/s 8/16/32
1692  // VADDLV u/s 32
1693  // VMLALV u/s 16/32
1694  // Codegen currently cannot always handle larger than legal vectors very
1695  // well, especially for predicated reductions where the mask needs to be
1696  // split, so restrict to 128bit or smaller input types.
1697  unsigned RevVTSize = ResVT.getSizeInBits();
1698  if (ValVT.getSizeInBits() <= 128 &&
1699  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1700  (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1701  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1702  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1703  }
1704 
1705  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1706  CostKind);
1707 }
1708 
1709 InstructionCost
1710 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1711  TTI::TargetCostKind CostKind) {
1712  switch (ICA.getID()) {
1713  case Intrinsic::get_active_lane_mask:
1714  // Currently we make a somewhat optimistic assumption that
1715  // active_lane_masks are always free. In reality they may be freely folded
1716  // into a tail-predicated loop, expanded into a VCTP, or expanded into a lot
1717  // of add/icmp code. We may need to improve this in the future, but being
1718  // able to detect whether it is free or not involves looking at a lot of
1719  // other code. We currently assume that the vectorizer inserted these, and
1720  // knew what it was doing in adding one.
1721  if (ST->hasMVEIntegerOps())
1722  return 0;
1723  break;
1724  case Intrinsic::sadd_sat:
1725  case Intrinsic::ssub_sat:
1726  case Intrinsic::uadd_sat:
1727  case Intrinsic::usub_sat: {
1728  if (!ST->hasMVEIntegerOps())
1729  break;
1730  Type *VT = ICA.getReturnType();
1731 
1732  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1733  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1734  LT.second == MVT::v16i8) {
1735  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1736  // need to extend the type, as it uses shr(qadd(shl, shl)).
1737  unsigned Instrs =
1738  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1739  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1740  }
1741  break;
1742  }
1743  case Intrinsic::abs:
1744  case Intrinsic::smin:
1745  case Intrinsic::smax:
1746  case Intrinsic::umin:
1747  case Intrinsic::umax: {
1748  if (!ST->hasMVEIntegerOps())
1749  break;
1750  Type *VT = ICA.getReturnType();
1751 
1752  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1753  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1754  LT.second == MVT::v16i8)
1755  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1756  break;
1757  }
1758  case Intrinsic::minnum:
1759  case Intrinsic::maxnum: {
1760  if (!ST->hasMVEFloatOps())
1761  break;
1762  Type *VT = ICA.getReturnType();
1763  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1764  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1765  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1766  break;
1767  }
1768  case Intrinsic::fptosi_sat:
1769  case Intrinsic::fptoui_sat: {
1770  if (ICA.getArgTypes().empty())
1771  break;
1772  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1773  auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
1774  EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1775  // Check for the legal types, with the correct subtarget features.
1776  if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1777  (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1778  (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1779  return LT.first;
1780 
1781  // Equally for MVE vector types
1782  if (ST->hasMVEFloatOps() &&
1783  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1784  LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1785  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1786 
1787  // Otherwise we use a legal convert followed by a min+max
1788  if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1789  (ST->hasFP64() && LT.second == MVT::f64) ||
1790  (ST->hasFullFP16() && LT.second == MVT::f16) ||
1791  (ST->hasMVEFloatOps() &&
1792  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1793  LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1794  Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1795  LT.second.getScalarSizeInBits());
1796  InstructionCost Cost =
1797  LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1798  IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1799  : Intrinsic::umin,
1800  LegalTy, {LegalTy, LegalTy});
1801  Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1802  IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1803  : Intrinsic::umax,
1804  LegalTy, {LegalTy, LegalTy});
1805  Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1806  return LT.first * Cost;
1807  }
1808  break;
1809  }
1810  }
1811 
1812  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1813 }
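// Editor's note (illustrative sketch, not part of the original source): for
// @llvm.fptosi.sat.i16.f32 on a VFP2-capable target, LT.second is f32 and MTy
// is i16, so the "legal convert followed by a min+max" path above applies:
// the cost is 1 for the conversion (or the MVE cost factor for vector types)
// plus the costs of an i32 smin and smax that clamp to the i16 range, all
// scaled by LT.first.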
1814 
1815 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1816  if (!F->isIntrinsic())
1817  return BaseT::isLoweredToCall(F);
1818 
1819  // Assume all Arm-specific intrinsics map to an instruction.
1820  if (F->getName().startswith("llvm.arm"))
1821  return false;
1822 
1823  switch (F->getIntrinsicID()) {
1824  default: break;
1825  case Intrinsic::powi:
1826  case Intrinsic::sin:
1827  case Intrinsic::cos:
1828  case Intrinsic::pow:
1829  case Intrinsic::log:
1830  case Intrinsic::log10:
1831  case Intrinsic::log2:
1832  case Intrinsic::exp:
1833  case Intrinsic::exp2:
1834  return true;
1835  case Intrinsic::sqrt:
1836  case Intrinsic::fabs:
1837  case Intrinsic::copysign:
1838  case Intrinsic::floor:
1839  case Intrinsic::ceil:
1840  case Intrinsic::trunc:
1841  case Intrinsic::rint:
1842  case Intrinsic::nearbyint:
1843  case Intrinsic::round:
1844  case Intrinsic::canonicalize:
1845  case Intrinsic::lround:
1846  case Intrinsic::llround:
1847  case Intrinsic::lrint:
1848  case Intrinsic::llrint:
1849  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1850  return true;
1851  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1852  return true;
1853  // Some operations can be handled by vector instructions and assume
1854  // unsupported vectors will be expanded into supported scalar ones.
1855  // TODO Handle scalar operations properly.
1856  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1857  case Intrinsic::masked_store:
1858  case Intrinsic::masked_load:
1859  case Intrinsic::masked_gather:
1860  case Intrinsic::masked_scatter:
1861  return !ST->hasMVEIntegerOps();
1862  case Intrinsic::sadd_with_overflow:
1863  case Intrinsic::uadd_with_overflow:
1864  case Intrinsic::ssub_with_overflow:
1865  case Intrinsic::usub_with_overflow:
1866  case Intrinsic::sadd_sat:
1867  case Intrinsic::uadd_sat:
1868  case Intrinsic::ssub_sat:
1869  case Intrinsic::usub_sat:
1870  return false;
1871  }
1872 
1873  return BaseT::isLoweredToCall(F);
1874 }
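// Editor's note (illustrative, not part of the original source): on a
// single-precision-only FPU such as a Cortex-M4 (FPv4-SP), @llvm.sqrt.f64
// returns a double with ST->hasFP64() == false, so it is reported as lowered
// to a call (the sqrt libcall), whereas @llvm.sqrt.f32 can be selected to
// VSQRT.F32 and is not.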
1875 
1876 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1877  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1878  EVT VT = TLI->getValueType(DL, I.getType(), true);
1879  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1880  return true;
1881 
1882  // Check if an intrinsic will be lowered to a call and assume that any
1883  // other CallInst will generate a bl.
1884  if (auto *Call = dyn_cast<CallInst>(&I)) {
1885  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1886  switch(II->getIntrinsicID()) {
1887  case Intrinsic::memcpy:
1888  case Intrinsic::memset:
1889  case Intrinsic::memmove:
1890  return getNumMemOps(II) == -1;
1891  default:
1892  if (const Function *F = Call->getCalledFunction())
1893  return isLoweredToCall(F);
1894  }
1895  }
1896  return true;
1897  }
1898 
1899  // FPv5 provides conversions between integer, double-precision,
1900  // single-precision, and half-precision formats.
1901  switch (I.getOpcode()) {
1902  default:
1903  break;
1904  case Instruction::FPToSI:
1905  case Instruction::FPToUI:
1906  case Instruction::SIToFP:
1907  case Instruction::UIToFP:
1908  case Instruction::FPTrunc:
1909  case Instruction::FPExt:
1910  return !ST->hasFPARMv8Base();
1911  }
1912 
1913  // FIXME: Unfortunately the approach of checking the Operation Action does
1914  // not catch all cases of Legalization that use library calls. Our
1915  // Legalization step categorizes some transformations into library calls as
1916  // Custom, Expand or even Legal when doing type legalization. So for now
1917  // we have to special case for instance the SDIV of 64bit integers and the
1918  // use of floating point emulation.
1919  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1920  switch (ISD) {
1921  default:
1922  break;
1923  case ISD::SDIV:
1924  case ISD::UDIV:
1925  case ISD::SREM:
1926  case ISD::UREM:
1927  case ISD::SDIVREM:
1928  case ISD::UDIVREM:
1929  return true;
1930  }
1931  }
1932 
1933  // Assume all other non-float operations are supported.
1934  if (!VT.isFloatingPoint())
1935  return false;
1936 
1937  // We'll need a library call to handle most floats when using soft.
1938  if (TLI->useSoftFloat()) {
1939  switch (I.getOpcode()) {
1940  default:
1941  return true;
1942  case Instruction::Alloca:
1943  case Instruction::Load:
1944  case Instruction::Store:
1945  case Instruction::Select:
1946  case Instruction::PHI:
1947  return false;
1948  }
1949  }
1950 
1951  // We'll need a libcall to perform double precision operations on a single
1952  // precision only FPU.
1953  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1954  return true;
1955 
1956  // Likewise for half precision arithmetic.
1957  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1958  return true;
1959 
1960  return false;
1961 }
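// Editor's note (illustrative, not part of the original source): a 64-bit
// integer division such as
//   %q = sdiv i64 %a, %b
// is caught by the ISD::SDIV case above and reported as a call: on AEABI
// targets it is expanded to the __aeabi_ldivmod runtime routine rather than
// being selected to an instruction, which would clobber LR inside a
// hardware loop.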
1962 
1963 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1964  AssumptionCache &AC,
1965  TargetLibraryInfo *LibInfo,
1966  HardwareLoopInfo &HWLoopInfo) {
1967  // Low-overhead branches are only supported in the 'low-overhead branch'
1968  // extension of v8.1-m.
1969  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1970  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1971  return false;
1972  }
1973 
1974  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1975  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1976  return false;
1977  }
1978 
1979  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1980  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1981  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1982  return false;
1983  }
1984 
1985  const SCEV *TripCountSCEV =
1986  SE.getAddExpr(BackedgeTakenCount,
1987  SE.getOne(BackedgeTakenCount->getType()));
1988 
1989  // We need to store the trip count in LR, a 32-bit register.
1990  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1991  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1992  return false;
1993  }
1994 
1995  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1996  // point in generating a hardware loop if that's going to happen.
1997 
1998  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1999  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2000  switch (Call->getIntrinsicID()) {
2001  default:
2002  break;
2003  case Intrinsic::start_loop_iterations:
2004  case Intrinsic::test_start_loop_iterations:
2005  case Intrinsic::loop_decrement:
2006  case Intrinsic::loop_decrement_reg:
2007  return true;
2008  }
2009  }
2010  return false;
2011  };
2012 
2013  // Scan the instructions to see if there's any that we know will turn into a
2014  // call or if this loop is already a low-overhead loop or will become a tail
2015  // predicated loop.
2016  bool IsTailPredLoop = false;
2017  auto ScanLoop = [&](Loop *L) {
2018  for (auto *BB : L->getBlocks()) {
2019  for (auto &I : *BB) {
2020  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2021  isa<InlineAsm>(I)) {
2022  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2023  return false;
2024  }
2025  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2026  IsTailPredLoop |=
2027  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2028  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2029  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2030  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2031  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2032  }
2033  }
2034  return true;
2035  };
2036 
2037  // Visit inner loops.
2038  for (auto Inner : *L)
2039  if (!ScanLoop(Inner))
2040  return false;
2041 
2042  if (!ScanLoop(L))
2043  return false;
2044 
2045  // TODO: Check whether the trip count calculation is expensive. If L is the
2046  // inner loop but we know it has a low trip count, calculating that trip
2047  // count (in the parent loop) may be detrimental.
2048 
2049  LLVMContext &C = L->getHeader()->getContext();
2050  HWLoopInfo.CounterInReg = true;
2051  HWLoopInfo.IsNestingLegal = false;
2052  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2053  HWLoopInfo.CountType = Type::getInt32Ty(C);
2054  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2055  return true;
2056 }
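// Editor's note (illustrative sketch, not part of the original source): when
// this hook succeeds, the HardwareLoops pass can rewrite the loop control
// roughly as
//   %n   = call i32 @llvm.start.loop.iterations.i32(i32 %tripcount)
//   ...
//   %rem = call i32 @llvm.loop.decrement.reg.i32(i32 %iv, i32 1)
//   %cmp = icmp ne i32 %rem, 0
//   br i1 %cmp, label %loop, label %exit
// which later becomes a DLS/LE (or WLS/LE when PerformEntryTest is set)
// low-overhead loop during codegen.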
2057 
2058 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2059  // We don't allow icmp's, and because we only look at single block loops,
2060  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2061  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2062  return false;
2063  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2064  // not currently canonical, but soon will be. Code without them uses icmp, and
2065  // so is not tail predicated as per the condition above. In order to get the
2066  // same performance we treat min and max the same as an icmp for tailpred
2067  // purposes for the moment (we often rely on non-tailpred and higher VF's to
2068  // pick more optimal instructions like VQDMULH. They need to be recognized
2069  // directly by the vectorizer).
2070  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2071  if ((II->getIntrinsicID() == Intrinsic::smin ||
2072  II->getIntrinsicID() == Intrinsic::smax ||
2073  II->getIntrinsicID() == Intrinsic::umin ||
2074  II->getIntrinsicID() == Intrinsic::umax) &&
2075  ++ICmpCount > 1)
2076  return false;
2077 
2078  if (isa<FCmpInst>(&I))
2079  return false;
2080 
2081  // We could allow extending/narrowing FP loads/stores, but codegen is
2082  // too inefficient so reject this for now.
2083  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2084  return false;
2085 
2086  // Extends have to be extending-loads
2087  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2088  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2089  return false;
2090 
2091  // Truncs have to be narrowing-stores
2092  if (isa<TruncInst>(&I) )
2093  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2094  return false;
2095 
2096  return true;
2097 }
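// Editor's note (illustrative, not part of the original source): under these
// rules a widening pattern such as
//   %l = load <8 x i16>, <8 x i16>* %p
//   %e = sext <8 x i16> %l to <8 x i32>
// is accepted because the sext operand is a single-use load, whereas a sext
// of any other value is rejected, since the number of elements processed per
// vector would then differ between operations in the loop body.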
2098 
2099 // To set up a tail-predicated loop, we need to know the total number of
2100 // elements processed by that loop. Thus, we need to determine the element
2101 // size and:
2102 // 1) it should be uniform for all operations in the vector loop, so we
2103 // e.g. don't want any widening/narrowing operations.
2104 // 2) it should be smaller than i64s because we don't have vector operations
2105 // that work on i64s.
2106 // 3) we don't want elements to be reversed or shuffled, to make sure the
2107 // tail-predication masks/predicates the right lanes.
2108 //
2109 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2110  const DataLayout &DL,
2111  const LoopAccessInfo *LAI) {
2112  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2113 
2114  // If there are live-out values, it is probably a reduction. We can predicate
2115  // most reduction operations freely under MVE using a combination of
2116  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2117  // floating point and integer reductions, but don't check for operators
2118  // specifically here. If the value ends up not being a reduction (and so the
2119  // vectorizer cannot tailfold the loop), we should fall back to standard
2120  // vectorization automatically.
2121  SmallVector<Instruction *, 8>
2122  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2123  bool ReductionsDisabled =
2124  EnableTailPredication == TailPredication::EnabledNoReductions ||
2125  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2126 
2127  for (auto *I : LiveOuts) {
2128  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2129  !I->getType()->isHalfTy()) {
2130  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2131  "live-out value\n");
2132  return false;
2133  }
2134  if (ReductionsDisabled) {
2135  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2136  return false;
2137  }
2138  }
2139 
2140  // Next, check that all instructions can be tail-predicated.
2141  PredicatedScalarEvolution PSE = LAI->getPSE();
2142  SmallVector<Instruction *, 16> LoadStores;
2143  int ICmpCount = 0;
2144 
2145  for (BasicBlock *BB : L->blocks()) {
2146  for (Instruction &I : BB->instructionsWithoutDebug()) {
2147  if (isa<PHINode>(&I))
2148  continue;
2149  if (!canTailPredicateInstruction(I, ICmpCount)) {
2150  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2151  return false;
2152  }
2153 
2154  Type *T = I.getType();
2155  if (T->getScalarSizeInBits() > 32) {
2156  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2157  return false;
2158  }
2159  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2160  Value *Ptr = getLoadStorePointerOperand(&I);
2161  Type *AccessTy = getLoadStoreType(&I);
2162  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2163  if (NextStride == 1) {
2164  // TODO: for now only allow consecutive strides of 1. We could support
2165  // other strides as long as it is uniform, but let's keep it simple
2166  // for now.
2167  continue;
2168  } else if (NextStride == -1 ||
2169  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2170  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2171  LLVM_DEBUG(dbgs()
2172  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2173  "be tail-predicated\n.");
2174  return false;
2175  // TODO: don't tail predicate if there is a reversed load?
2176  } else if (EnableMaskedGatherScatters) {
2177  // Gather/scatters do allow loading from arbitrary strides, at
2178  // least if they are loop invariant.
2179  // TODO: Loop variant strides should in theory work, too, but
2180  // this requires further testing.
2181  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2182  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2183  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2184  if (PSE.getSE()->isLoopInvariant(Step, L))
2185  continue;
2186  }
2187  }
2188  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2189  "tail-predicate\n.");
2190  return false;
2191  }
2192  }
2193  }
2194 
2195  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2196  return true;
2197 }
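// Editor's note (illustrative, not part of the original source): a simple
// copy loop, "a[i] = b[i]" with unit-stride i8/i16/i32 accesses, has
// getPtrStride() == 1 for every load and store and can be tail-predicated,
// while a loop that reads every other element (stride 2) maps onto VLD2-style
// interleaving, which cannot be predicated, so this function rejects it.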
2198 
2199 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2200  ScalarEvolution &SE,
2201  AssumptionCache &AC,
2202  TargetLibraryInfo *TLI,
2203  DominatorTree *DT,
2204  const LoopAccessInfo *LAI) {
2205  if (!EnableTailPredication) {
2206  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2207  return false;
2208  }
2209 
2210  // Creating a predicated vector loop is the first step for generating a
2211  // tail-predicated hardware loop, for which we need the MVE masked
2212  // load/stores instructions:
2213  if (!ST->hasMVEIntegerOps())
2214  return false;
2215 
2216  // For now, restrict this to single block loops.
2217  if (L->getNumBlocks() > 1) {
2218  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2219  "loop.\n");
2220  return false;
2221  }
2222 
2223  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2224 
2225  HardwareLoopInfo HWLoopInfo(L);
2226  if (!HWLoopInfo.canAnalyze(*LI)) {
2227  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2228  "analyzable.\n");
2229  return false;
2230  }
2231 
2232  // This checks if we have the low-overhead branch architecture
2233  // extension, and if we will create a hardware-loop:
2234  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2235  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2236  "profitable.\n");
2237  return false;
2238  }
2239 
2240  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2241  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2242  "a candidate.\n");
2243  return false;
2244  }
2245 
2246  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2247 }
2248 
2249 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2250  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2251  return false;
2252 
2253  // Intrinsic @llvm.get.active.lane.mask is supported.
2254  // It is used in the MVETailPredication pass, which requires the number of
2255  // elements processed by this vector loop to setup the tail-predicated
2256  // loop.
2257  return true;
2258 }
2259 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2260  TTI::UnrollingPreferences &UP,
2261  OptimizationRemarkEmitter *ORE) {
2262  // Enable Upper bound unrolling universally, not dependent upon the conditions
2263  // below.
2264  UP.UpperBound = true;
2265 
2266  // Only currently enable these preferences for M-Class cores.
2267  if (!ST->isMClass())
2268  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2269 
2270  // Disable loop unrolling for Oz and Os.
2271  UP.OptSizeThreshold = 0;
2272  UP.PartialOptSizeThreshold = 0;
2273  if (L->getHeader()->getParent()->hasOptSize())
2274  return;
2275 
2276  SmallVector<BasicBlock*, 4> ExitingBlocks;
2277  L->getExitingBlocks(ExitingBlocks);
2278  LLVM_DEBUG(dbgs() << "Loop has:\n"
2279  << "Blocks: " << L->getNumBlocks() << "\n"
2280  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2281 
2282  // Only allow another exit other than the latch. This acts as an early exit
2283  // as it mirrors the profitability calculation of the runtime unroller.
2284  if (ExitingBlocks.size() > 2)
2285  return;
2286 
2287  // Limit the CFG of the loop body for targets with a branch predictor.
2288  // Allowing 4 blocks permits if-then-else diamonds in the body.
2289  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2290  return;
2291 
2292  // Don't unroll vectorized loops, including the remainder loop
2293  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2294  return;
2295 
2296  // Scan the loop: don't unroll loops with calls as this could prevent
2297  // inlining.
2298  InstructionCost Cost = 0;
2299  for (auto *BB : L->getBlocks()) {
2300  for (auto &I : *BB) {
2301  // Don't unroll vectorised loops. MVE does not benefit from it as much as
2302  // scalar code.
2303  if (I.getType()->isVectorTy())
2304  return;
2305 
2306  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2307  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2308  if (!isLoweredToCall(F))
2309  continue;
2310  }
2311  return;
2312  }
2313 
2314  SmallVector<const Value*, 4> Operands(I.operand_values());
2315  Cost +=
2316  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2317  }
2318  }
2319 
2320  // On v6m cores, there are very few registers available. We can easily end up
2321  // spilling and reloading more registers in an unrolled loop. Look at the
2322  // number of LCSSA phis as a rough measure of how many registers will need to
2323  // be live out of the loop, reducing the default unroll count if more than 1
2324  // value is needed. In the long run, all of this should be learnt by a
2325  // machine.
2326  unsigned UnrollCount = 4;
2327  if (ST->isThumb1Only()) {
2328  unsigned ExitingValues = 0;
2329  SmallVector<BasicBlock *, 4> ExitBlocks;
2330  L->getExitBlocks(ExitBlocks);
2331  for (auto *Exit : ExitBlocks) {
2332  // Count the number of LCSSA phis. Exclude values coming from GEP's as
2333  // only the last is expected to be needed for address operands.
2334  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2335  return PH.getNumOperands() != 1 ||
2336  !isa<GetElementPtrInst>(PH.getOperand(0));
2337  });
2338  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2339  }
2340  if (ExitingValues)
2341  UnrollCount /= ExitingValues;
2342  if (UnrollCount <= 1)
2343  return;
2344  }
2345 
2346  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2347  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2348 
2349  UP.Partial = true;
2350  UP.Runtime = true;
2351  UP.UnrollRemainder = true;
2352  UP.DefaultUnrollRuntimeCount = UnrollCount;
2353  UP.UnrollAndJam = true;
2354  UP.UnrollAndJamInnerLoopThreshold = 60;
2355 
2356  // Force unrolling of small loops can be very useful because of the branch
2357  // taken cost of the backedge.
2358  if (Cost < 12)
2359  UP.Force = true;
2360 }
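// Editor's note (illustrative worked example, not part of the original
// source): on a Thumb1-only core, a loop whose exit block has two LCSSA phis
// that are not fed solely by GEPs (ExitingValues == 2) gets its default
// runtime unroll count reduced from 4 to 4 / 2 = 2; with four or more such
// live-out values the count drops to <= 1 and unrolling is skipped.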
2361 
2362 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2363  TTI::PeelingPreferences &PP) {
2364  BaseT::getPeelingPreferences(L, SE, PP);
2365 }
2366 
2367 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2368  TTI::ReductionFlags Flags) const {
2369  if (!ST->hasMVEIntegerOps())
2370  return false;
2371 
2372  unsigned ScalarBits = Ty->getScalarSizeInBits();
2373  switch (Opcode) {
2374  case Instruction::Add:
2375  return ScalarBits <= 64;
2376  default:
2377  return false;
2378  }
2379 }
2380 
2381 bool ARMTTIImpl::preferPredicatedReductionSelect(
2382  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2383  if (!ST->hasMVEIntegerOps())
2384  return false;
2385  return true;
2386 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1678
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
CmpMode::FP
@ FP
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:459
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12978
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:874
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:487
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:680
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:586
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:263
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:179
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1386
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:61
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1513
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:102
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:370
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:430
T
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:700
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:970
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1086
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:643
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2176
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:337
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1490
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1044
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:777
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:506
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:499
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:179
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:111
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1554
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5375
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:98
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:746
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:456
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:483
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:126
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1169
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1423
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:537
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:164
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:859
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:681
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::Optional
Definition: APInt.h:33
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1808
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:749
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6192
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:676
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1706
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1091
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2259
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2149
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1289
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:872
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1095
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1368
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1963
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1206
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:104
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:186
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:153
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1366
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:317
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:513
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:645
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1139
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:385
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:871
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1091
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1639
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2106
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:222
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:495
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:308
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2219
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:456
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:328
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2243
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1602
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:501
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:144
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:108
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2109
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:347
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1815
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1478
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:33
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:919
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:106
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2249
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:320
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1091
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:921
llvm::None
const NoneType None
Definition: None.h:24
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:539
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:678
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
NOTE: This is soft-deprecated. Please use isAllOnes() instead.
Definition: APInt.h:356
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:279
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1306
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1127
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1078
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2199
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4407
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1115
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:873
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1449
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1679
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:340
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:422
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:956
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1193
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1307
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:368
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
uint64_t
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:371
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:820
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1056
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:88
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4780
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:893
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:424
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:430
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: ARMTargetTransformInfo.cpp:1203
isSSATMinMaxPattern
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:339
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:160
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:898
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1710
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:558
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::SPF_ABS
@ SPF_ABS
Absolute value.
Definition: ValueTracking.h:684
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1649
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:121
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1496
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2367
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:857
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1240
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:441
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:890
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2139
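The signed/unsigned orderings diverge once the top bit is set; a small illustrative sketch (smax and umax appear further down in this index).
#include "llvm/ADT/APInt.h"
using namespace llvm;
static void apintOrderingDemo() {
  APInt A(8, 200), B(8, 100);               // 200 reads as -56 when signed i8
  const APInt &SMin = APIntOps::smin(A, B); // A: the signed minimum (-56)
  const APInt &UMax = APIntOps::umax(A, B); // A again: the unsigned maximum (200)
  (void)SMin; (void)UMax;
}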
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1086
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:154
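Illustrative only: building a <4 x i32> EVT and inspecting it with the queries listed around here (isVector, getScalarSizeInBits, getScalarType).
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
static void evtDemo(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4); // <4 x i32>
  bool IsVec = VT.isVector();                  // true
  uint64_t Bits = VT.getScalarSizeInBits();    // 32
  (void)IsVec; (void)Bits;
}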
ARMAddressingModes.h
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1115
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:698
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:910
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:466
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:682
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1728
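Sketch of the usual pattern in cost hooks: translate the IR opcode into its ISD node before consulting legality or cost tables; names below are illustrative.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Map the IR 'add' opcode to its SelectionDAG node (ISD::ADD).
static int addAsISD(const TargetLoweringBase *TLI) {
  return TLI->InstructionOpcodeToISD(Instruction::Add);
}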
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:641
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:585
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:683
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2058
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:663
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13248
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
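A hedged sketch of how target instCombineIntrinsic hooks typically use this routine; the helper name foldTo and the Replacement value are illustrative, not from this file.
#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;
// Rewrite every use of the intrinsic call to Replacement; the combiner then
// erases the now-dead call.
static Optional<Instruction *> foldTo(InstCombiner &IC, IntrinsicInst &II,
                                      Value *Replacement) {
  return IC.replaceInstUsesWith(II, Replacement);
}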
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:144
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:164
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:347
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2154
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1295
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:129
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
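Illustrative sketch: any bit width can be requested, not just the power-of-two widths that have MVT enumerators.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
// Build an i17 type; odd widths like this are legal at the IR level.
static IntegerType *makeI17(LLVMContext &Ctx) {
  return Type::getIntNTy(Ctx, 17);
}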
isFPSatMinMaxPattern
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:373
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:2094
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:295
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:497
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1339
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:197
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:293
ISDOpcodes.h
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
llvm::TypeSize
Definition: TypeSize.h:421
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1224
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:145
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:802
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:612
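Small sketch: broadcasting an 8-bit pattern across a 32-bit APInt.
#include "llvm/ADT/APInt.h"
using namespace llvm;
static APInt byteSplat() {
  APInt Byte(8, 0x01);
  return APInt::getSplat(32, Byte); // 0x01010101
}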
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:774
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
Definition: TargetLowering.h:1186
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:94
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:881
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:871
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1231
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1876
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:470
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1380
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:679
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1084
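Sketch of a typical query against loop metadata; "llvm.loop.unroll.disable" is a standard attribute name used here purely as an example.
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;
// True if the loop carries explicit metadata asking not to be unrolled.
static bool unrollExplicitlyDisabled(const Loop *L) {
  return getBooleanLoopAttribute(L, "llvm.loop.unroll.disable");
}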
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
Definition: ARMISelLowering.cpp:21205
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1447
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2362
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1180
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:45
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast NumElts times.
Definition: IRBuilder.cpp:1114
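Hedged sketch: splatting a scalar into a fixed 4-lane vector, e.g. to feed a uniform operand to a vector instruction; the names are illustrative.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Produce a <4 x Ty> value with Scalar in every lane.
static Value *splat4(IRBuilder<> &B, Value *Scalar) {
  return B.CreateVectorSplat(4, Scalar);
}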
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:393
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5330
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2454
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
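Sketch: building a low-bits mask, the APInt analogue of (1u << N) - 1.
#include "llvm/ADT/APInt.h"
using namespace llvm;
static APInt lowByteMask() {
  return APInt::getLowBitsSet(32, 8); // 0x000000FF
}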
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:452
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2381
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:874
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:423
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1459
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:635
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
Definition: ScalarEvolution.cpp:7907
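Sketch of the common guard before relying on a trip count; hasKnownTripCount is a hypothetical name.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
// SCEVCouldNotCompute signals that no predictable count exists.
static bool hasKnownTripCount(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  return !isa<SCEVCouldNotCompute>(BTC);
}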
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2208
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2778
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1281
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2287
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:101
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:164
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:852
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1654
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:139
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:644
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:642
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2144
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:334
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:199
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:21210
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:892
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:506
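Sketch: the alloc size is the stride between consecutive array elements, padding included, which can differ from the type's store size or primitive bit width.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;
// Bytes between element i and element i+1 of a [N x Ty] array.
static TypeSize elementStride(const DataLayout &DL, Type *Ty) {
  return DL.getTypeAllocSize(Ty);
}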
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46