1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67  InstCombiner::BuilderTy &Builder) {
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
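// Illustrative example, assuming a constant alignment operand (hypothetical IR):
//   %v = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %p, i32 1)
// is rewritten by the helper above into
//   %c = bitcast i8* %p to <2 x i64>*
//   %v = load <2 x i64>, <2 x i64>* %c, align 8
// where the alignment used is the larger of the intrinsic's alignment argument
// and the known memory alignment passed in as MemAlign (assumed to be 8 here).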
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
128  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
150  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.getNumArgOperands() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
158  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnesValue()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
203  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
251 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252  TTI::TargetCostKind CostKind) {
253  assert(Ty->isIntegerTy());
254 
255  unsigned Bits = Ty->getPrimitiveSizeInBits();
256  if (Bits == 0 || Imm.getActiveBits() >= 64)
257  return 4;
258 
259  int64_t SImmVal = Imm.getSExtValue();
260  uint64_t ZImmVal = Imm.getZExtValue();
261  if (!ST->isThumb()) {
262  if ((SImmVal >= 0 && SImmVal < 65536) ||
263  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265  return 1;
266  return ST->hasV6T2Ops() ? 2 : 3;
267  }
268  if (ST->isThumb2()) {
269  if ((SImmVal >= 0 && SImmVal < 65536) ||
270  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272  return 1;
273  return ST->hasV6T2Ops() ? 2 : 3;
274  }
275  // Thumb1, any i8 imm cost 1.
276  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277  return 1;
278  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279  return 2;
280  // Load from constantpool.
281  return 3;
282 }
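// For example, following the logic above: in ARM mode #255 costs 1 (it fits a
// modified-immediate encoding), while a constant such as 0x12345678 that fits
// neither encoding directly nor inverted costs 2 with v6T2 (a movw/movt pair)
// and 3 otherwise.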
283 
284 // Constants smaller than 256 fit in the immediate field of
285 // Thumb1 instructions so we return a zero cost and 1 otherwise.
286 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287  const APInt &Imm, Type *Ty) {
288  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289  return 0;
290 
291  return 1;
292 }
293 
294 // Checks whether Inst is part of a min(max()) or max(min()) pattern
295 // that will match to an SSAT instruction
296 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297  Value *LHS, *RHS;
298  ConstantInt *C;
299  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300 
301  if (InstSPF == SPF_SMAX &&
302  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303  C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304 
305  auto isSSatMin = [&](Value *MinInst) {
306  if (isa<SelectInst>(MinInst)) {
307  Value *MinLHS, *MinRHS;
308  ConstantInt *MinC;
309  SelectPatternFlavor MinSPF =
310  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311  if (MinSPF == SPF_SMIN &&
312  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313  MinC->getValue() == ((-Imm) - 1))
314  return true;
315  }
316  return false;
317  };
318 
319  if (isSSatMin(Inst->getOperand(1)) ||
320  (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321  isSSatMin(*(++Inst->user_begin())))))
322  return true;
323  }
324  return false;
325 }
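// For example, with Imm == -128 this recognises the signed-byte clamp
//   smax(smin(x, 127), -128)  (or the smin(smax(x, -128), 127) form),
// which the backend can select as an SSAT #8 instruction.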
326 
327 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328  const APInt &Imm, Type *Ty,
329  TTI::TargetCostKind CostKind,
330  Instruction *Inst) {
331  // Division by a constant can be turned into multiplication, but only if we
332  // know it's constant. So it's not so much that the immediate is cheap (it's
333  // not), but that the alternative is worse.
334  // FIXME: this is probably unneeded with GlobalISel.
335  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337  Idx == 1)
338  return 0;
339 
340  if (Opcode == Instruction::And) {
341  // UXTB/UXTH
342  if (Imm == 255 || Imm == 65535)
343  return 0;
344  // Conversion to BIC is free, and means we can use ~Imm instead.
345  return std::min(getIntImmCost(Imm, Ty, CostKind),
346  getIntImmCost(~Imm, Ty, CostKind));
347  }
348 
349  if (Opcode == Instruction::Add)
350  // Conversion to SUB is free, and means we can use -Imm instead.
351  return std::min(getIntImmCost(Imm, Ty, CostKind),
352  getIntImmCost(-Imm, Ty, CostKind));
353 
354  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
355  Ty->getIntegerBitWidth() == 32) {
356  int64_t NegImm = -Imm.getSExtValue();
357  if (ST->isThumb2() && NegImm < 1<<12)
358  // icmp X, #-C -> cmn X, #C
359  return 0;
360  if (ST->isThumb() && NegImm < 1<<8)
361  // icmp X, #-C -> adds X, #C
362  return 0;
363  }
364 
365  // xor a, -1 can always be folded to MVN
366  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
367  return 0;
368 
369  // Ensure that negative constants of min(max()) or max(min()) patterns that
370  // match to SSAT instructions don't get hoisted
371  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
372  Ty->getIntegerBitWidth() <= 32) {
373  if (isSSATMinMaxPattern(Inst, Imm) ||
374  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
375  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
376  return 0;
377  }
378 
379  return getIntImmCost(Imm, Ty, CostKind);
380 }
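// For example, `and X, 255` and `and X, 65535` are free here (UXTB/UXTH),
// `xor X, -1` is free because it selects to MVN, and a negative icmp immediate
// such as `icmp X, -42` is free on Thumb2 since it becomes `cmn X, #42`.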
381 
382 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
383  TTI::TargetCostKind CostKind,
384  const Instruction *I) {
385  if (CostKind == TTI::TCK_RecipThroughput &&
386  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
387  // FIXME: The vectorizer is highly sensitive to the cost of these
388  // instructions, which suggests that it may be using the costs incorrectly.
389  // But, for now, just make them free to avoid performance regressions for
390  // vector targets.
391  return 0;
392  }
393  return BaseT::getCFInstrCost(Opcode, CostKind, I);
394 }
395 
396 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
397  Type *Src,
398  TTI::CastContextHint CCH,
399  TTI::TargetCostKind CostKind,
400  const Instruction *I) {
401  int ISD = TLI->InstructionOpcodeToISD(Opcode);
402  assert(ISD && "Invalid opcode");
403 
404  // TODO: Allow non-throughput costs that aren't binary.
405  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
406  if (CostKind != TTI::TCK_RecipThroughput)
407  return Cost == 0 ? 0 : 1;
408  return Cost;
409  };
410  auto IsLegalFPType = [this](EVT VT) {
411  EVT EltVT = VT.getScalarType();
412  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
413  (EltVT == MVT::f64 && ST->hasFP64()) ||
414  (EltVT == MVT::f16 && ST->hasFullFP16());
415  };
416 
417  EVT SrcTy = TLI->getValueType(DL, Src);
418  EVT DstTy = TLI->getValueType(DL, Dst);
419 
420  if (!SrcTy.isSimple() || !DstTy.isSimple())
421  return AdjustCost(
422  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
423 
424  // Extending masked loads/truncating masked stores are expensive because we
425  // currently don't split them. This means that we'll likely end up
426  // loading/storing each element individually (hence the high cost).
427  if ((ST->hasMVEIntegerOps() &&
428  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
429  Opcode == Instruction::SExt)) ||
430  (ST->hasMVEFloatOps() &&
431  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
432  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
433  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
434  return 2 * DstTy.getVectorNumElements() *
435  ST->getMVEVectorCostFactor(CostKind);
436 
437  // The extend of other kinds of load is free
438  if (CCH == TTI::CastContextHint::Normal ||
439  CCH == TTI::CastContextHint::Masked) {
440  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
453  };
454  if (const auto *Entry = ConvertCostTableLookup(
455  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
456  return AdjustCost(Entry->Cost);
457 
458  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
465  // The following extend from a legal type to an illegal type, so need to
466  // split the load. This introduces an extra load operation, but the
467  // extend is still "free".
474  };
475  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
476  if (const auto *Entry =
477  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
478  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
479  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
480  }
481 
482  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
483  // FPExtends are similar but also require the VCVT instructions.
486  };
487  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
488  if (const auto *Entry =
489  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
490  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
491  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
492  }
493 
494  // The truncate of a store is free. This is the mirror of extends above.
495  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
503  };
504  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
505  if (const auto *Entry =
506  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
507  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
508  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
509  }
510 
511  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
514  };
515  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
516  if (const auto *Entry =
517  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
518  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
519  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
520  }
521  }
522 
523  // NEON vector operations that can extend their inputs.
524  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
525  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
526  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
527  // vaddl
528  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
529  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
530  // vsubl
531  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
532  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
533  // vmull
534  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
535  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
536  // vshll
537  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
538  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
539  };
540 
541  auto *User = cast<Instruction>(*I->user_begin());
542  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
543  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
544  DstTy.getSimpleVT(),
545  SrcTy.getSimpleVT())) {
546  return AdjustCost(Entry->Cost);
547  }
548  }
549 
550  // Single to/from double precision conversions.
551  if (Src->isVectorTy() && ST->hasNEON() &&
552  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
553  DstTy.getScalarType() == MVT::f32) ||
554  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
555  DstTy.getScalarType() == MVT::f64))) {
556  static const CostTblEntry NEONFltDblTbl[] = {
557  // Vector fptrunc/fpext conversions.
560  {ISD::FP_EXTEND, MVT::v4f32, 4}};
561 
562  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
563  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
564  return AdjustCost(LT.first * Entry->Cost);
565  }
566 
567  // Some arithmetic, load and store operations have specific instructions
568  // to cast up/down their types automatically at no extra cost.
569  // TODO: Get these tables to know at least what the related operations are.
570  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
577 
578  // The number of vmovl instructions for the extension.
597 
598  // Operations that we legalize using splitting.
601 
602  // Vector float <-> i32 conversions.
605 
626 
633 
634  // Vector double <-> i32 conversions.
637 
644 
651  };
652 
653  if (SrcTy.isVector() && ST->hasNEON()) {
654  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
655  DstTy.getSimpleVT(),
656  SrcTy.getSimpleVT()))
657  return AdjustCost(Entry->Cost);
658  }
659 
660  // Scalar float to integer conversions.
661  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
682  };
683  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
684  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
685  DstTy.getSimpleVT(),
686  SrcTy.getSimpleVT()))
687  return AdjustCost(Entry->Cost);
688  }
689 
690  // Scalar integer to float conversions.
691  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
712  };
713 
714  if (SrcTy.isInteger() && ST->hasNEON()) {
715  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
716  ISD, DstTy.getSimpleVT(),
717  SrcTy.getSimpleVT()))
718  return AdjustCost(Entry->Cost);
719  }
720 
721  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
722  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
723  // are linearised so take more.
724  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
737  };
738 
739  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
740  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
741  ISD, DstTy.getSimpleVT(),
742  SrcTy.getSimpleVT()))
743  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
744  }
745 
746  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
747  // As a general rule, fp converts that were not matched above are scalarized
748  // and cost 1 vcvt for each lane, so long as the instruction is available.
749  // If not it will become a series of function calls.
750  const InstructionCost CallCost =
751  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
752  int Lanes = 1;
753  if (SrcTy.isFixedLengthVector())
754  Lanes = SrcTy.getVectorNumElements();
755 
756  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
757  return Lanes;
758  else
759  return Lanes * CallCost;
760  }
761 
762  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
763  SrcTy.isFixedLengthVector()) {
764  // Treat a truncate with larger than legal source (128bits for MVE) as
765  // expensive, 2 instructions per lane.
766  if ((SrcTy.getScalarType() == MVT::i8 ||
767  SrcTy.getScalarType() == MVT::i16 ||
768  SrcTy.getScalarType() == MVT::i32) &&
769  SrcTy.getSizeInBits() > 128 &&
770  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
771  return SrcTy.getVectorNumElements() * 2;
772  }
773 
774  // Scalar integer conversion costs.
775  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
776  // i16 -> i64 requires two dependent operations.
778 
779  // Truncates on i64 are assumed to be free.
782  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
784  };
785 
786  if (SrcTy.isInteger()) {
787  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
788  DstTy.getSimpleVT(),
789  SrcTy.getSimpleVT()))
790  return AdjustCost(Entry->Cost);
791  }
792 
793  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
794  ? ST->getMVEVectorCostFactor(CostKind)
795  : 1;
796  return AdjustCost(
797  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
798 }
799 
800 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
801  unsigned Index) {
802  // Penalize inserting into a D-subregister. We end up with a three times
803  // lower estimated throughput on swift.
804  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
805  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
806  return 3;
807 
808  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
809  Opcode == Instruction::ExtractElement)) {
810  // Cross-class copies are expensive on many microarchitectures,
811  // so assume they are expensive by default.
812  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
813  return 3;
814 
815  // Even if it's not a cross class copy, this likely leads to mixing
816  // of NEON and VFP code and should be therefore penalized.
817  if (ValTy->isVectorTy() &&
818  ValTy->getScalarSizeInBits() <= 32)
819  return std::max<InstructionCost>(
820  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
821  }
822 
823  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
824  Opcode == Instruction::ExtractElement)) {
825  // Integer cross-lane moves are more expensive than float, which can
826  // sometimes just be vmovs. Integers involve being passed to GPR registers,
827  // causing more of a delay.
828  std::pair<InstructionCost, MVT> LT =
829  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
830  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
831  }
832 
833  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
834 }
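// For example, with MVE an extract or insert of an i32 lane is costed at 4
// because the value is moved through a GPR, while the same operation on a
// float lane is costed at 1, as it can often be a simple vmov.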
835 
836 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
837  Type *CondTy,
838  CmpInst::Predicate VecPred,
839  TTI::TargetCostKind CostKind,
840  const Instruction *I) {
841  int ISD = TLI->InstructionOpcodeToISD(Opcode);
842 
843  // Thumb scalar code size cost for select.
844  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
845  ST->isThumb() && !ValTy->isVectorTy()) {
846  // Assume expensive structs.
847  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
848  return TTI::TCC_Expensive;
849 
850  // Select costs can vary because they:
851  // - may require one or more conditional mov (including an IT),
852  // - can't operate directly on immediates,
853  // - require live flags, which we can't copy around easily.
854  InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
855 
856  // Possible IT instruction for Thumb2, or more for Thumb1.
857  ++Cost;
858 
859  // i1 values may need rematerialising by using mov immediates and/or
860  // flag setting instructions.
861  if (ValTy->isIntegerTy(1))
862  ++Cost;
863 
864  return Cost;
865  }
866 
867  // If this is a vector min/max/abs, use the cost of that intrinsic directly
868  // instead. Hopefully when min/max intrinsics are more prevalent this code
869  // will not be needed.
870  const Instruction *Sel = I;
871  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
872  Sel->hasOneUse())
873  Sel = cast<Instruction>(Sel->user_back());
874  if (Sel && ValTy->isVectorTy() &&
875  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
876  const Value *LHS, *RHS;
877  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
878  unsigned IID = 0;
879  switch (SPF) {
880  case SPF_ABS:
881  IID = Intrinsic::abs;
882  break;
883  case SPF_SMIN:
884  IID = Intrinsic::smin;
885  break;
886  case SPF_SMAX:
887  IID = Intrinsic::smax;
888  break;
889  case SPF_UMIN:
890  IID = Intrinsic::umin;
891  break;
892  case SPF_UMAX:
893  IID = Intrinsic::umax;
894  break;
895  case SPF_FMINNUM:
896  IID = Intrinsic::minnum;
897  break;
898  case SPF_FMAXNUM:
899  IID = Intrinsic::maxnum;
900  break;
901  default:
902  break;
903  }
904  if (IID) {
905  // The ICmp is free, the select gets the cost of the min/max/etc
906  if (Sel != I)
907  return 0;
908  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
909  return getIntrinsicInstrCost(CostAttrs, CostKind);
910  }
911  }
912 
913  // On NEON a vector select gets lowered to vbsl.
914  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
915  // Lowering of some vector selects is currently far from perfect.
916  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
917  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
918  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
920  };
921 
922  EVT SelCondTy = TLI->getValueType(DL, CondTy);
923  EVT SelValTy = TLI->getValueType(DL, ValTy);
924  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
925  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
926  SelCondTy.getSimpleVT(),
927  SelValTy.getSimpleVT()))
928  return Entry->Cost;
929  }
930 
931  std::pair<InstructionCost, MVT> LT =
932  TLI->getTypeLegalizationCost(DL, ValTy);
933  return LT.first;
934  }
935 
936  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
937  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
938  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
939  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
940  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
941  if (!VecCondTy)
942  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
943 
944  // If we don't have mve.fp any fp operations will need to be scalarized.
945  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
946  // One scalarization insert, one scalarization extract, and the cost of the
947  // fcmps.
948  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
949  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
950  VecValTy->getNumElements() *
951  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
952  VecCondTy->getScalarType(), VecPred, CostKind,
953  I);
954  }
955 
956  std::pair<InstructionCost, MVT> LT =
957  TLI->getTypeLegalizationCost(DL, ValTy);
958  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
959  // There are two types - the input that specifies the type of the compare
960  // and the output vXi1 type. Because we don't know how the output will be
961  // split, we may need an expensive shuffle to get two in sync. This has the
962  // effect of making larger than legal compares (v8i32 for example)
963  // expensive.
964  if (LT.second.getVectorNumElements() > 2) {
965  if (LT.first > 1)
966  return LT.first * BaseCost +
967  BaseT::getScalarizationOverhead(VecCondTy, true, false);
968  return BaseCost;
969  }
970  }
971 
972  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
973  // for "multiple beats" potentially needed by MVE instructions.
974  int BaseCost = 1;
975  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
976  BaseCost = ST->getMVEVectorCostFactor(CostKind);
977 
978  return BaseCost *
979  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
980 }
981 
982 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
983  ScalarEvolution *SE,
984  const SCEV *Ptr) {
985  // Address computations in vectorized code with non-consecutive addresses will
986  // likely result in more instructions compared to scalar code where the
987  // computation can more often be merged into the index mode. The resulting
988  // extra micro-ops can significantly decrease throughput.
989  unsigned NumVectorInstToHideOverhead = 10;
990  int MaxMergeDistance = 64;
991 
992  if (ST->hasNEON()) {
993  if (Ty->isVectorTy() && SE &&
994  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
995  return NumVectorInstToHideOverhead;
996 
997  // In many cases the address computation is not merged into the instruction
998  // addressing mode.
999  return 1;
1000  }
1001  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1002 }
1003 
1004 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1005  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1006  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1007  // optimized, else LSR may block tail-predication.
1008  switch (II->getIntrinsicID()) {
1009  case Intrinsic::arm_mve_vctp8:
1010  case Intrinsic::arm_mve_vctp16:
1011  case Intrinsic::arm_mve_vctp32:
1012  case Intrinsic::arm_mve_vctp64:
1013  return true;
1014  default:
1015  break;
1016  }
1017  }
1018  return false;
1019 }
1020 
1021 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1022  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1023  return false;
1024 
1025  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1026  // Don't support v2i1 yet.
1027  if (VecTy->getNumElements() == 2)
1028  return false;
1029 
1030  // We don't support extending fp types.
1031  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1032  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1033  return false;
1034  }
1035 
1036  unsigned EltWidth = DataTy->getScalarSizeInBits();
1037  return (EltWidth == 32 && Alignment >= 4) ||
1038  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1039 }
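// For example, a masked load of <4 x i32> with align 4 or <8 x i16> with
// align 2 is accepted here, while a 2-element vector (the unsupported v2i1
// predicate case) or an under-aligned access is rejected and will be
// scalarized by the masked-intrinsic lowering instead.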
1040 
1041 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1042  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1043  return false;
1044 
1045  // This method is called in 2 places:
1046  // - from the vectorizer with a scalar type, in which case we need to get
1047  // this as good as we can with the limited info we have (and rely on the cost
1048  // model for the rest).
1049  // - from the masked intrinsic lowering pass with the actual vector type.
1050  // For MVE, we have a custom lowering pass that will already have custom
1051  // legalised any gathers that we can to MVE intrinsics, and want to expand all
1052  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1053  // are here, we know we want to expand.
1054  if (isa<VectorType>(Ty))
1055  return false;
1056 
1057  unsigned EltWidth = Ty->getScalarSizeInBits();
1058  return ((EltWidth == 32 && Alignment >= 4) ||
1059  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1060 }
1061 
1062 /// Given a memcpy/memset/memmove instruction, return the number of memory
1063 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1064 /// call is used.
1065 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1066  MemOp MOp;
1067  unsigned DstAddrSpace = ~0u;
1068  unsigned SrcAddrSpace = ~0u;
1069  const Function *F = I->getParent()->getParent();
1070 
1071  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1072  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1073  // If 'size' is not a constant, a library call will be generated.
1074  if (!C)
1075  return -1;
1076 
1077  const unsigned Size = C->getValue().getZExtValue();
1078  const Align DstAlign = *MC->getDestAlign();
1079  const Align SrcAlign = *MC->getSourceAlign();
1080 
1081  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1082  /*IsVolatile*/ false);
1083  DstAddrSpace = MC->getDestAddressSpace();
1084  SrcAddrSpace = MC->getSourceAddressSpace();
1085  }
1086  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1087  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1088  // If 'size' is not a constant, a library call will be generated.
1089  if (!C)
1090  return -1;
1091 
1092  const unsigned Size = C->getValue().getZExtValue();
1093  const Align DstAlign = *MS->getDestAlign();
1094 
1095  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1096  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1097  DstAddrSpace = MS->getDestAddressSpace();
1098  }
1099  else
1100  llvm_unreachable("Expected a memcpy/move or memset!");
1101 
1102  unsigned Limit, Factor = 2;
1103  switch(I->getIntrinsicID()) {
1104  case Intrinsic::memcpy:
1105  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1106  break;
1107  case Intrinsic::memmove:
1108  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1109  break;
1110  case Intrinsic::memset:
1111  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1112  Factor = 1;
1113  break;
1114  default:
1115  llvm_unreachable("Expected a memcpy/move or memset!");
1116  }
1117 
1118  // MemOps will be populated with a list of data types that need to be
1119  // loaded and stored. That's why we multiply the number of elements by 2 to
1120  // get the cost for this memcpy.
1121  std::vector<EVT> MemOps;
1122  if (getTLI()->findOptimalMemOpLowering(
1123  MemOps, Limit, MOp, DstAddrSpace,
1124  SrcAddrSpace, F->getAttributes()))
1125  return MemOps.size() * Factor;
1126 
1127  // If we can't find an optimal memop lowering, return the default cost
1128  return -1;
1129 }
1130 
1131 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1132  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1133 
1134  // To model the cost of a library call, we assume 1 for the call, and
1135  // 3 for the argument setup.
1136  if (NumOps == -1)
1137  return 4;
1138  return NumOps;
1139 }
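// For example, a memcpy whose length is not a constant gives NumOps == -1 and
// is costed as 4 (one call plus three instructions of argument setup), while a
// small constant-length memcpy that findOptimalMemOpLowering splits into two
// load/store pairs is costed as 4 memory operations (2 types * Factor of 2).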
1140 
1141 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1142  VectorType *Tp, ArrayRef<int> Mask,
1143  int Index, VectorType *SubTp) {
1144  Kind = improveShuffleKindFromMask(Kind, Mask);
1145  if (ST->hasNEON()) {
1146  if (Kind == TTI::SK_Broadcast) {
1147  static const CostTblEntry NEONDupTbl[] = {
1148  // VDUP handles these cases.
1155 
1160 
1161  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1162  if (const auto *Entry =
1163  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1164  return LT.first * Entry->Cost;
1165  }
1166  if (Kind == TTI::SK_Reverse) {
1167  static const CostTblEntry NEONShuffleTbl[] = {
1168  // Reverse shuffle cost one instruction if we are shuffling within a
1169  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1176 
1181 
1182  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1183  if (const auto *Entry =
1184  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1185  return LT.first * Entry->Cost;
1186  }
1187  if (Kind == TTI::SK_Select) {
1188  static const CostTblEntry NEONSelShuffleTbl[] = {
1189  // Select shuffle cost table for ARM. Cost is the number of
1190  // instructions
1191  // required to create the shuffled vector.
1192 
1197 
1201 
1203 
1205 
1206  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1207  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1208  ISD::VECTOR_SHUFFLE, LT.second))
1209  return LT.first * Entry->Cost;
1210  }
1211  }
1212  if (ST->hasMVEIntegerOps()) {
1213  if (Kind == TTI::SK_Broadcast) {
1214  static const CostTblEntry MVEDupTbl[] = {
1215  // VDUP handles these cases.
1221 
1222  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1223  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1224  LT.second))
1225  return LT.first * Entry->Cost *
1226  ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1227  }
1228 
1229  if (!Mask.empty()) {
1230  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1231  if (Mask.size() <= LT.second.getVectorNumElements() &&
1232  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1233  isVREVMask(Mask, LT.second, 64)))
1234  return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1235  }
1236  }
1237 
1238  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1239  ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1240  : 1;
1241  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1242 }
1243 
1244 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1245  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1246  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1247  TTI::OperandValueProperties Opd1PropInfo,
1248  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1249  const Instruction *CxtI) {
1250  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1251  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1252  // Make operations on i1 relatively expensive as this often involves
1253  // combining predicates. AND and XOR should be easier to handle with IT
1254  // blocks.
1255  switch (ISDOpcode) {
1256  default:
1257  break;
1258  case ISD::AND:
1259  case ISD::XOR:
1260  return 2;
1261  case ISD::OR:
1262  return 3;
1263  }
1264  }
1265 
1266  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1267 
1268  if (ST->hasNEON()) {
1269  const unsigned FunctionCallDivCost = 20;
1270  const unsigned ReciprocalDivCost = 10;
1271  static const CostTblEntry CostTbl[] = {
1272  // Division.
1273  // These costs are somewhat random. Choose a cost of 20 to indicate that
1274  // vectorizing division (added function call) is going to be very expensive.
1275  // Double registers types.
1276  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1277  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1278  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1279  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1280  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1281  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1282  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1283  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1284  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1285  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1286  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1287  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1288  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1289  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1290  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1291  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1292  // Quad register types.
1293  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1294  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1295  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1296  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1297  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1298  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1299  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1300  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1301  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1302  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1303  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1304  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1305  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1306  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1307  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1308  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1309  // Multiplication.
1310  };
1311 
1312  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1313  return LT.first * Entry->Cost;
1314 
1315  InstructionCost Cost = BaseT::getArithmeticInstrCost(
1316  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1317 
1318  // This is somewhat of a hack. The problem that we are facing is that SROA
1319  // creates a sequence of shift, and, or instructions to construct values.
1320  // These sequences are recognized by the ISel and have zero-cost. Not so for
1321  // the vectorized code. Because we have support for v2i64 but not i64 those
1322  // sequences look particularly beneficial to vectorize.
1323  // To work around this we increase the cost of v2i64 operations to make them
1324  // seem less beneficial.
1325  if (LT.second == MVT::v2i64 &&
1326  Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1327  Cost += 4;
1328 
1329  return Cost;
1330  }
1331 
1332  // If this operation is a shift on arm/thumb2, it might well be folded into
1333  // the following instruction, hence having a cost of 0.
1334  auto LooksLikeAFreeShift = [&]() {
1335  if (ST->isThumb1Only() || Ty->isVectorTy())
1336  return false;
1337 
1338  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1339  return false;
1340  if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1341  return false;
1342 
1343  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1344  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1345  case Instruction::Add:
1346  case Instruction::Sub:
1347  case Instruction::And:
1348  case Instruction::Xor:
1349  case Instruction::Or:
1350  case Instruction::ICmp:
1351  return true;
1352  default:
1353  return false;
1354  }
1355  };
1356  if (LooksLikeAFreeShift())
1357  return 0;
1358 
1359  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1360  // for "multiple beats" potentially needed by MVE instructions.
1361  int BaseCost = 1;
1362  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1363  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1364 
1365  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1366  // without treating floats as more expensive than scalars or increasing the
1367  // costs for custom operations. The result is also multiplied by the
1368  // MVEVectorCostFactor where appropriate.
1369  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1370  return LT.first * BaseCost;
1371 
1372  // Else this is expand, assume that we need to scalarize this op.
1373  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1374  unsigned Num = VTy->getNumElements();
1375  InstructionCost Cost =
1376  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1377  // Return the cost of multiple scalar invocations plus the cost of
1378  // inserting and extracting the values.
1379  SmallVector<Type *> Tys(Args.size(), Ty);
1380  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1381  }
1382 
1383  return BaseCost;
1384 }
1385 
1386 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1387  MaybeAlign Alignment,
1388  unsigned AddressSpace,
1389  TTI::TargetCostKind CostKind,
1390  const Instruction *I) {
1391  // TODO: Handle other cost kinds.
1392  if (CostKind != TTI::TCK_RecipThroughput)
1393  return 1;
1394 
1395  // Type legalization can't handle structs
1396  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1397  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1398  CostKind);
1399 
1400  if (ST->hasNEON() && Src->isVectorTy() &&
1401  (Alignment && *Alignment != Align(16)) &&
1402  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1403  // Unaligned loads/stores are extremely inefficient.
1404  // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1405  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1406  return LT.first * 4;
1407  }
1408 
1409  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1410  // Same for stores.
1411  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1412  ((Opcode == Instruction::Load && I->hasOneUse() &&
1413  isa<FPExtInst>(*I->user_begin())) ||
1414  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1415  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1416  Type *DstTy =
1417  Opcode == Instruction::Load
1418  ? (*I->user_begin())->getType()
1419  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1420  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1421  DstTy->getScalarType()->isFloatTy())
1422  return ST->getMVEVectorCostFactor(CostKind);
1423  }
1424 
1425  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1426  ? ST->getMVEVectorCostFactor(CostKind)
1427  : 1;
1428  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1429  CostKind, I);
1430 }
1431 
1432 InstructionCost
1433 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1434  unsigned AddressSpace,
1435  TTI::TargetCostKind CostKind) {
1436  if (ST->hasMVEIntegerOps()) {
1437  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1438  return ST->getMVEVectorCostFactor(CostKind);
1439  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1440  return ST->getMVEVectorCostFactor(CostKind);
1441  }
1442  if (!isa<FixedVectorType>(Src))
1443  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1444  CostKind);
1445  // Scalar cost, which is currently very high due to the inefficiency of the
1446  // generated code.
1447  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1448 }
1449 
1450 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1451  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1452  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1453  bool UseMaskForCond, bool UseMaskForGaps) {
1454  assert(Factor >= 2 && "Invalid interleave factor");
1455  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1456 
1457  // vldN/vstN don't support vector types with i64/f64 elements.
1458  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1459 
1460  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1461  !UseMaskForCond && !UseMaskForGaps) {
1462  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1463  auto *SubVecTy =
1464  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1465 
1466  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1467  // Accesses having vector types that are a multiple of 128 bits can be
1468  // matched to more than one vldN/vstN instruction.
1469  int BaseCost =
1470  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1471  if (NumElts % Factor == 0 &&
1472  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1473  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1474 
1475  // Some smaller than legal interleaved patterns are cheap as we can make
1476  // use of the vmovn or vrev patterns to interleave a standard load. This is
1477  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1478  // promoted differently). The cost of 2 here is then a load and vrev or
1479  // vmovn.
1480  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1481  VecTy->isIntOrIntVectorTy() &&
1482  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1483  return 2 * BaseCost;
1484  }
1485 
1486  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1487  Alignment, AddressSpace, CostKind,
1488  UseMaskForCond, UseMaskForGaps);
1489 }
1490 
1491 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1492  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1493  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1494  using namespace PatternMatch;
1495  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1496  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1497  Alignment, CostKind, I);
1498 
1499  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1500  auto *VTy = cast<FixedVectorType>(DataTy);
1501 
1502  // TODO: Splitting, once we do that.
1503 
1504  unsigned NumElems = VTy->getNumElements();
1505  unsigned EltSize = VTy->getScalarSizeInBits();
1506  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1507 
1508  // For now, it is assumed that for the MVE gather instructions the loads are
1509  // all effectively serialised. This means the cost is the scalar cost
1510  // multiplied by the number of elements being loaded. This is possibly very
1511  // conservative, but even so we still end up vectorising loops because the
1512  // cost per iteration for many loops is lower than for scalar loops.
1513  InstructionCost VectorCost =
1514  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1515  // The scalarization cost should be a lot higher. We use the number of vector
1516  // elements plus the scalarization overhead.
1517  InstructionCost ScalarCost =
1518  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1519  BaseT::getScalarizationOverhead(VTy, false, true);
1520 
1521  if (EltSize < 8 || Alignment < EltSize / 8)
1522  return ScalarCost;
1523 
1524  unsigned ExtSize = EltSize;
1525  // Check whether there's a single user that asks for an extended type
1526  if (I != nullptr) {
1527  // Depending on the caller of this function, a gather instruction will
1528  // either have opcode Instruction::Load or be a call to the masked_gather
1529  // intrinsic
1530  if ((I->getOpcode() == Instruction::Load ||
1531  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1532  I->hasOneUse()) {
1533  const User *Us = *I->users().begin();
1534  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1535  // only allow valid type combinations
1536  unsigned TypeSize =
1537  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1538  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1539  (TypeSize == 16 && EltSize == 8)) &&
1540  TypeSize * NumElems == 128) {
1541  ExtSize = TypeSize;
1542  }
1543  }
1544  }
1545  // Check whether the input data needs to be truncated
1546  TruncInst *T;
1547  if ((I->getOpcode() == Instruction::Store ||
1548  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1549  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1550  // Only allow valid type combinations
1551  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1552  if (((EltSize == 16 && TypeSize == 32) ||
1553  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1554  TypeSize * NumElems == 128)
1555  ExtSize = TypeSize;
1556  }
1557  }
1558 
1559  if (ExtSize * NumElems != 128 || NumElems < 4)
1560  return ScalarCost;
1561 
1562  // Any (aligned) i32 gather will not need to be scalarised.
1563  if (ExtSize == 32)
1564  return VectorCost;
1565  // For smaller types, we need to ensure that the gep's inputs are correctly
1566  // extended from a small enough value. Other sizes (including i64) are
1567  // scalarized for now.
1568  if (ExtSize != 8 && ExtSize != 16)
1569  return ScalarCost;
1570 
1571  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1572  Ptr = BC->getOperand(0);
1573  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1574  if (GEP->getNumOperands() != 2)
1575  return ScalarCost;
1576  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1577  // Scale needs to be correct (which is only relevant for i16s).
1578  if (Scale != 1 && Scale * 8 != ExtSize)
1579  return ScalarCost;
1580  // And we need to zext (not sext) the indexes from a small enough type.
1581  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1582  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1583  return VectorCost;
1584  }
1585  return ScalarCost;
1586  }
1587  return ScalarCost;
1588 }
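// For example, an aligned gather of <4 x i32> is costed as VectorCost above,
// while a <2 x i64> gather (which cannot form a 128-bit access with at least
// four elements) falls back to ScalarCost.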
1589 
1590 InstructionCost
1591 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1592  bool IsPairwiseForm,
1593  TTI::TargetCostKind CostKind) {
1594  EVT ValVT = TLI->getValueType(DL, ValTy);
1595  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1596  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1597  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1598  CostKind);
1599 
1600  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1601 
1602  static const CostTblEntry CostTblAdd[]{
1603  {ISD::ADD, MVT::v16i8, 1},
1604  {ISD::ADD, MVT::v8i16, 1},
1605  {ISD::ADD, MVT::v4i32, 1},
1606  };
1607  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1608  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1609 
1610  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1611  CostKind);
1612 }
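// For example, an add reduction of <4 x i32> hits the v4i32 entry above and is
// costed as a single instruction (a VADDV) scaled by the MVE cost factor;
// other opcodes and types fall back to the base implementation.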
1613 
1614 InstructionCost
1615 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1616  Type *ResTy, VectorType *ValTy,
1617  TTI::TargetCostKind CostKind) {
1618  EVT ValVT = TLI->getValueType(DL, ValTy);
1619  EVT ResVT = TLI->getValueType(DL, ResTy);
1620  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1621  std::pair<InstructionCost, MVT> LT =
1622  TLI->getTypeLegalizationCost(DL, ValTy);
1623  if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
1624  (LT.second == MVT::v8i16 &&
1625  ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
1626  (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
1627  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1628  }
1629 
1630  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1631  CostKind);
1632 }
1633 
1634 InstructionCost
1635 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1636  TTI::TargetCostKind CostKind) {
1637  switch (ICA.getID()) {
1638  case Intrinsic::get_active_lane_mask:
1639  // Currently we make a somewhat optimistic assumption that
1640  // active_lane_mask's are always free. In reality it may be freely folded
1641  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1642  // of add/icmp code. We may need to improve this in the future, but being
1643  // able to detect if it is free or not involves looking at a lot of other
1644  // code. We currently assume that the vectorizer inserted these, and knew
1645  // what it was doing in adding one.
1646  if (ST->hasMVEIntegerOps())
1647  return 0;
1648  break;
1649  case Intrinsic::sadd_sat:
1650  case Intrinsic::ssub_sat:
1651  case Intrinsic::uadd_sat:
1652  case Intrinsic::usub_sat: {
1653  if (!ST->hasMVEIntegerOps())
1654  break;
1655  Type *VT = ICA.getReturnType();
1656 
1657  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1658  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1659  LT.second == MVT::v16i8) {
1660  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1661  // need to extend the type, as it uses shr(qadd(shl, shl)).
1662  unsigned Instrs =
1663  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1664  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1665  }
1666  break;
1667  }
1668  case Intrinsic::abs:
1669  case Intrinsic::smin:
1670  case Intrinsic::smax:
1671  case Intrinsic::umin:
1672  case Intrinsic::umax: {
1673  if (!ST->hasMVEIntegerOps())
1674  break;
1675  Type *VT = ICA.getReturnType();
1676 
1677  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1678  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1679  LT.second == MVT::v16i8)
1680  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1681  break;
1682  }
1683  case Intrinsic::minnum:
1684  case Intrinsic::maxnum: {
1685  if (!ST->hasMVEFloatOps())
1686  break;
1687  Type *VT = ICA.getReturnType();
1688  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1689  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1690  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1691  break;
1692  }
1693  }
1694 
1695  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1696 }
1697 
1698 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1699  if (!F->isIntrinsic())
1700  return BaseT::isLoweredToCall(F);
1701 
1702  // Assume all Arm-specific intrinsics map to an instruction.
1703  if (F->getName().startswith("llvm.arm"))
1704  return false;
1705 
1706  switch (F->getIntrinsicID()) {
1707  default: break;
1708  case Intrinsic::powi:
1709  case Intrinsic::sin:
1710  case Intrinsic::cos:
1711  case Intrinsic::pow:
1712  case Intrinsic::log:
1713  case Intrinsic::log10:
1714  case Intrinsic::log2:
1715  case Intrinsic::exp:
1716  case Intrinsic::exp2:
1717  return true;
1718  case Intrinsic::sqrt:
1719  case Intrinsic::fabs:
1720  case Intrinsic::copysign:
1721  case Intrinsic::floor:
1722  case Intrinsic::ceil:
1723  case Intrinsic::trunc:
1724  case Intrinsic::rint:
1725  case Intrinsic::nearbyint:
1726  case Intrinsic::round:
1727  case Intrinsic::canonicalize:
1728  case Intrinsic::lround:
1729  case Intrinsic::llround:
1730  case Intrinsic::lrint:
1731  case Intrinsic::llrint:
1732  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1733  return true;
1734  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1735  return true;
1736  // Some operations can be handled by vector instructions and assume
1737  // unsupported vectors will be expanded into supported scalar ones.
1738  // TODO Handle scalar operations properly.
1739  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1740  case Intrinsic::masked_store:
1741  case Intrinsic::masked_load:
1742  case Intrinsic::masked_gather:
1743  case Intrinsic::masked_scatter:
1744  return !ST->hasMVEIntegerOps();
1745  case Intrinsic::sadd_with_overflow:
1746  case Intrinsic::uadd_with_overflow:
1747  case Intrinsic::ssub_with_overflow:
1748  case Intrinsic::usub_with_overflow:
1749  case Intrinsic::sadd_sat:
1750  case Intrinsic::uadd_sat:
1751  case Intrinsic::ssub_sat:
1752  case Intrinsic::usub_sat:
1753  return false;
1754  }
1755 
1756  return BaseT::isLoweredToCall(F);
1757 }
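// For example, @llvm.sqrt.f64 is treated as lowered to a call when the target
// has no FP64 support, while the llvm.arm.* intrinsics are always assumed to
// map directly to instructions.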
1758 
1759 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1760  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1761  EVT VT = TLI->getValueType(DL, I.getType(), true);
1762  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1763  return true;
1764 
1765  // Check if an intrinsic will be lowered to a call and assume that any
1766  // other CallInst will generate a bl.
1767  if (auto *Call = dyn_cast<CallInst>(&I)) {
1768  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1769  switch(II->getIntrinsicID()) {
1770  case Intrinsic::memcpy:
1771  case Intrinsic::memset:
1772  case Intrinsic::memmove:
1773  return getNumMemOps(II) == -1;
1774  default:
1775  if (const Function *F = Call->getCalledFunction())
1776  return isLoweredToCall(F);
1777  }
1778  }
1779  return true;
1780  }
1781 
1782  // FPv5 provides conversions between integer, double-precision,
1783  // single-precision, and half-precision formats.
1784  switch (I.getOpcode()) {
1785  default:
1786  break;
1787  case Instruction::FPToSI:
1788  case Instruction::FPToUI:
1789  case Instruction::SIToFP:
1790  case Instruction::UIToFP:
1791  case Instruction::FPTrunc:
1792  case Instruction::FPExt:
1793  return !ST->hasFPARMv8Base();
1794  }
1795 
1796  // FIXME: Unfortunately the approach of checking the Operation Action does
1797  // not catch all cases of Legalization that use library calls. Our
1798  // Legalization step categorizes some transformations into library calls as
1799  // Custom, Expand or even Legal when doing type legalization. So for now
1800  // we have to special-case, for instance, the SDIV of 64-bit integers and
1801  // the use of floating-point emulation.
1802  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1803  switch (ISD) {
1804  default:
1805  break;
1806  case ISD::SDIV:
1807  case ISD::UDIV:
1808  case ISD::SREM:
1809  case ISD::UREM:
1810  case ISD::SDIVREM:
1811  case ISD::UDIVREM:
1812  return true;
1813  }
1814  }
1815 
1816  // Assume all other non-float operations are supported.
1817  if (!VT.isFloatingPoint())
1818  return false;
1819 
1820  // We'll need a library call to handle most floats when using soft float.
1821  if (TLI->useSoftFloat()) {
1822  switch (I.getOpcode()) {
1823  default:
1824  return true;
1825  case Instruction::Alloca:
1826  case Instruction::Load:
1827  case Instruction::Store:
1828  case Instruction::Select:
1829  case Instruction::PHI:
1830  return false;
1831  }
1832  }
1833 
1834  // We'll need a libcall to perform double precision operations on a single
1835  // precision only FPU.
1836  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1837  return true;
1838 
1839  // Likewise for half precision arithmetic.
1840  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1841  return true;
1842 
1843  return false;
1844 }
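// Illustrative cases for the checks above: a 64-bit sdiv/udiv has no ARM
// divide instruction of that width and is expected to become a runtime call
// (e.g. an AEABI helper such as __aeabi_ldivmod), and a double-precision
// operation on an FPU without FP64, or under soft-float, likewise needs
// library support; both are treated as "maybe lowered to call", which blocks
// low-overhead loop formation around them.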
1845 
1847  AssumptionCache &AC,
1848  TargetLibraryInfo *LibInfo,
1849  HardwareLoopInfo &HWLoopInfo) {
1850  // Low-overhead branches are only supported in the 'low-overhead branch'
1851  // extension of v8.1-m.
1852  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1853  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1854  return false;
1855  }
1856 
1857  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1858  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1859  return false;
1860  }
1861 
1862  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1863  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1864  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1865  return false;
1866  }
1867 
1868  const SCEV *TripCountSCEV =
1869  SE.getAddExpr(BackedgeTakenCount,
1870  SE.getOne(BackedgeTakenCount->getType()));
1871 
1872  // We need to store the trip count in LR, a 32-bit register.
1873  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1874  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1875  return false;
1876  }
1877 
1878  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1879  // point in generating a hardware loop if that's going to happen.
1880 
1881  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1882  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1883  switch (Call->getIntrinsicID()) {
1884  default:
1885  break;
1886  case Intrinsic::start_loop_iterations:
1887  case Intrinsic::test_start_loop_iterations:
1888  case Intrinsic::loop_decrement:
1889  case Intrinsic::loop_decrement_reg:
1890  return true;
1891  }
1892  }
1893  return false;
1894  };
1895 
1896  // Scan the instructions to see if there are any that we know will turn into
1897  // a call, or if this loop is already a low-overhead loop or will become a
1898  // tail-predicated loop.
1899  bool IsTailPredLoop = false;
1900  auto ScanLoop = [&](Loop *L) {
1901  for (auto *BB : L->getBlocks()) {
1902  for (auto &I : *BB) {
1903  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1904  isa<InlineAsm>(I)) {
1905  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1906  return false;
1907  }
1908  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1909  IsTailPredLoop |=
1910  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1911  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1912  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1913  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1914  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1915  }
1916  }
1917  return true;
1918  };
1919 
1920  // Visit inner loops.
1921  for (auto Inner : *L)
1922  if (!ScanLoop(Inner))
1923  return false;
1924 
1925  if (!ScanLoop(L))
1926  return false;
1927 
1928  // TODO: Check whether the trip count calculation is expensive. If L is the
1929  // inner loop but we know it has a low trip count, calculating that trip
1930  // count (in the parent loop) may be detrimental.
1931 
1932  LLVMContext &C = L->getHeader()->getContext();
1933  HWLoopInfo.CounterInReg = true;
1934  HWLoopInfo.IsNestingLegal = false;
1935  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1936  HWLoopInfo.CountType = Type::getInt32Ty(C);
1937  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1938  return true;
1939 }
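// Sketch of the resulting decision: for a counted loop whose trip count
// (BackedgeTakenCount + 1) provably fits in the 32-bit LR register and whose
// body contains no calls, inline asm or existing loop intrinsics, the loop is
// reported as profitable for the v8.1-M low-overhead loop instructions
// (WLS/DLS ... LE), with the entry test enabled unless the loop is going to
// be tail-predicated.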
1940 
1941 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1942  // We don't allow extra icmps; because we only look at single-block loops,
1943  // we simply count the icmps, i.e. there should only be 1 for the backedge.
1944  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1945  return false;
1946 
1947  if (isa<FCmpInst>(&I))
1948  return false;
1949 
1950  // We could allow extending/narrowing FP loads/stores, but codegen is
1951  // too inefficient so reject this for now.
1952  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1953  return false;
1954 
1955  // Extends have to be extending-loads
1956  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1957  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1958  return false;
1959 
1960  // Truncs have to be narrowing-stores
1961  if (isa<TruncInst>(&I) )
1962  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1963  return false;
1964 
1965  return true;
1966 }
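// Example of the extend/trunc rules above: a zext/sext is only allowed when
// it extends a load with no other users (so it can fold into an MVE widening
// load such as VLDRB.U32), and a trunc only when its single use is a store
// (folding into a narrowing store such as VSTRB.32); a free-standing extend
// or truncate would change the number of elements per vector and defeat the
// per-lane predication.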
1967 
1968 // To set up a tail-predicated loop, we need to know the total number of
1969 // elements processed by that loop. Thus, we need to determine the element
1970 // size and:
1971 // 1) it should be uniform for all operations in the vector loop, so we
1972 // e.g. don't want any widening/narrowing operations.
1973 // 2) it should be smaller than i64s because we don't have vector operations
1974 // that work on i64s.
1975 // 3) we don't want elements to be reversed or shuffled, to make sure the
1976 // tail-predication masks/predicates the right lanes.
1977 //
1978 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1979  const DataLayout &DL,
1980  const LoopAccessInfo *LAI) {
1981  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
1982 
1983  // If there are live-out values, it is probably a reduction. We can predicate
1984  // most reduction operations freely under MVE using a combination of
1985  // prefer-predicated-reduction-select and inloop reductions. We limit this to
1986  // floating point and integer reductions, but don't check for operators
1987  // specifically here. If the value ends up not being a reduction (and so the
1988  // vectorizer cannot tailfold the loop), we should fall back to standard
1989  // vectorization automatically.
1990  SmallVector<Instruction *, 8>
1991  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
1992  bool ReductionsDisabled =
1993  EnableTailPredication == TailPredication::EnabledNoReductions ||
1994  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
1995 
1996  for (auto *I : LiveOuts) {
1997  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
1998  !I->getType()->isHalfTy()) {
1999  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2000  "live-out value\n");
2001  return false;
2002  }
2003  if (ReductionsDisabled) {
2004  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2005  return false;
2006  }
2007  }
2008 
2009  // Next, check that all instructions can be tail-predicated.
2010  PredicatedScalarEvolution PSE = LAI->getPSE();
2011  SmallVector<Instruction *, 16> LoadStores;
2012  int ICmpCount = 0;
2013 
2014  for (BasicBlock *BB : L->blocks()) {
2015  for (Instruction &I : BB->instructionsWithoutDebug()) {
2016  if (isa<PHINode>(&I))
2017  continue;
2018  if (!canTailPredicateInstruction(I, ICmpCount)) {
2019  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2020  return false;
2021  }
2022 
2023  Type *T = I.getType();
2024  if (T->isPointerTy())
2025  T = T->getPointerElementType();
2026 
2027  if (T->getScalarSizeInBits() > 32) {
2028  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2029  return false;
2030  }
2031  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2032  Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2033  int64_t NextStride = getPtrStride(PSE, Ptr, L);
2034  if (NextStride == 1) {
2035  // TODO: for now only allow consecutive strides of 1. We could support
2036  // other strides as long as it is uniform, but let's keep it simple
2037  // for now.
2038  continue;
2039  } else if (NextStride == -1 ||
2040  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2041  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2042  LLVM_DEBUG(dbgs()
2043  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2044  "be tail-predicated.\n");
2045  return false;
2046  // TODO: don't tail predicate if there is a reversed load?
2047  } else if (EnableMaskedGatherScatters) {
2048  // Gather/scatters do allow loading from arbitrary strides, at
2049  // least if they are loop invariant.
2050  // TODO: Loop variant strides should in theory work, too, but
2051  // this requires further testing.
2052  const SCEV *PtrScev =
2053  replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2054  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2055  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2056  if (PSE.getSE()->isLoopInvariant(Step, L))
2057  continue;
2058  }
2059  }
2060  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2061  "tail-predicate.\n");
2062  return false;
2063  }
2064  }
2065  }
2066 
2067  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2068  return true;
2069 }
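// Summary of the stride handling above (illustrative): a unit-stride access
// such as a[i] can be tail-predicated; reversed accesses and strides of 2 or
// 4, which would map to interleaving loads/stores (VLD2/VLD4 style), are
// rejected; an arbitrary but loop-invariant stride may still be allowed as a
// masked gather/scatter when EnableMaskedGatherScatters is set.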
2070 
2071 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2072  ScalarEvolution &SE,
2073  AssumptionCache &AC,
2074  TargetLibraryInfo *TLI,
2075  DominatorTree *DT,
2076  const LoopAccessInfo *LAI) {
2077  if (!EnableTailPredication) {
2078  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2079  return false;
2080  }
2081 
2082  // Creating a predicated vector loop is the first step for generating a
2083  // tail-predicated hardware loop, for which we need the MVE masked
2084  // load/store instructions:
2085  if (!ST->hasMVEIntegerOps())
2086  return false;
2087 
2088  // For now, restrict this to single block loops.
2089  if (L->getNumBlocks() > 1) {
2090  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2091  "loop.\n");
2092  return false;
2093  }
2094 
2095  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2096 
2097  HardwareLoopInfo HWLoopInfo(L);
2098  if (!HWLoopInfo.canAnalyze(*LI)) {
2099  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2100  "analyzable.\n");
2101  return false;
2102  }
2103 
2104  // This checks if we have the low-overhead branch architecture
2105  // extension, and if we will create a hardware-loop:
2106  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2107  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2108  "profitable.\n");
2109  return false;
2110  }
2111 
2112  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2113  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2114  "a candidate.\n");
2115  return false;
2116  }
2117 
2118  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2119 }
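// Decision summary (sketch): tail-folding is only preferred over a scalar
// epilogue when MVE is present, tail-predication is enabled, the loop is a
// single innermost block, and it already qualifies as a profitable
// hardware-loop candidate; otherwise the vectorizer keeps the usual epilogue.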
2120 
2121 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2122  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2123  return false;
2124 
2125  // Intrinsic @llvm.get.active.lane.mask is supported.
2126  // It is used in the MVETailPredication pass, which requires the number of
2127  // elements processed by this vector loop to set up the tail-predicated
2128  // loop.
2129  return true;
2130 }
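// Context for the above (a sketch of the downstream flow): returning true
// lets the vectorizer emit the llvm.get.active.lane.mask intrinsic directly;
// the MVETailPredication pass later turns that mask, together with the
// element count, into VCTP-based tail-predication.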
2131 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2132  TTI::UnrollingPreferences &UP) {
2133  // Enable Upper bound unrolling universally, not dependent upon the
2134  // conditions below.
2135  UP.UpperBound = true;
2136 
2137  // Only currently enable these preferences for M-Class cores.
2138  if (!ST->isMClass())
2139  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
2140 
2141  // Disable loop unrolling for Oz and Os.
2142  UP.OptSizeThreshold = 0;
2143  UP.PartialOptSizeThreshold = 0;
2144  if (L->getHeader()->getParent()->hasOptSize())
2145  return;
2146 
2147  SmallVector<BasicBlock*, 4> ExitingBlocks;
2148  L->getExitingBlocks(ExitingBlocks);
2149  LLVM_DEBUG(dbgs() << "Loop has:\n"
2150  << "Blocks: " << L->getNumBlocks() << "\n"
2151  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2152 
2153  // Only allow one exit other than the latch. This acts as an early exit
2154  // as it mirrors the profitability calculation of the runtime unroller.
2155  if (ExitingBlocks.size() > 2)
2156  return;
2157 
2158  // Limit the CFG of the loop body for targets with a branch predictor.
2159  // Allowing 4 blocks permits if-then-else diamonds in the body.
2160  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2161  return;
2162 
2163  // Don't unroll vectorized loops, including the remainder loop
2164  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2165  return;
2166 
2167  // Scan the loop: don't unroll loops with calls as this could prevent
2168  // inlining.
2169  InstructionCost Cost = 0;
2170  for (auto *BB : L->getBlocks()) {
2171  for (auto &I : *BB) {
2172  // Don't unroll vectorised loops. MVE does not benefit from it as much as
2173  // scalar code.
2174  if (I.getType()->isVectorTy())
2175  return;
2176 
2177  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2178  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2179  if (!isLoweredToCall(F))
2180  continue;
2181  }
2182  return;
2183  }
2184 
2185  SmallVector<const Value*, 4> Operands(I.operand_values());
2186  Cost +=
2187  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2188  }
2189  }
2190 
2191  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2192 
2193  UP.Partial = true;
2194  UP.Runtime = true;
2195  UP.UnrollRemainder = true;
2196  UP.DefaultUnrollRuntimeCount = 4;
2197  UP.UnrollAndJam = true;
2198  UP.UnrollAndJamInnerLoopThreshold = 60;
2199 
2200  // Force unrolling of small loops can be very useful because of the
2201  // branch-taken cost of the backedge.
2202  if (Cost < 12)
2203  UP.Force = true;
2204 }
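// Worked example of the cost gate above: an M-class loop whose body sums to
// a size-and-latency cost below 12 (roughly a handful of simple
// instructions) gets UP.Force set, so it is unrolled even with a runtime
// trip count, using the runtime count configured above, to amortise the
// backedge branch.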
2205 
2206 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2207  TTI::PeelingPreferences &PP) {
2208  BaseT::getPeelingPreferences(L, SE, PP);
2209 }
2210 
2211 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2212  TTI::ReductionFlags Flags) const {
2213  if (!ST->hasMVEIntegerOps())
2214  return false;
2215 
2216  unsigned ScalarBits = Ty->getScalarSizeInBits();
2217  switch (Opcode) {
2218  case Instruction::Add:
2219  return ScalarBits <= 64;
2220  default:
2221  return false;
2222  }
2223 }
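// Note on the check above: integer add reductions with element sizes up to
// 64 bits are preferred in-loop because MVE has accumulating reductions
// (e.g. VADDV/VADDLV); other opcodes fall back to the generic out-of-loop
// reduction lowering.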
2224 
2225 bool ARMTTIImpl::preferPredicatedReductionSelect(
2226  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2227  if (!ST->hasMVEIntegerOps())
2228  return false;
2229  return true;
2230 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1615
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:452
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12384
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:853
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:480
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:661
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:557
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:127
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:263
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
Definition: AllocatorList.h:23
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1340
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:625
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:102
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:623
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:369
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:236
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:681
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:920
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:655
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1017
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:635
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1141
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:687
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2166
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:131
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1167
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:56
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:652
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:739
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1034
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:499
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:492
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:775
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::CostTblEntry
Cost Table Entry.
Definition: CostTable.h:24
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:611
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:111
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1491
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:100
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:382
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:476
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:132
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1581
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:801
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:662
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:624
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1836
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
llvm::CallBase::getNumArgOperands
unsigned getNumArgOperands() const
Definition: InstrTypes.h:1339
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6106
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:657
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:160
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:527
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1047
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:197
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2185
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1198
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:851
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1021
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1330
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1846
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:104
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
MachineValueType.h
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:570
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:660
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:437
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1285
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:40
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:369
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:128
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:364
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:607
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:669
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:719
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1070
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:327
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:850
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1084
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1576
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:1966
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:223
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:488
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:714
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:301
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
Definition: LoopAccessAnalysis.cpp:1017
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:725
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:487
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:286
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:65
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:249
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2220
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1562
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:64
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:494
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:139
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:108
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:1978
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1698
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:729
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::TypeConversionCostTblEntry
Type Conversion Cost Table.
Definition: CostTable.h:44
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:898
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:106
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:608
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:622
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2121
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:687
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:109
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:303
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1022
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:856
llvm::None
const NoneType None
Definition: None.h:23
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1450
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:925
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:96
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:75
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:659
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
Determine if all bits are set.
Definition: APInt.h:401
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:282
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1309
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1065
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1004
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: ARMTargetTransformInfo.cpp:2131
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2071
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:117
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1135
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:593
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:852
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:74
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:111
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1386
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1615
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:77
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:333
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:811
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:886
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1131
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:367
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:88
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:110
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:370
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:776
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:982
llvm::CostTableLookup
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:31
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:150
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4721
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:2006
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:903
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:814
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:54
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::DenseMap< const Value *, Value * >
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:423
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:908
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1635
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:519
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:147
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:41
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:39
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:701
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:665
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1586
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:116
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1433
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:731
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2211
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:818
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1138
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:900
isSSATMinMaxPattern
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:296
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2175
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1080
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:149
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:653
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:95
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1041
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:660
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:237
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:145
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:965
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:836
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:459
trunc
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g trunc
Definition: README-FPStack.txt:63
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Unsigned maximum.
Definition: ValueTracking.h:663
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1756
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:633
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:97
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:548
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point minnum.
Definition: ValueTracking.h:664
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:1941
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:112
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:624
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:716
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12652
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:1955
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:416
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:100
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:144
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:486
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2190
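A small illustrative example (values chosen arbitrarily):
#include "llvm/ADT/APInt.h"
using namespace llvm;
// Unsigned comparison of two 32-bit values; 250 wins over 7.
static uint64_t demoUMax() {
  APInt A(32, 250), B(32, 7);
  return APIntOps::umax(A, B).getZExtValue(); // 250
}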
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1298
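A small illustrative example (values chosen arbitrarily):
#include "llvm/ADT/APFloat.h"
using namespace llvm;
// IEEE minNum: returns -2.0; a NaN operand would yield the other value.
static APFloat demoMinNum() {
  return minnum(APFloat(1.5f), APFloat(-2.0f));
}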
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:129
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:288
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:490
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::replaceSymbolicStrideSCEV
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
Definition: LoopAccessAnalysis.cpp:143
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:87
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:251
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:417
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1122
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:146
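A minimal sketch; the helper name is illustrative, and Value::hasOneUse() covers the common N == 1 case:
#include "llvm/IR/Value.h"
using namespace llvm;
// Exact use-count query on an arbitrary value.
static bool hasExactlyTwoUses(const Value *V) {
  return V->hasNUses(2);
}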
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:43
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:219
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:235
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:842
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
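A hedged sketch of the usual call pattern; the bit width, demanded mask, and helper name are illustrative:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;
// Ask the combiner whether operand 0 of I can be simplified when only its
// low 16 bits are demanded; return I itself if anything changed.
static Instruction *demandLow16(InstCombiner &IC, Instruction &I) {
  KnownBits Known(32);
  APInt Demanded = APInt::getLowBitsSet(32, 16);
  return IC.SimplifyDemandedBits(&I, /*OpNo=*/0, Demanded, Known) ? &I : nullptr;
}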
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:99
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:730
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Definition: TargetLowering.h:1142
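A minimal sketch (the opcode/type pair is an arbitrary example, not tied to this file):
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
// True if the target can handle ISD::MUL on v4i32 without full expansion.
static bool mulV4i32IsSupported(const TargetLoweringBase &TLI) {
  return TLI.isOperationLegalOrCustomOrPromote(ISD::MUL, MVT::v4i32);
}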
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:94
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:829
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:823
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1129
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1759
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:396
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:647
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
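A minimal illustrative matcher; isSquareOf is a made-up helper name:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// True if V is a multiply whose operands are both exactly the value X.
static bool isSquareOf(Value *V, Value *X) {
  return match(V, m_Mul(m_Specific(X), m_Specific(X)));
}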
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:238
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:833
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1287
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:183
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:660
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopUtils.cpp:296
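A sketch assuming the declaration lives in LoopUtils.h (its definition is in LoopUtils.cpp); the metadata name is a standard loop attribute:
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
// Check a boolean loop-metadata flag on L.
static bool unrollIsDisabled(const Loop *L) {
  return getBooleanLoopAttribute(L, "llvm.loop.unroll.disable");
}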
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Definition: ARMISelLowering.cpp:19686
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1605
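A small illustrative example:
#include "llvm/ADT/APInt.h"
using namespace llvm;
// 5 is 0b101, so only the low three bits are "active".
static unsigned activeBitsOfFive() {
  return APInt(32, 5).getActiveBits(); // 3
}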
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2206
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1078
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value with V broadcast into each of the NumElts lanes.
Definition: IRBuilder.cpp:1076
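A minimal sketch; the lane count, helper name, and "splat" label are illustrative:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Broadcast one scalar into all four lanes of a <4 x ty> vector.
static Value *splatToFourLanes(IRBuilder<> &Builder, Value *Scalar) {
  return Builder.CreateVectorSplat(4, Scalar, "splat");
}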
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
TM
LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2395
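A minimal sketch of the multi-operand overload; scevSum is an illustrative name:
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;
// Build the canonical SCEV expression for A + B.
static const SCEV *scevSum(ScalarEvolution &SE, const SCEV *A, const SCEV *B) {
  SmallVector<const SCEV *, 2> Ops = {A, B};
  return SE.getAddExpr(Ops);
}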
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:667
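A small illustrative example:
#include "llvm/ADT/APInt.h"
using namespace llvm;
// A 32-bit mask covering the low byte: 0x000000FF.
static APInt lowByteMask() {
  return APInt::getLowBitsSet(32, 8);
}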
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:445
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2225
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:248
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:800
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:812
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1396
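A minimal sketch (the comparison against MVT::i32 is an arbitrary example):
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;
// Map an IR type to its EVT and test it against a simple MVT.
static bool loweredAsI32(const TargetLoweringBase &TLI, const DataLayout &DL, Type *Ty) {
  return TLI.getValueType(DL, Ty) == MVT::i32;
}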
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:591
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:716
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:76
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1244
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7148
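A minimal sketch; the helper name is illustrative:
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;
// True when SCEV can compute a backedge-taken count for L.
static bool tripCountIsAnalyzable(ScalarEvolution &SE, const Loop *L) {
  return !isa<SCEVCouldNotCompute>(SE.getBackedgeTakenCount(L));
}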
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:149
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2198
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2550
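A minimal sketch; the operand names, result name "sum", and insertion point are illustrative:
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Create "%sum = add <ty> LHS, RHS" immediately before InsertBefore.
static BinaryOperator *emitAdd(Value *LHS, Value *RHS, Instruction *InsertBefore) {
  return BinaryOperator::Create(Instruction::Add, LHS, RHS, "sum", InsertBefore);
}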
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:113
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1284
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2136
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:101
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:159
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:804
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:134
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:636
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:634
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2180
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:649
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:199
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:809
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1591
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:122
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:281
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:63
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:19691
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:498
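A minimal sketch assuming a fixed-size (non-scalable) type; the helper name is illustrative:
#include "llvm/IR/DataLayout.h"
using namespace llvm;
// Allocation size of Ty in bytes, including alignment padding.
static uint64_t allocSizeInBytes(const DataLayout &DL, Type *Ty) {
  return DL.getTypeAllocSize(Ty).getFixedSize();
}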
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46