LLVM  8.0.0svn
BasicTTIImpl.h
Go to the documentation of this file.
1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// This file provides a helper that implements much of the TTI interface in
12 /// terms of the target-independent code generator and TargetLowering
13 /// interfaces.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
18 #define LLVM_CODEGEN_BASICTTIIMPL_H
19 
20 #include "llvm/ADT/APInt.h"
21 #include "llvm/ADT/ArrayRef.h"
22 #include "llvm/ADT/BitVector.h"
23 #include "llvm/ADT/SmallPtrSet.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/CallSite.h"
34 #include "llvm/IR/Constant.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/MC/MCSchedule.h"
46 #include "llvm/Support/Casting.h"
51 #include <algorithm>
52 #include <cassert>
53 #include <cstdint>
54 #include <limits>
55 #include <utility>
56 
57 namespace llvm {
58 
59 class Function;
60 class GlobalValue;
61 class LLVMContext;
62 class ScalarEvolution;
63 class SCEV;
64 class TargetMachine;
65 
66 extern cl::opt<unsigned> PartialUnrollingThreshold;
67 
68 /// Base class which can be used to help build a TTI implementation.
69 ///
70 /// This class provides as much implementation of the TTI interface as is
71 /// possible using the target independent parts of the code generator.
72 ///
73 /// In order to subclass it, your class must implement a getST() method to
74 /// return the subtarget, and a getTLI() method to return the target lowering.
75 /// We need these methods implemented in the derived class so that this class
76 /// doesn't have to duplicate storage for them.
77 template <typename T>
79 private:
81  using TTI = TargetTransformInfo;
82 
83  /// Estimate a cost of Broadcast as an extract and sequence of insert
84  /// operations.
85  unsigned getBroadcastShuffleOverhead(Type *Ty) {
86  assert(Ty->isVectorTy() && "Can only shuffle vectors");
87  unsigned Cost = 0;
88  // Broadcast cost is equal to the cost of extracting the zero'th element
89  // plus the cost of inserting it into every element of the result vector.
90  Cost += static_cast<T *>(this)->getVectorInstrCost(
91  Instruction::ExtractElement, Ty, 0);
92 
93  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
94  Cost += static_cast<T *>(this)->getVectorInstrCost(
95  Instruction::InsertElement, Ty, i);
96  }
97  return Cost;
98  }
99 
100  /// Estimate a cost of shuffle as a sequence of extract and insert
101  /// operations.
102  unsigned getPermuteShuffleOverhead(Type *Ty) {
103  assert(Ty->isVectorTy() && "Can only shuffle vectors");
104  unsigned Cost = 0;
105  // Shuffle cost is equal to the cost of extracting element from its argument
106  // plus the cost of inserting them onto the result vector.
107 
108  // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
109  // index 0 of first vector, index 1 of second vector,index 2 of first
110  // vector and finally index 3 of second vector and insert them at index
111  // <0,1,2,3> of result vector.
112  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
113  Cost += static_cast<T *>(this)
114  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
115  Cost += static_cast<T *>(this)
116  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
117  }
118  return Cost;
119  }
120 
121  /// Estimate a cost of subvector extraction as a sequence of extract and
122  /// insert operations.
123  unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
124  assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
125  "Can only extract subvectors from vectors");
126  int NumSubElts = SubTy->getVectorNumElements();
127  assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
128  "SK_ExtractSubvector index out of range");
129 
130  unsigned Cost = 0;
131  // Subvector extraction cost is equal to the cost of extracting element from
132  // the source type plus the cost of inserting them into the result vector
133  // type.
134  for (int i = 0; i != NumSubElts; ++i) {
135  Cost += static_cast<T *>(this)->getVectorInstrCost(
136  Instruction::ExtractElement, Ty, i + Index);
137  Cost += static_cast<T *>(this)->getVectorInstrCost(
138  Instruction::InsertElement, SubTy, i);
139  }
140  return Cost;
141  }
142 
143  /// Estimate a cost of subvector insertion as a sequence of extract and
144  /// insert operations.
145  unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
146  assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
147  "Can only insert subvectors into vectors");
148  int NumSubElts = SubTy->getVectorNumElements();
149  assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
150  "SK_InsertSubvector index out of range");
151 
152  unsigned Cost = 0;
153  // Subvector insertion cost is equal to the cost of extracting element from
154  // the source type plus the cost of inserting them into the result vector
155  // type.
156  for (int i = 0; i != NumSubElts; ++i) {
157  Cost += static_cast<T *>(this)->getVectorInstrCost(
158  Instruction::ExtractElement, SubTy, i);
159  Cost += static_cast<T *>(this)->getVectorInstrCost(
160  Instruction::InsertElement, Ty, i + Index);
161  }
162  return Cost;
163  }
164 
165  /// Local query method delegates up to T which *must* implement this!
166  const TargetSubtargetInfo *getST() const {
167  return static_cast<const T *>(this)->getST();
168  }
169 
170  /// Local query method delegates up to T which *must* implement this!
171  const TargetLoweringBase *getTLI() const {
172  return static_cast<const T *>(this)->getTLI();
173  }
174 
175  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
176  switch (M) {
177  case TTI::MIM_Unindexed:
178  return ISD::UNINDEXED;
179  case TTI::MIM_PreInc:
180  return ISD::PRE_INC;
181  case TTI::MIM_PreDec:
182  return ISD::PRE_DEC;
183  case TTI::MIM_PostInc:
184  return ISD::POST_INC;
185  case TTI::MIM_PostDec:
186  return ISD::POST_DEC;
187  }
188  llvm_unreachable("Unexpected MemIndexedMode");
189  }
190 
191 protected:
192  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
193  : BaseT(DL) {}
194 
196 
197 public:
198  /// \name Scalar TTI Implementations
199  /// @{
201  unsigned BitWidth, unsigned AddressSpace,
202  unsigned Alignment, bool *Fast) const {
203  EVT E = EVT::getIntegerVT(Context, BitWidth);
204  return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast);
205  }
206 
/// Conservative default: this generic implementation reports no branch
/// divergence; divergence-aware targets override this.
bool hasBranchDivergence() { return false; }
208 
209  bool isSourceOfDivergence(const Value *V) { return false; }
210 
211  bool isAlwaysUniform(const Value *V) { return false; }
212 
/// Default: no flat address space; report an invalid address-space id
/// (-1 wrapped to unsigned).
unsigned getFlatAddressSpace() {
  return -1;
}
217 
218  bool isLegalAddImmediate(int64_t imm) {
219  return getTLI()->isLegalAddImmediate(imm);
220  }
221 
222  bool isLegalICmpImmediate(int64_t imm) {
223  return getTLI()->isLegalICmpImmediate(imm);
224  }
225 
226  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
227  bool HasBaseReg, int64_t Scale,
228  unsigned AddrSpace, Instruction *I = nullptr) {
230  AM.BaseGV = BaseGV;
231  AM.BaseOffs = BaseOffset;
232  AM.HasBaseReg = HasBaseReg;
233  AM.Scale = Scale;
234  return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
235  }
236 
238  const DataLayout &DL) const {
239  EVT VT = getTLI()->getValueType(DL, Ty);
240  return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
241  }
242 
244  const DataLayout &DL) const {
245  EVT VT = getTLI()->getValueType(DL, Ty);
246  return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
247  }
248 
251  }
252 
253  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
254  bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
256  AM.BaseGV = BaseGV;
257  AM.BaseOffs = BaseOffset;
258  AM.HasBaseReg = HasBaseReg;
259  AM.Scale = Scale;
260  return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
261  }
262 
263  bool isTruncateFree(Type *Ty1, Type *Ty2) {
264  return getTLI()->isTruncateFree(Ty1, Ty2);
265  }
266 
268  return getTLI()->isProfitableToHoist(I);
269  }
270 
271  bool useAA() const { return getST()->useAA(); }
272 
273  bool isTypeLegal(Type *Ty) {
274  EVT VT = getTLI()->getValueType(DL, Ty);
275  return getTLI()->isTypeLegal(VT);
276  }
277 
278  int getGEPCost(Type *PointeeType, const Value *Ptr,
279  ArrayRef<const Value *> Operands) {
280  return BaseT::getGEPCost(PointeeType, Ptr, Operands);
281  }
282 
283  int getExtCost(const Instruction *I, const Value *Src) {
284  if (getTLI()->isExtFree(I))
286 
287  if (isa<ZExtInst>(I) || isa<SExtInst>(I))
288  if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
289  if (getTLI()->isExtLoad(LI, I, DL))
291 
293  }
294 
295  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
297  return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
298  }
299 
300  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
301  ArrayRef<Type *> ParamTys) {
302  if (IID == Intrinsic::cttz) {
303  if (getTLI()->isCheapToSpeculateCttz())
306  }
307 
308  if (IID == Intrinsic::ctlz) {
309  if (getTLI()->isCheapToSpeculateCtlz())
312  }
313 
314  return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
315  }
316 
318  unsigned &JumpTableSize) {
319  /// Try to find the estimated number of clusters. Note that the number of
320  /// clusters identified in this function could be different from the actual
321  /// numbers found in lowering. This function ignores switches that are
322  /// lowered with a mix of jump table / bit test / BTree. This function was
323  /// initially intended to be used when estimating the cost of switch in
324  /// inline cost heuristic, but it's a generic cost model to be used in other
325  /// places (e.g., in loop unrolling).
326  unsigned N = SI.getNumCases();
327  const TargetLoweringBase *TLI = getTLI();
328  const DataLayout &DL = this->getDataLayout();
329 
330  JumpTableSize = 0;
331  bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
332 
333  // Early exit if both a jump table and bit test are not allowed.
334  if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
335  return N;
336 
337  APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
338  APInt MinCaseVal = MaxCaseVal;
339  for (auto CI : SI.cases()) {
340  const APInt &CaseVal = CI.getCaseValue()->getValue();
341  if (CaseVal.sgt(MaxCaseVal))
342  MaxCaseVal = CaseVal;
343  if (CaseVal.slt(MinCaseVal))
344  MinCaseVal = CaseVal;
345  }
346 
347  // Check if suitable for a bit test
348  if (N <= DL.getIndexSizeInBits(0u)) {
350  for (auto I : SI.cases())
351  Dests.insert(I.getCaseSuccessor());
352 
353  if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
354  DL))
355  return 1;
356  }
357 
358  // Check if suitable for a jump table.
359  if (IsJTAllowed) {
360  if (N < 2 || N < TLI->getMinimumJumpTableEntries())
361  return N;
362  uint64_t Range =
363  (MaxCaseVal - MinCaseVal)
364  .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
365  // Check whether a range of clusters is dense enough for a jump table
366  if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
367  JumpTableSize = Range;
368  return 1;
369  }
370  }
371  return N;
372  }
373 
374  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }
375 
376  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }
377 
379  const TargetLoweringBase *TLI = getTLI();
382  }
383 
384  bool haveFastSqrt(Type *Ty) {
385  const TargetLoweringBase *TLI = getTLI();
386  EVT VT = TLI->getValueType(DL, Ty);
387  return TLI->isTypeLegal(VT) &&
389  }
390 
392  return true;
393  }
394 
395  unsigned getFPOpCost(Type *Ty) {
396  // Check whether FADD is available, as a proxy for floating-point in
397  // general.
398  const TargetLoweringBase *TLI = getTLI();
399  EVT VT = TLI->getValueType(DL, Ty);
403  }
404 
405  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
406  const TargetLoweringBase *TLI = getTLI();
407  switch (Opcode) {
408  default: break;
409  case Instruction::Trunc:
410  if (TLI->isTruncateFree(OpTy, Ty))
413  case Instruction::ZExt:
414  if (TLI->isZExtFree(OpTy, Ty))
417  }
418 
419  return BaseT::getOperationCost(Opcode, Ty, OpTy);
420  }
421 
/// Default inlining-threshold multiplier: no scaling.
unsigned getInliningThresholdMultiplier() { return 1; }
423 
426  // This unrolling functionality is target independent, but to provide some
427  // motivation for its intended use, for x86:
428 
429  // According to the Intel 64 and IA-32 Architectures Optimization Reference
430  // Manual, Intel Core models and later have a loop stream detector (and
431  // associated uop queue) that can benefit from partial unrolling.
432  // The relevant requirements are:
433  // - The loop must have no more than 4 (8 for Nehalem and later) branches
434  // taken, and none of them may be calls.
435  // - The loop can have no more than 18 (28 for Nehalem and later) uops.
436 
437  // According to the Software Optimization Guide for AMD Family 15h
438  // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
439  // and loop buffer which can benefit from partial unrolling.
440  // The relevant requirements are:
441  // - The loop must have fewer than 16 branches
442  // - The loop must have less than 40 uops in all executed loop branches
443 
444  // The number of taken branches in a loop is hard to estimate here, and
445  // benchmarking has revealed that it is better not to be conservative when
446  // estimating the branch count. As a result, we'll ignore the branch limits
447  // until someone finds a case where it matters in practice.
448 
449  unsigned MaxOps;
450  const TargetSubtargetInfo *ST = getST();
451  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
452  MaxOps = PartialUnrollingThreshold;
453  else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
454  MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
455  else
456  return;
457 
458  // Scan the loop: don't unroll loops with calls.
459  for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
460  ++I) {
461  BasicBlock *BB = *I;
462 
463  for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
464  if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
465  ImmutableCallSite CS(&*J);
466  if (const Function *F = CS.getCalledFunction()) {
467  if (!static_cast<T *>(this)->isLoweredToCall(F))
468  continue;
469  }
470 
471  return;
472  }
473  }
474 
475  // Enable runtime and partial unrolling up to the specified size.
476  // Enable using trip count upper bound to unroll loops.
477  UP.Partial = UP.Runtime = UP.UpperBound = true;
478  UP.PartialThreshold = MaxOps;
479 
480  // Avoid unrolling when optimizing for size.
481  UP.OptSizeThreshold = 0;
483 
484  // Set number of instructions optimized when "back edge"
485  // becomes "fall through" to default value of 2.
486  UP.BEInsns = 2;
487  }
488 
490  if (isa<LoadInst>(I))
491  return getST()->getSchedModel().DefaultLoadLatency;
492 
494  }
495 
496  /// @}
497 
498  /// \name Vector TTI Implementations
499  /// @{
500 
/// Default register-file size: one scalar register, no vector registers.
unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }
502 
503  unsigned getRegisterBitWidth(bool Vector) const { return 32; }
504 
505  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
506  /// are set if the result needs to be inserted and/or extracted from vectors.
507  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
508  assert(Ty->isVectorTy() && "Can only scalarize vectors");
509  unsigned Cost = 0;
510 
511  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
512  if (Insert)
513  Cost += static_cast<T *>(this)
514  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
515  if (Extract)
516  Cost += static_cast<T *>(this)
517  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
518  }
519 
520  return Cost;
521  }
522 
523  /// Estimate the overhead of scalarizing an instructions unique
524  /// non-constant operands. The types of the arguments are ordinarily
525  /// scalar, in which case the costs are multiplied with VF.
527  unsigned VF) {
528  unsigned Cost = 0;
529  SmallPtrSet<const Value*, 4> UniqueOperands;
530  for (const Value *A : Args) {
531  if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
532  Type *VecTy = nullptr;
533  if (A->getType()->isVectorTy()) {
534  VecTy = A->getType();
535  // If A is a vector operand, VF should be 1 or correspond to A.
536  assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
537  "Vector argument does not match VF");
538  }
539  else
540  VecTy = VectorType::get(A->getType(), VF);
541 
542  Cost += getScalarizationOverhead(VecTy, false, true);
543  }
544  }
545 
546  return Cost;
547  }
548 
550  assert(VecTy->isVectorTy());
551 
552  unsigned Cost = 0;
553 
554  Cost += getScalarizationOverhead(VecTy, true, false);
555  if (!Args.empty())
557  VecTy->getVectorNumElements());
558  else
559  // When no information on arguments is provided, we add the cost
560  // associated with one argument as a heuristic.
561  Cost += getScalarizationOverhead(VecTy, false, true);
562 
563  return Cost;
564  }
565 
/// Default maximum interleave factor: no interleaving.
unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
567 
569  unsigned Opcode, Type *Ty,
575  // Check if any of the operands are vector operands.
576  const TargetLoweringBase *TLI = getTLI();
577  int ISD = TLI->InstructionOpcodeToISD(Opcode);
578  assert(ISD && "Invalid opcode");
579 
580  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
581 
582  bool IsFloat = Ty->isFPOrFPVectorTy();
583  // Assume that floating point arithmetic operations cost twice as much as
584  // integer operations.
585  unsigned OpCost = (IsFloat ? 2 : 1);
586 
587  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
588  // The operation is legal. Assume it costs 1.
589  // TODO: Once we have extract/insert subvector cost we need to use them.
590  return LT.first * OpCost;
591  }
592 
593  if (!TLI->isOperationExpand(ISD, LT.second)) {
594  // If the operation is custom lowered, then assume that the code is twice
595  // as expensive.
596  return LT.first * 2 * OpCost;
597  }
598 
599  // Else, assume that we need to scalarize this op.
600  // TODO: If one of the types get legalized by splitting, handle this
601  // similarly to what getCastInstrCost() does.
602  if (Ty->isVectorTy()) {
603  unsigned Num = Ty->getVectorNumElements();
604  unsigned Cost = static_cast<T *>(this)
605  ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
606  // Return the cost of multiple scalar invocation plus the cost of
607  // inserting and extracting the values.
608  return getScalarizationOverhead(Ty, Args) + Num * Cost;
609  }
610 
611  // We don't know anything about this scalar instruction.
612  return OpCost;
613  }
614 
615  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
616  Type *SubTp) {
617  switch (Kind) {
618  case TTI::SK_Broadcast:
619  return getBroadcastShuffleOverhead(Tp);
620  case TTI::SK_Select:
621  case TTI::SK_Reverse:
622  case TTI::SK_Transpose:
625  return getPermuteShuffleOverhead(Tp);
627  return getExtractSubvectorOverhead(Tp, Index, SubTp);
629  return getInsertSubvectorOverhead(Tp, Index, SubTp);
630  }
631  llvm_unreachable("Unknown TTI::ShuffleKind");
632  }
633 
634  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
635  const Instruction *I = nullptr) {
636  const TargetLoweringBase *TLI = getTLI();
637  int ISD = TLI->InstructionOpcodeToISD(Opcode);
638  assert(ISD && "Invalid opcode");
639  std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
640  std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
641 
642  // Check for NOOP conversions.
643  if (SrcLT.first == DstLT.first &&
644  SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
645 
646  // Bitcast between types that are legalized to the same type are free.
647  if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
648  return 0;
649  }
650 
651  if (Opcode == Instruction::Trunc &&
652  TLI->isTruncateFree(SrcLT.second, DstLT.second))
653  return 0;
654 
655  if (Opcode == Instruction::ZExt &&
656  TLI->isZExtFree(SrcLT.second, DstLT.second))
657  return 0;
658 
659  if (Opcode == Instruction::AddrSpaceCast &&
661  Dst->getPointerAddressSpace()))
662  return 0;
663 
664  // If this is a zext/sext of a load, return 0 if the corresponding
665  // extending load exists on target.
666  if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
667  I && isa<LoadInst>(I->getOperand(0))) {
668  EVT ExtVT = EVT::getEVT(Dst);
669  EVT LoadVT = EVT::getEVT(Src);
670  unsigned LType =
671  ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
672  if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
673  return 0;
674  }
675 
676  // If the cast is marked as legal (or promote) then assume low cost.
677  if (SrcLT.first == DstLT.first &&
678  TLI->isOperationLegalOrPromote(ISD, DstLT.second))
679  return 1;
680 
681  // Handle scalar conversions.
682  if (!Src->isVectorTy() && !Dst->isVectorTy()) {
683  // Scalar bitcasts are usually free.
684  if (Opcode == Instruction::BitCast)
685  return 0;
686 
687  // Just check the op cost. If the operation is legal then assume it costs
688  // 1.
689  if (!TLI->isOperationExpand(ISD, DstLT.second))
690  return 1;
691 
692  // Assume that illegal scalar instruction are expensive.
693  return 4;
694  }
695 
696  // Check vector-to-vector casts.
697  if (Dst->isVectorTy() && Src->isVectorTy()) {
698  // If the cast is between same-sized registers, then the check is simple.
699  if (SrcLT.first == DstLT.first &&
700  SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
701 
702  // Assume that Zext is done using AND.
703  if (Opcode == Instruction::ZExt)
704  return 1;
705 
706  // Assume that sext is done using SHL and SRA.
707  if (Opcode == Instruction::SExt)
708  return 2;
709 
710  // Just check the op cost. If the operation is legal then assume it
711  // costs
712  // 1 and multiply by the type-legalization overhead.
713  if (!TLI->isOperationExpand(ISD, DstLT.second))
714  return SrcLT.first * 1;
715  }
716 
717  // If we are legalizing by splitting, query the concrete TTI for the cost
718  // of casting the original vector twice. We also need to factor in the
719  // cost of the split itself. Count that as 1, to be consistent with
720  // TLI->getTypeLegalizationCost().
721  if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
723  (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
725  Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
726  Dst->getVectorNumElements() / 2);
727  Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
728  Src->getVectorNumElements() / 2);
729  T *TTI = static_cast<T *>(this);
730  return TTI->getVectorSplitCost() +
731  (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
732  }
733 
734  // In other cases where the source or destination are illegal, assume
735  // the operation will get scalarized.
736  unsigned Num = Dst->getVectorNumElements();
737  unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
738  Opcode, Dst->getScalarType(), Src->getScalarType(), I);
739 
740  // Return the cost of multiple scalar invocation plus the cost of
741  // inserting and extracting the values.
742  return getScalarizationOverhead(Dst, true, true) + Num * Cost;
743  }
744 
745  // We already handled vector-to-vector and scalar-to-scalar conversions.
746  // This
747  // is where we handle bitcast between vectors and scalars. We need to assume
748  // that the conversion is scalarized in one way or another.
749  if (Opcode == Instruction::BitCast)
750  // Illegal bitcasts are done by storing and loading from a stack slot.
751  return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
752  : 0) +
753  (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
754  : 0);
755 
756  llvm_unreachable("Unhandled cast");
757  }
758 
759  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
760  VectorType *VecTy, unsigned Index) {
761  return static_cast<T *>(this)->getVectorInstrCost(
762  Instruction::ExtractElement, VecTy, Index) +
763  static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
764  VecTy->getElementType());
765  }
766 
/// Control-flow instructions are modeled as free: branches are assumed to
/// be predicted.
unsigned getCFInstrCost(unsigned Opcode) {
  return 0;
}
771 
772  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
773  const Instruction *I) {
774  const TargetLoweringBase *TLI = getTLI();
775  int ISD = TLI->InstructionOpcodeToISD(Opcode);
776  assert(ISD && "Invalid opcode");
777 
778  // Selects on vectors are actually vector selects.
779  if (ISD == ISD::SELECT) {
780  assert(CondTy && "CondTy must exist");
781  if (CondTy->isVectorTy())
782  ISD = ISD::VSELECT;
783  }
784  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
785 
786  if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
787  !TLI->isOperationExpand(ISD, LT.second)) {
788  // The operation is legal. Assume it costs 1. Multiply
789  // by the type-legalization overhead.
790  return LT.first * 1;
791  }
792 
793  // Otherwise, assume that the cast is scalarized.
794  // TODO: If one of the types get legalized by splitting, handle this
795  // similarly to what getCastInstrCost() does.
796  if (ValTy->isVectorTy()) {
797  unsigned Num = ValTy->getVectorNumElements();
798  if (CondTy)
799  CondTy = CondTy->getScalarType();
800  unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
801  Opcode, ValTy->getScalarType(), CondTy, I);
802 
803  // Return the cost of multiple scalar invocation plus the cost of
804  // inserting and extracting the values.
805  return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
806  }
807 
808  // Unknown scalar opcode.
809  return 1;
810  }
811 
812  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
813  std::pair<unsigned, MVT> LT =
814  getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
815 
816  return LT.first;
817  }
818 
819  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
820  unsigned AddressSpace, const Instruction *I = nullptr) {
821  assert(!Src->isVoidTy() && "Invalid type");
822  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
823 
824  // Assuming that all loads of legal types cost 1.
825  unsigned Cost = LT.first;
826 
827  if (Src->isVectorTy() &&
828  Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
829  // This is a vector load that legalizes to a larger type than the vector
830  // itself. Unless the corresponding extending load or truncating store is
831  // legal, then this will scalarize.
833  EVT MemVT = getTLI()->getValueType(DL, Src);
834  if (Opcode == Instruction::Store)
835  LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
836  else
837  LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
838 
839  if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
840  // This is a vector load/store for some illegal type that is scalarized.
841  // We must account for the cost of building or decomposing the vector.
842  Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
843  Opcode == Instruction::Store);
844  }
845  }
846 
847  return Cost;
848  }
849 
850  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
851  unsigned Factor,
852  ArrayRef<unsigned> Indices,
853  unsigned Alignment, unsigned AddressSpace,
854  bool UseMaskForCond = false,
855  bool UseMaskForGaps = false) {
856  VectorType *VT = dyn_cast<VectorType>(VecTy);
857  assert(VT && "Expect a vector type for interleaved memory op");
858 
859  unsigned NumElts = VT->getNumElements();
860  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
861 
862  unsigned NumSubElts = NumElts / Factor;
863  VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
864 
865  // Firstly, the cost of load/store operation.
866  unsigned Cost;
867  if (UseMaskForCond || UseMaskForGaps)
868  Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
869  Opcode, VecTy, Alignment, AddressSpace);
870  else
871  Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
872  AddressSpace);
873 
874  // Legalize the vector type, and get the legalized and unlegalized type
875  // sizes.
876  MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
877  unsigned VecTySize =
878  static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
879  unsigned VecTyLTSize = VecTyLT.getStoreSize();
880 
881  // Return the ceiling of dividing A by B.
882  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
883 
884  // Scale the cost of the memory operation by the fraction of legalized
885  // instructions that will actually be used. We shouldn't account for the
886  // cost of dead instructions since they will be removed.
887  //
888  // E.g., An interleaved load of factor 8:
889  // %vec = load <16 x i64>, <16 x i64>* %ptr
890  // %v0 = shufflevector %vec, undef, <0, 8>
891  //
892  // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
893  // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
894  // type). The other loads are unused.
895  //
896  // We only scale the cost of loads since interleaved store groups aren't
897  // allowed to have gaps.
898  if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
899  // The number of loads of a legal type it will take to represent a load
900  // of the unlegalized vector type.
901  unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
902 
903  // The number of elements of the unlegalized type that correspond to a
904  // single legal instruction.
905  unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
906 
907  // Determine which legal instructions will be used.
908  BitVector UsedInsts(NumLegalInsts, false);
909  for (unsigned Index : Indices)
910  for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
911  UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
912 
913  // Scale the cost of the load by the fraction of legal instructions that
914  // will be used.
915  Cost *= UsedInsts.count() / NumLegalInsts;
916  }
917 
918  // Then plus the cost of interleave operation.
919  if (Opcode == Instruction::Load) {
920  // The interleave cost is similar to extract sub vectors' elements
921  // from the wide vector, and insert them into sub vectors.
922  //
923  // E.g. An interleaved load of factor 2 (with one member of index 0):
924  // %vec = load <8 x i32>, <8 x i32>* %ptr
925  // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
926  // The cost is estimated as extract elements at 0, 2, 4, 6 from the
927  // <8 x i32> vector and insert them into a <4 x i32> vector.
928 
929  assert(Indices.size() <= Factor &&
930  "Interleaved memory op has too many members");
931 
932  for (unsigned Index : Indices) {
933  assert(Index < Factor && "Invalid index for interleaved memory op");
934 
935  // Extract elements from loaded vector for each sub vector.
936  for (unsigned i = 0; i < NumSubElts; i++)
937  Cost += static_cast<T *>(this)->getVectorInstrCost(
938  Instruction::ExtractElement, VT, Index + i * Factor);
939  }
940 
941  unsigned InsSubCost = 0;
942  for (unsigned i = 0; i < NumSubElts; i++)
943  InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
944  Instruction::InsertElement, SubVT, i);
945 
946  Cost += Indices.size() * InsSubCost;
947  } else {
948  // The interleave cost is extract all elements from sub vectors, and
949  // insert them into the wide vector.
950  //
951  // E.g. An interleaved store of factor 2:
952  // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
953  // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
954  // The cost is estimated as extract all elements from both <4 x i32>
955  // vectors and insert into the <8 x i32> vector.
956 
957  unsigned ExtSubCost = 0;
958  for (unsigned i = 0; i < NumSubElts; i++)
959  ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
960  Instruction::ExtractElement, SubVT, i);
961  Cost += ExtSubCost * Factor;
962 
963  for (unsigned i = 0; i < NumElts; i++)
964  Cost += static_cast<T *>(this)
965  ->getVectorInstrCost(Instruction::InsertElement, VT, i);
966  }
967 
968  if (!UseMaskForCond)
969  return Cost;
970 
971  Type *I8Type = Type::getInt8Ty(VT->getContext());
972  VectorType *MaskVT = VectorType::get(I8Type, NumElts);
973  SubVT = VectorType::get(I8Type, NumSubElts);
974 
975  // The Mask shuffling cost is extract all the elements of the Mask
976  // and insert each of them Factor times into the wide vector:
977  //
978  // E.g. an interleaved group with factor 3:
979  // %mask = icmp ult <8 x i32> %vec1, %vec2
980  // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
981  // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
982  // The cost is estimated as extract all mask elements from the <8xi1> mask
983  // vector and insert them factor times into the <24xi1> shuffled mask
984  // vector.
985  for (unsigned i = 0; i < NumSubElts; i++)
986  Cost += static_cast<T *>(this)->getVectorInstrCost(
987  Instruction::ExtractElement, SubVT, i);
988 
989  for (unsigned i = 0; i < NumElts; i++)
990  Cost += static_cast<T *>(this)->getVectorInstrCost(
991  Instruction::InsertElement, MaskVT, i);
992 
993  // The Gaps mask is invariant and created outside the loop, therefore the
994  // cost of creating it is not accounted for here. However if we have both
995  // a MaskForGaps and some other mask that guards the execution of the
996  // memory access, we need to account for the cost of And-ing the two masks
997  // inside the loop.
998  if (UseMaskForGaps)
999  Cost += static_cast<T *>(this)->getArithmeticInstrCost(
1000  BinaryOperator::And, MaskVT);
1001 
1002  return Cost;
1003  }
1004 
1005  /// Get intrinsic cost based on arguments.
1008  unsigned VF = 1) {
1009  unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
1010  assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
1011 
1012  switch (IID) {
1013  default: {
1014  // Assume that we need to scalarize this intrinsic.
1015  SmallVector<Type *, 4> Types;
1016  for (Value *Op : Args) {
1017  Type *OpTy = Op->getType();
1018  assert(VF == 1 || !OpTy->isVectorTy());
1019  Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
1020  }
1021 
1022  if (VF > 1 && !RetTy->isVoidTy())
1023  RetTy = VectorType::get(RetTy, VF);
1024 
1025  // Compute the scalarization overhead based on Args for a vector
1026  // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
1027  // CostModel will pass a vector RetTy and VF is 1.
1028  unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
1029  if (RetVF > 1 || VF > 1) {
1030  ScalarizationCost = 0;
1031  if (!RetTy->isVoidTy())
1032  ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
1033  ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
1034  }
1035 
1036  return static_cast<T *>(this)->
1037  getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
1038  }
1039  case Intrinsic::masked_scatter: {
1040  assert(VF == 1 && "Can't vectorize types here.");
1041  Value *Mask = Args[3];
1042  bool VarMask = !isa<Constant>(Mask);
1043  unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
1044  return
1045  static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
1046  Args[0]->getType(),
1047  Args[1], VarMask,
1048  Alignment);
1049  }
1050  case Intrinsic::masked_gather: {
1051  assert(VF == 1 && "Can't vectorize types here.");
1052  Value *Mask = Args[2];
1053  bool VarMask = !isa<Constant>(Mask);
1054  unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
1055  return
1056  static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
1057  RetTy, Args[0], VarMask,
1058  Alignment);
1059  }
1060  case Intrinsic::experimental_vector_reduce_add:
1061  case Intrinsic::experimental_vector_reduce_mul:
1062  case Intrinsic::experimental_vector_reduce_and:
1063  case Intrinsic::experimental_vector_reduce_or:
1064  case Intrinsic::experimental_vector_reduce_xor:
1065  case Intrinsic::experimental_vector_reduce_fadd:
1066  case Intrinsic::experimental_vector_reduce_fmul:
1067  case Intrinsic::experimental_vector_reduce_smax:
1068  case Intrinsic::experimental_vector_reduce_smin:
1069  case Intrinsic::experimental_vector_reduce_fmax:
1070  case Intrinsic::experimental_vector_reduce_fmin:
1071  case Intrinsic::experimental_vector_reduce_umax:
1072  case Intrinsic::experimental_vector_reduce_umin:
1073  return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
1074  }
1075  }
1076 
1077  /// Get intrinsic cost based on argument types.
1078  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1079  /// cost of scalarizing the arguments and the return value will be computed
1080  /// based on types.
1082  Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
1083  unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
1085  unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
1086  switch (IID) {
1087  default: {
1088  // Assume that we need to scalarize this intrinsic.
1089  unsigned ScalarizationCost = ScalarizationCostPassed;
1090  unsigned ScalarCalls = 1;
1091  Type *ScalarRetTy = RetTy;
1092  if (RetTy->isVectorTy()) {
1093  if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1094  ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
1095  ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
1096  ScalarRetTy = RetTy->getScalarType();
1097  }
1098  SmallVector<Type *, 4> ScalarTys;
1099  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1100  Type *Ty = Tys[i];
1101  if (Ty->isVectorTy()) {
1102  if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1103  ScalarizationCost += getScalarizationOverhead(Ty, false, true);
1104  ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
1105  Ty = Ty->getScalarType();
1106  }
1107  ScalarTys.push_back(Ty);
1108  }
1109  if (ScalarCalls == 1)
1110  return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1111 
1112  unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
1113  IID, ScalarRetTy, ScalarTys, FMF);
1114 
1115  return ScalarCalls * ScalarCost + ScalarizationCost;
1116  }
1117  // Look for intrinsics that can be lowered directly or turned into a scalar
1118  // intrinsic call.
1119  case Intrinsic::sqrt:
1120  ISDs.push_back(ISD::FSQRT);
1121  break;
1122  case Intrinsic::sin:
1123  ISDs.push_back(ISD::FSIN);
1124  break;
1125  case Intrinsic::cos:
1126  ISDs.push_back(ISD::FCOS);
1127  break;
1128  case Intrinsic::exp:
1129  ISDs.push_back(ISD::FEXP);
1130  break;
1131  case Intrinsic::exp2:
1132  ISDs.push_back(ISD::FEXP2);
1133  break;
1134  case Intrinsic::log:
1135  ISDs.push_back(ISD::FLOG);
1136  break;
1137  case Intrinsic::log10:
1138  ISDs.push_back(ISD::FLOG10);
1139  break;
1140  case Intrinsic::log2:
1141  ISDs.push_back(ISD::FLOG2);
1142  break;
1143  case Intrinsic::fabs:
1144  ISDs.push_back(ISD::FABS);
1145  break;
1146  case Intrinsic::canonicalize:
1148  break;
1149  case Intrinsic::minnum:
1150  ISDs.push_back(ISD::FMINNUM);
1151  if (FMF.noNaNs())
1152  ISDs.push_back(ISD::FMINIMUM);
1153  break;
1154  case Intrinsic::maxnum:
1155  ISDs.push_back(ISD::FMAXNUM);
1156  if (FMF.noNaNs())
1157  ISDs.push_back(ISD::FMAXIMUM);
1158  break;
1159  case Intrinsic::copysign:
1160  ISDs.push_back(ISD::FCOPYSIGN);
1161  break;
1162  case Intrinsic::floor:
1163  ISDs.push_back(ISD::FFLOOR);
1164  break;
1165  case Intrinsic::ceil:
1166  ISDs.push_back(ISD::FCEIL);
1167  break;
1168  case Intrinsic::trunc:
1169  ISDs.push_back(ISD::FTRUNC);
1170  break;
1171  case Intrinsic::nearbyint:
1172  ISDs.push_back(ISD::FNEARBYINT);
1173  break;
1174  case Intrinsic::rint:
1175  ISDs.push_back(ISD::FRINT);
1176  break;
1177  case Intrinsic::round:
1178  ISDs.push_back(ISD::FROUND);
1179  break;
1180  case Intrinsic::pow:
1181  ISDs.push_back(ISD::FPOW);
1182  break;
1183  case Intrinsic::fma:
1184  ISDs.push_back(ISD::FMA);
1185  break;
1186  case Intrinsic::fmuladd:
1187  ISDs.push_back(ISD::FMA);
1188  break;
1189  // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1190  case Intrinsic::lifetime_start:
1191  case Intrinsic::lifetime_end:
1192  case Intrinsic::sideeffect:
1193  return 0;
1194  case Intrinsic::masked_store:
1195  return static_cast<T *>(this)
1196  ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
1197  case Intrinsic::masked_load:
1198  return static_cast<T *>(this)
1199  ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
1200  case Intrinsic::experimental_vector_reduce_add:
1201  return static_cast<T *>(this)->getArithmeticReductionCost(
1202  Instruction::Add, Tys[0], /*IsPairwiseForm=*/false);
1203  case Intrinsic::experimental_vector_reduce_mul:
1204  return static_cast<T *>(this)->getArithmeticReductionCost(
1205  Instruction::Mul, Tys[0], /*IsPairwiseForm=*/false);
1206  case Intrinsic::experimental_vector_reduce_and:
1207  return static_cast<T *>(this)->getArithmeticReductionCost(
1208  Instruction::And, Tys[0], /*IsPairwiseForm=*/false);
1209  case Intrinsic::experimental_vector_reduce_or:
1210  return static_cast<T *>(this)->getArithmeticReductionCost(
1211  Instruction::Or, Tys[0], /*IsPairwiseForm=*/false);
1212  case Intrinsic::experimental_vector_reduce_xor:
1213  return static_cast<T *>(this)->getArithmeticReductionCost(
1214  Instruction::Xor, Tys[0], /*IsPairwiseForm=*/false);
1215  case Intrinsic::experimental_vector_reduce_fadd:
1216  return static_cast<T *>(this)->getArithmeticReductionCost(
1217  Instruction::FAdd, Tys[0], /*IsPairwiseForm=*/false);
1218  case Intrinsic::experimental_vector_reduce_fmul:
1219  return static_cast<T *>(this)->getArithmeticReductionCost(
1220  Instruction::FMul, Tys[0], /*IsPairwiseForm=*/false);
1221  case Intrinsic::experimental_vector_reduce_smax:
1222  case Intrinsic::experimental_vector_reduce_smin:
1223  case Intrinsic::experimental_vector_reduce_fmax:
1224  case Intrinsic::experimental_vector_reduce_fmin:
1225  return static_cast<T *>(this)->getMinMaxReductionCost(
1226  Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
1227  /*IsSigned=*/true);
1228  case Intrinsic::experimental_vector_reduce_umax:
1229  case Intrinsic::experimental_vector_reduce_umin:
1230  return static_cast<T *>(this)->getMinMaxReductionCost(
1231  Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
1232  /*IsSigned=*/false);
1233  case Intrinsic::ctpop:
1234  ISDs.push_back(ISD::CTPOP);
1235  // In case of legalization use TCC_Expensive. This is cheaper than a
1236  // library call but still not a cheap instruction.
1237  SingleCallCost = TargetTransformInfo::TCC_Expensive;
1238  break;
1239  // FIXME: ctlz, cttz, ...
1240  }
1241 
1242  const TargetLoweringBase *TLI = getTLI();
1243  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1244 
1245  SmallVector<unsigned, 2> LegalCost;
1246  SmallVector<unsigned, 2> CustomCost;
1247  for (unsigned ISD : ISDs) {
1248  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1249  if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1250  TLI->isFAbsFree(LT.second)) {
1251  return 0;
1252  }
1253 
1254  // The operation is legal. Assume it costs 1.
1255  // If the type is split to multiple registers, assume that there is some
1256  // overhead to this.
1257  // TODO: Once we have extract/insert subvector cost we need to use them.
1258  if (LT.first > 1)
1259  LegalCost.push_back(LT.first * 2);
1260  else
1261  LegalCost.push_back(LT.first * 1);
1262  } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1263  // If the operation is custom lowered then assume
1264  // that the code is twice as expensive.
1265  CustomCost.push_back(LT.first * 2);
1266  }
1267  }
1268 
1269  auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1270  if (MinLegalCostI != LegalCost.end())
1271  return *MinLegalCostI;
1272 
1273  auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
1274  if (MinCustomCostI != CustomCost.end())
1275  return *MinCustomCostI;
1276 
1277  // If we can't lower fmuladd into an FMA estimate the cost as a floating
1278  // point mul followed by an add.
1279  if (IID == Intrinsic::fmuladd)
1280  return static_cast<T *>(this)
1281  ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
1282  static_cast<T *>(this)
1283  ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
1284 
1285  // Else, assume that we need to scalarize this intrinsic. For math builtins
1286  // this will emit a costly libcall, adding call overhead and spills. Make it
1287  // very expensive.
1288  if (RetTy->isVectorTy()) {
1289  unsigned ScalarizationCost =
1290  ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
1291  ? ScalarizationCostPassed
1292  : getScalarizationOverhead(RetTy, true, false));
1293  unsigned ScalarCalls = RetTy->getVectorNumElements();
1294  SmallVector<Type *, 4> ScalarTys;
1295  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1296  Type *Ty = Tys[i];
1297  if (Ty->isVectorTy())
1298  Ty = Ty->getScalarType();
1299  ScalarTys.push_back(Ty);
1300  }
1301  unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
1302  IID, RetTy->getScalarType(), ScalarTys, FMF);
1303  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1304  if (Tys[i]->isVectorTy()) {
1305  if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1306  ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
1307  ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
1308  }
1309  }
1310 
1311  return ScalarCalls * ScalarCost + ScalarizationCost;
1312  }
1313 
1314  // This is going to be turned into a library call, make it expensive.
1315  return SingleCallCost;
1316  }
1317 
1318  /// Compute a cost of the given call instruction.
1319  ///
1320  /// Compute the cost of calling function F with return type RetTy and
1321  /// argument types Tys. F might be nullptr, in this case the cost of an
1322  /// arbitrary call with the specified signature will be returned.
1323  /// This is used, for instance, when we estimate call of a vector
1324  /// counterpart of the given function.
1325  /// \param F Called function, might be nullptr.
1326  /// \param RetTy Return value types.
1327  /// \param Tys Argument types.
1328  /// \returns The cost of Call instruction.
1330  return 10;
1331  }
1332 
1333  unsigned getNumberOfParts(Type *Tp) {
1334  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
1335  return LT.first;
1336  }
1337 
1339  const SCEV *) {
1340  return 0;
1341  }
1342 
  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
  /// We're assuming that reduction operation are performing the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
  /// \----------------v-------------/ \----------v------------/
  /// n/2 elements n/2 elements
  /// %red1 = op <n x t> %val, <n x t> val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful, the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
  /// \----------------v-------------/ \----------v------------/
  /// n/4 elements 3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
  /// length n/2, the resulting vector has length n/4 etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for pairwise kind of reductions.
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  /// \-------------v----------/ \----------v------------/
  /// n/2 elements n/2 elements
  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  /// \-------------v----------/ \----------v------------/
  /// n/2 elements n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> val2
  /// Again, the operation is performed on <n x t> vector, but the resulting
  /// vector %red1 is <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                      bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    // One reduction level per halving of the vector width.
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    // Widest vector the target handles natively; levels above it are charged
    // per-level in the loop, the rest in one lump below.
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      // Assume the pairwise shuffles add a cost.
      // (IsPairwise + 1) charges two shuffles for pairwise form, one otherwise.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
      // Continue costing at the narrower type from here on.
      Ty = SubTy;
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    ArithCost += (NumReduxLevels - LongVectorCount) *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    // Plus the cost of extracting the final scalar result.
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }
1414 
  /// Try to calculate op costs for min/max reduction operations.
  /// Uses the same halving strategy as getArithmeticReductionCost, but each
  /// level costs a compare + select pair instead of one arithmetic op.
  /// \param CondTy Conditional type for the Select instruction.
  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                  bool) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    Type *ScalarCondTy = CondTy->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    // FP vectors reduce via fcmp, integer vectors via icmp.
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    unsigned MinMaxCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    // Widest natively supported vector length for this type.
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      // Each halving level needs a vector compare plus a vector select.
      MinMaxCost +=
          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                          nullptr);
      // Narrow both the value type and the condition type for the next level.
      Ty = SubTy;
      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    MinMaxCost +=
        (NumReduxLevels - LongVectorCount) *
        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                         nullptr));
    // Need 3 extractelement instructions for scalarization + an additional
    // scalar select instruction.
    return ShuffleCost + MinMaxCost +
           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
                                        /*Extract=*/true) +
           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
                                           ScalarCondTy, nullptr);
  }
1475 
  /// Extra cost charged when an operation must be split across vector halves.
  unsigned getVectorSplitCost() { return 1; }
1477 
1478  /// @}
1479 };
1480 
1481 /// Concrete BasicTTIImpl that can be used if no further customization
1482 /// is needed.
1483 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
1485 
1487 
1488  const TargetSubtargetInfo *ST;
1489  const TargetLoweringBase *TLI;
1490 
1491  const TargetSubtargetInfo *getST() const { return ST; }
1492  const TargetLoweringBase *getTLI() const { return TLI; }
1493 
1494 public:
1495  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
1496 };
1497 
1498 } // end namespace llvm
1499 
1500 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
Type * getVectorElementType() const
Definition: Type.h:371
unsigned getNumCases() const
Return the number of 'cases' in this switch instruction, excluding the default case.
Base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class...
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:581
BitVector & set()
Definition: BitVector.h:398
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
Definition: BasicTTIImpl.h:568
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:365
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
iterator_range< CaseIt > cases()
Iteration adapter for range-for loops.
LLVMContext & Context
bool noNaNs() const
Definition: Operator.h:200
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment)
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:884
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0...
Definition: ISDOpcodes.h:592
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:507
The main scalar evolution driver.
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1198
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value *> Operands)
Definition: BasicTTIImpl.h:278
MemIndexedMode
The type of load/store indexing.
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
CaseIt case_begin()
Returns a read/write iterator that points to the first case in the SwitchInst.
unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy)
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1268
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:192
int getExtCost(const Instruction *I, const Value *Src)
Definition: BasicTTIImpl.h:283
F(f)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:168
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:267
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:230
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:78
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
bool isOperationLegalOrCustom(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: BasicTTIImpl.h:566
unsigned getJumpBufAlignment() const
Returns the target's jmp_buf alignment in bytes (if never set, the default is 0)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:263
This file provides helpers for the implementation of a TargetTransformInfo-conforming class...
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class...
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Value *> Args, FastMathFlags FMF, unsigned VF=1)
Get intrinsic cost based on arguments.
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
unsigned getOperandsScalarizationOverhead(ArrayRef< const Value *> Args, unsigned VF)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:526
unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
Definition: BasicTTIImpl.h:759
int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:253
unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise)
Try to calculate arithmetic and shuffle op costs for reduction operations.
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, unsigned Alignment, bool *Fast) const
Definition: BasicTTIImpl.h:200
This file implements a class to represent arbitrary precision integral constant values and operations...
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
unsigned getRegisterBitWidth(bool Vector) const
Definition: BasicTTIImpl.h:503
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:43
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I)
Definition: BasicTTIImpl.h:772
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:634
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:418
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:391
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
Selects elements from the corresponding lane of either source operand.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:203
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
Reverse the order of the vector.
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:243
unsigned getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:395
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:263
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return &#39;this&#39;.
Definition: Type.h:304
ExtractSubvector Index indicates start offset.
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:141
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:424
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Machine Value Type.
Concrete BasicTTIImpl that can be used if no further customization is needed.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< const Value *> Arguments)
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:850
Simple binary floating point operators.
Definition: ISDOpcodes.h:276
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
This file contains the declarations for the subclasses of Constant, which represent the different fla...
unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise, bool)
Try to calculate op costs for min/max reduction operations.
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed=std::numeric_limits< unsigned >::max())
Get intrinsic cost based on argument types.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
Expected to fold away in lowering.
AMDGPU Lower Kernel Arguments
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> ParamTys)
Definition: BasicTTIImpl.h:300
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Merge elements from two source vectors into one with any shuffle mask.
unsigned getNumberOfParts(Type *Tp)
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
static double log2(double V)
virtual bool isProfitableToHoist(Instruction *I) const
Extended Value Type.
Definition: ValueTypes.h:34
static wasm::ValType getType(const TargetRegisterClass *RC)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1238
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:218
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
OperandValueProperties
Additional properties of an operand&#39;s values.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned getCFInstrCost(unsigned Opcode)
Definition: BasicTTIImpl.h:767
size_type size() const
Definition: SmallPtrSet.h:93
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp)
Definition: BasicTTIImpl.h:615
unsigned getNumberOfRegisters(bool Vector)
Definition: BasicTTIImpl.h:501
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:312
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:173
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
int getInstructionLatency(const Instruction *I)
Definition: BasicTTIImpl.h:489
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, unsigned Align=1, bool *=nullptr) const
Determine if the target supports unaligned memory accesses.
iterator end()
Definition: BasicBlock.h:265
unsigned getJumpBufSize() const
Returns the target&#39;s jmp_buf size in bytes (if never set, the default is 200)
unsigned getVectorSplitCost()
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
AddressSpace
Definition: NVPTXBaseInfo.h:22
cl::opt< unsigned > PartialUnrollingThreshold
static const unsigned DefaultLoadLatency
Definition: MCSchedule.h:284
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:539
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
Class to represent vector types.
Definition: DerivedTypes.h:393
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Class for arbitrary precision integers.
Definition: APInt.h:70
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:409
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:819
unsigned LoopMicroOpBufferSize
Definition: MCSchedule.h:279
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:305
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:211
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:213
TargetSubtargetInfo - Generic base class for all target subtargets.
bool isLSRCostLess(TTI::LSRCost &C1, TTI::LSRCost &C2)
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace)
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:625
unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< const Value *> Arguments)
Definition: BasicTTIImpl.h:295
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy)
Definition: BasicTTIImpl.h:405
This class represents an analyzed expression in the program.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:459
Parameters that control the generic loop unrolling transformation.
unsigned getJumpBufAlignment()
Definition: BasicTTIImpl.h:374
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable)...
Establish a view to a call site for examination.
Definition: CallSite.h:714
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return &#39;Legal&#39;) or we ...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value *> Operands)
unsigned getInliningThresholdMultiplier()
Definition: BasicTTIImpl.h:422
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:812
block_iterator block_end() const
Definition: LoopInfo.h:155
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:309
InsertSubvector. Index indicates start offset.
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:249
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize)
Definition: BasicTTIImpl.h:317
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it&#39;s an indirect...
Definition: CallSite.h:107
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:185
const unsigned Kind
Multiway switch.
unsigned getScalarizationOverhead(Type *VecTy, ArrayRef< const Value *> Args)
Definition: BasicTTIImpl.h:549
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
The cost of a typical &#39;add&#39; instruction.
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:209
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
LLVM Value Representation.
Definition: Value.h:73
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:295
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:411
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:606
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E&#39;s largest value.
Definition: BitmaskEnum.h:81
bool isOperationLegalOrPromote(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target or can be made legal using promotion...
Broadcast element 0 to all other elements.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:273
Type * getElementType() const
Definition: DerivedTypes.h:360
bool UpperBound
Allow using trip count upper bound to unroll loops.
virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target...
const DataLayout & getDataLayout() const
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine, etc.).
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:237
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type *> Tys)
Compute a cost of the given call instruction.
OperandValueKind
Additional information about an operand&#39;s possible values.
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:384
This pass exposes codegen information to IR-level passes.
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:226
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:222
block_iterator block_begin() const
Definition: LoopInfo.h:154
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
The cost of a &#39;div&#39; instruction on x86.
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget&#39;s CPU.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1227
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:144
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
This file describes how to lower LLVM code to machine code.
const BasicBlock * getParent() const
Definition: Instruction.h:67
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Shuffle elements of single source vector with any shuffle mask.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:901
BRIND - Indirect branch.
Definition: ISDOpcodes.h:621