Line data Source code
1 : //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// This file provides a helper that implements much of the TTI interface in
12 : /// terms of the target-independent code generator and TargetLowering
13 : /// interfaces.
14 : //
15 : //===----------------------------------------------------------------------===//
16 :
17 : #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
18 : #define LLVM_CODEGEN_BASICTTIIMPL_H
19 :
20 : #include "llvm/ADT/APInt.h"
21 : #include "llvm/ADT/ArrayRef.h"
22 : #include "llvm/ADT/BitVector.h"
23 : #include "llvm/ADT/SmallPtrSet.h"
24 : #include "llvm/ADT/SmallVector.h"
25 : #include "llvm/Analysis/LoopInfo.h"
26 : #include "llvm/Analysis/TargetTransformInfo.h"
27 : #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 : #include "llvm/CodeGen/ISDOpcodes.h"
29 : #include "llvm/CodeGen/TargetLowering.h"
30 : #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 : #include "llvm/CodeGen/ValueTypes.h"
32 : #include "llvm/IR/BasicBlock.h"
33 : #include "llvm/IR/CallSite.h"
34 : #include "llvm/IR/Constant.h"
35 : #include "llvm/IR/Constants.h"
36 : #include "llvm/IR/DataLayout.h"
37 : #include "llvm/IR/DerivedTypes.h"
38 : #include "llvm/IR/InstrTypes.h"
39 : #include "llvm/IR/Instruction.h"
40 : #include "llvm/IR/Instructions.h"
41 : #include "llvm/IR/Intrinsics.h"
42 : #include "llvm/IR/Operator.h"
43 : #include "llvm/IR/Type.h"
44 : #include "llvm/IR/Value.h"
45 : #include "llvm/MC/MCSchedule.h"
46 : #include "llvm/Support/Casting.h"
47 : #include "llvm/Support/CommandLine.h"
48 : #include "llvm/Support/ErrorHandling.h"
49 : #include "llvm/Support/MachineValueType.h"
50 : #include "llvm/Support/MathExtras.h"
51 : #include <algorithm>
52 : #include <cassert>
53 : #include <cstdint>
54 : #include <limits>
55 : #include <utility>
56 :
57 : namespace llvm {
58 :
59 : class Function;
60 : class GlobalValue;
61 : class LLVMContext;
62 : class ScalarEvolution;
63 : class SCEV;
64 : class TargetMachine;
65 :
66 : extern cl::opt<unsigned> PartialUnrollingThreshold;
67 :
68 : /// Base class which can be used to help build a TTI implementation.
69 : ///
70 : /// This class provides as much implementation of the TTI interface as is
71 : /// possible using the target independent parts of the code generator.
72 : ///
73 : /// In order to subclass it, your class must implement a getST() method to
74 : /// return the subtarget, and a getTLI() method to return the target lowering.
75 : /// We need these methods implemented in the derived class so that this class
76 : /// doesn't have to duplicate storage for them.
77 : template <typename T>
78 : class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
79 : private:
80 : using BaseT = TargetTransformInfoImplCRTPBase<T>;
81 : using TTI = TargetTransformInfo;
82 :
83 : /// Estimate the cost of a shuffle as a sequence of extract and insert
84 : /// operations.
85 13 : unsigned getPermuteShuffleOverhead(Type *Ty) {
86 : assert(Ty->isVectorTy() && "Can only shuffle vectors");
87 : unsigned Cost = 0;
88 : // Shuffle cost is equal to the cost of extracting elements from the arguments
89 : // plus the cost of inserting them into the result vector.
90 :
91 : // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
92 : // index 0 of first vector, index 1 of second vector, index 2 of first
93 : // vector and finally index 3 of second vector and insert them at index
94 : // <0,1,2,3> of result vector.
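// The loop below therefore charges one ExtractElement and one InsertElement
// per element of the result vector.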
95 69 : for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
96 56 : Cost += static_cast<T *>(this)
97 : ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
98 56 : Cost += static_cast<T *>(this)
99 : ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
100 : }
101 13 : return Cost;
102 : }
103 :
104 : /// Local query method delegates up to T which *must* implement this!
105 : const TargetSubtargetInfo *getST() const {
106 8803 : return static_cast<const T *>(this)->getST();
107 : }
108 :
109 : /// Local query method delegates up to T which *must* implement this!
110 : const TargetLoweringBase *getTLI() const {
111 147114 : return static_cast<const T *>(this)->getTLI();
112 : }
113 :
114 : static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
115 : switch (M) {
116 : case TTI::MIM_Unindexed:
117 : return ISD::UNINDEXED;
118 : case TTI::MIM_PreInc:
119 : return ISD::PRE_INC;
120 : case TTI::MIM_PreDec:
121 : return ISD::PRE_DEC;
122 : case TTI::MIM_PostInc:
123 : return ISD::POST_INC;
124 : case TTI::MIM_PostDec:
125 : return ISD::POST_DEC;
126 : }
127 0 : llvm_unreachable("Unexpected MemIndexedMode");
128 : }
129 :
130 : protected:
131 : explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
132 : : BaseT(DL) {}
133 :
134 : using TargetTransformInfoImplBase::DL;
135 :
136 : public:
137 : /// \name Scalar TTI Implementations
138 : /// @{
139 8350 : bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
140 : unsigned BitWidth, unsigned AddressSpace,
141 : unsigned Alignment, bool *Fast) const {
142 8350 : EVT E = EVT::getIntegerVT(Context, BitWidth);
143 8350 : return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast);
144 : }
145 118 :
146 0 : bool hasBranchDivergence() { return false; }
147 :
148 118 : bool isSourceOfDivergence(const Value *V) { return false; }
149 118 :
150 0 : bool isAlwaysUniform(const Value *V) { return false; }
151 8187 :
152 0 : unsigned getFlatAddressSpace() {
153 : // Return an invalid address space.
154 8187 : return -1;
155 8187 : }
156 :
157 : bool isLegalAddImmediate(int64_t imm) {
158 19414 : return getTLI()->isLegalAddImmediate(imm);
159 : }
160 0 :
161 : bool isLegalICmpImmediate(int64_t imm) {
162 28004 : return getTLI()->isLegalICmpImmediate(imm);
163 : }
164 0 :
165 : bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
166 0 : bool HasBaseReg, int64_t Scale,
167 : unsigned AddrSpace, Instruction *I = nullptr) {
168 : TargetLoweringBase::AddrMode AM;
169 708200 : AM.BaseGV = BaseGV;
170 708520 : AM.BaseOffs = BaseOffset;
171 708200 : AM.HasBaseReg = HasBaseReg;
172 708200 : AM.Scale = Scale;
173 708200 : return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
174 544 : }
175 :
176 7637 : bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
177 : const DataLayout &DL) const {
178 7637 : EVT VT = getTLI()->getValueType(DL, Ty);
179 7637 : return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
180 : }
181 45341 :
182 45341 : bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
183 45341 : const DataLayout &DL) const {
184 45341 : EVT VT = getTLI()->getValueType(DL, Ty);
185 45341 : return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
186 : }
187 :
188 0 : bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
189 13954 : return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
190 0 : }
191 0 :
192 : int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
193 0 : bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
194 : TargetLoweringBase::AddrMode AM;
195 111848 : AM.BaseGV = BaseGV;
196 111848 : AM.BaseOffs = BaseOffset;
197 111848 : AM.HasBaseReg = HasBaseReg;
198 111848 : AM.Scale = Scale;
199 111848 : return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
200 0 : }
201 0 :
202 : bool isTruncateFree(Type *Ty1, Type *Ty2) {
203 11678 : return getTLI()->isTruncateFree(Ty1, Ty2);
204 0 : }
205 :
206 0 : bool isProfitableToHoist(Instruction *I) {
207 5821 : return getTLI()->isProfitableToHoist(I);
208 : }
209 0 :
210 552 : bool useAA() const { return getST()->useAA(); }
211 0 :
212 351 : bool isTypeLegal(Type *Ty) {
213 351 : EVT VT = getTLI()->getValueType(DL, Ty);
214 351 : return getTLI()->isTypeLegal(VT);
215 : }
216 0 :
217 0 : int getGEPCost(Type *PointeeType, const Value *Ptr,
218 : ArrayRef<const Value *> Operands) {
219 180855 : return BaseT::getGEPCost(PointeeType, Ptr, Operands);
220 0 : }
221 942 :
222 39587 : int getExtCost(const Instruction *I, const Value *Src) {
223 39587 : if (getTLI()->isExtFree(I))
224 0 : return TargetTransformInfo::TCC_Free;
225 :
226 33778 : if (isa<ZExtInst>(I) || isa<SExtInst>(I))
227 0 : if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
228 11052 : if (getTLI()->isExtLoad(LI, I, DL))
229 11031 : return TargetTransformInfo::TCC_Free;
230 :
231 : return TargetTransformInfo::TCC_Basic;
232 : }
233 444 :
234 444 : unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
235 444 : ArrayRef<const Value *> Arguments) {
236 448 : return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
237 444 : }
238 :
239 642222 : unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
240 : ArrayRef<Type *> ParamTys) {
241 642697 : if (IID == Intrinsic::cttz) {
242 596 : if (getTLI()->isCheapToSpeculateCttz())
243 : return TargetTransformInfo::TCC_Basic;
244 537 : return TargetTransformInfo::TCC_Expensive;
245 0 : }
246 :
247 641626 : if (IID == Intrinsic::ctlz) {
248 1010 : if (getTLI()->isCheapToSpeculateCtlz())
249 : return TargetTransformInfo::TCC_Basic;
250 951 : return TargetTransformInfo::TCC_Expensive;
251 2 : }
252 2 :
253 640616 : return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
254 0 : }
255 0 :
256 7994 : unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
257 : unsigned &JumpTableSize) {
258 2 : /// Try to find the estimated number of clusters. Note that the number of
259 2 : /// clusters identified in this function could be different from the actual
260 2 : /// numbers found in lowering. This function ignores switches that are
261 : /// lowered with a mix of jump table / bit test / BTree. This function was
262 : /// initially intended to be used when estimating the cost of a switch in
263 : /// the inline cost heuristic, but it's a generic cost model to be used in
264 : /// other places (e.g., in loop unrolling).
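/// The checks below try a bit test first and then a jump table; if neither
/// applies, each case is assumed to form its own cluster and N is returned.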
265 34452 : unsigned N = SI.getNumCases();
266 : const TargetLoweringBase *TLI = getTLI();
267 7994 : const DataLayout &DL = this->getDataLayout();
268 169 :
269 8163 : JumpTableSize = 0;
270 7994 : bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
271 :
272 87 : // Early exit if both a jump table and bit test are not allowed.
273 7994 : if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
274 34 : return N;
275 30 :
276 : APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
277 : APInt MinCaseVal = MaxCaseVal;
278 154472 : for (auto CI : SI.cases()) {
279 1 : const APInt &CaseVal = CI.getCaseValue()->getValue();
280 146479 : if (CaseVal.sgt(MaxCaseVal))
281 24276 : MaxCaseVal = CaseVal;
282 146478 : if (CaseVal.slt(MinCaseVal))
283 17736 : MinCaseVal = CaseVal;
284 : }
285 0 :
286 0 : // Check if suitable for a bit test
287 7994 : if (N <= DL.getIndexSizeInBits(0u)) {
288 : SmallPtrSet<const BasicBlock *, 4> Dests;
289 154472 : for (auto I : SI.cases())
290 146646 : Dests.insert(I.getCaseSuccessor());
291 168 :
292 15988 : if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
293 : DL))
294 86 : return 1;
295 : }
296 34 :
297 30 : // Check if suitable for a jump table.
298 7737 : if (IsJTAllowed) {
299 7737 : if (N < 2 || N < TLI->getMinimumJumpTableEntries())
300 3061 : return N;
301 4676 : uint64_t Range =
302 : (MaxCaseVal - MinCaseVal)
303 4676 : .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
304 0 : // Check whether a range of clusters is dense enough for a jump table
305 4676 : if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
306 4592 : JumpTableSize = Range;
307 5400 : return 1;
308 : }
309 808 : }
310 12 : return N;
311 : }
312 0 :
313 0 : unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }
314 :
315 796 : unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }
316 12 :
317 : bool shouldBuildLookupTables() {
318 0 : const TargetLoweringBase *TLI = getTLI();
319 : return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
320 : TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
321 784 : }
322 :
323 56 : bool haveFastSqrt(Type *Ty) {
324 : const TargetLoweringBase *TLI = getTLI();
325 56 : EVT VT = TLI->getValueType(DL, Ty);
326 0 : return TLI->isTypeLegal(VT) &&
327 54 : TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
328 0 : }
329 :
330 0 : bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
331 2 : return true;
332 0 : }
333 :
334 534 : unsigned getFPOpCost(Type *Ty) {
335 : // Check whether FADD is available, as a proxy for floating-point in
336 : // general.
337 2 : const TargetLoweringBase *TLI = getTLI();
338 534 : EVT VT = TLI->getValueType(DL, Ty);
339 1340 : if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
340 474 : return TargetTransformInfo::TCC_Basic;
341 806 : return TargetTransformInfo::TCC_Expensive;
342 12 : }
343 :
344 1652878 : unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
345 : const TargetLoweringBase *TLI = getTLI();
346 1652878 : switch (Opcode) {
347 794 : default: break;
348 13446 : case Instruction::Trunc:
349 13434 : if (TLI->isTruncateFree(OpTy, Ty))
350 13311 : return TargetTransformInfo::TCC_Free;
351 : return TargetTransformInfo::TCC_Basic;
352 0 : case Instruction::ZExt:
353 782 : if (TLI->isZExtFree(OpTy, Ty))
354 0 : return TargetTransformInfo::TCC_Free;
355 : return TargetTransformInfo::TCC_Basic;
356 0 : }
357 :
358 1639444 : return BaseT::getOperationCost(Opcode, Ty, OpTy);
359 : }
360 :
361 0 : unsigned getInliningThresholdMultiplier() { return 1; }
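/// Enables partial and runtime unrolling sized by PartialUnrollingThreshold
/// when that option is set, or otherwise by the target's loop micro-op buffer
/// size, and skips loops containing calls that are actually lowered to calls.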
362 :
363 8251 : void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
364 : TTI::UnrollingPreferences &UP) {
365 : // This unrolling functionality is target independent, but to provide some
366 : // motivation for its intended use, for x86:
367 0 :
368 : // According to the Intel 64 and IA-32 Architectures Optimization Reference
369 0 : // Manual, Intel Core models and later have a loop stream detector (and
370 0 : // associated uop queue) that can benefit from partial unrolling.
371 : // The relevant requirements are:
372 : // - The loop must have no more than 4 (8 for Nehalem and later) branches
373 0 : // taken, and none of them may be calls.
374 0 : // - The loop can have no more than 18 (28 for Nehalem and later) uops.
375 :
376 : // According to the Software Optimization Guide for AMD Family 15h
377 : // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
378 0 : // and loop buffer which can benefit from partial unrolling.
379 : // The relevant requirements are:
380 0 : // - The loop must have fewer than 16 branches
381 0 : // - The loop must have less than 40 uops in all executed loop branches
382 0 :
383 0 : // The number of taken branches in a loop is hard to estimate here, and
384 : // benchmarking has revealed that it is better not to be conservative when
385 : // estimating the branch count. As a result, we'll ignore the branch limits
386 : // until someone finds a case where it matters in practice.
387 0 :
388 : unsigned MaxOps;
389 0 : const TargetSubtargetInfo *ST = getST();
390 8251 : if (PartialUnrollingThreshold.getNumOccurrences() > 0)
391 : MaxOps = PartialUnrollingThreshold;
392 8251 : else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
393 : MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
394 : else
395 : return;
396 :
397 : // Scan the loop: don't unroll loops with calls.
398 20594 : for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
399 0 : ++I) {
400 17600 : BasicBlock *BB = *I;
401 0 :
402 196144 : for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
403 181821 : if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
404 : ImmutableCallSite CS(&*J);
405 0 : if (const Function *F = CS.getCalledFunction()) {
406 48817 : if (!static_cast<T *>(this)->isLoweredToCall(F))
407 0 : continue;
408 : }
409 :
410 : return;
411 : }
412 0 : }
413 :
414 : // Enable runtime and partial unrolling up to the specified size.
415 : // Enable using trip count upper bound to unroll loops.
416 2994 : UP.Partial = UP.Runtime = UP.UpperBound = true;
417 2994 : UP.PartialThreshold = MaxOps;
418 :
419 : // Avoid unrolling when optimizing for size.
420 2994 : UP.OptSizeThreshold = 0;
421 2994 : UP.PartialOptSizeThreshold = 0;
422 :
423 0 : // Set number of instructions optimized when "back edge"
424 : // becomes "fall through" to default value of 2.
425 2994 : UP.BEInsns = 2;
426 0 : }
427 :
428 : int getInstructionLatency(const Instruction *I) {
429 11 : if (isa<LoadInst>(I))
430 0 : return getST()->getSchedModel().DefaultLoadLatency;
431 :
432 10 : return BaseT::getInstructionLatency(I);
433 : }
434 0 :
435 : /// @}
436 0 :
437 0 : /// \name Vector TTI Implementations
438 0 : /// @{
439 0 :
440 893 : unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }
441 :
442 : unsigned getRegisterBitWidth(bool Vector) const { return 32; }
443 0 :
444 : /// Estimate the overhead of scalarizing an instruction. Insert and Extract
445 0 : /// are set if the result needs to be inserted and/or extracted from vectors.
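/// For example, a <4 x i32> with both Insert and Extract set is charged four
/// InsertElement and four ExtractElement costs.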
446 9246 : unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
447 : assert(Ty->isVectorTy() && "Can only scalarize vectors");
448 0 : unsigned Cost = 0;
449 :
450 72316 : for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
451 63070 : if (Insert)
452 37980 : Cost += static_cast<T *>(this)
453 : ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
454 63070 : if (Extract)
455 27829 : Cost += static_cast<T *>(this)
456 0 : ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
457 0 : }
458 :
459 9246 : return Cost;
460 : }
461 0 :
462 0 : /// Estimate the overhead of scalarizing an instruction's unique
463 0 : /// non-constant operands. The types of the arguments are ordinarily
464 : /// scalar, in which case the costs are multiplied by VF.
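/// Only unique non-constant operands are counted; repeated uses of the same
/// value are charged once.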
465 2836 : unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
466 : unsigned VF) {
467 : unsigned Cost = 0;
468 0 : SmallPtrSet<const Value*, 4> UniqueOperands;
469 6992 : for (const Value *A : Args) {
470 4156 : if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
471 : Type *VecTy = nullptr;
472 5240 : if (A->getType()->isVectorTy()) {
473 : VecTy = A->getType();
474 : // If A is a vector operand, VF should be 1 or correspond to A.
475 : assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
476 : "Vector argument does not match VF");
477 : }
478 : else
479 1964 : VecTy = VectorType::get(A->getType(), VF);
480 :
481 2620 : Cost += getScalarizationOverhead(VecTy, false, true);
482 0 : }
483 : }
484 :
485 2836 : return Cost;
486 0 : }
487 :
488 151 : unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
489 : assert(VecTy->isVectorTy());
490 0 :
491 : unsigned Cost = 0;
492 0 :
493 151 : Cost += getScalarizationOverhead(VecTy, true, false);
494 151 : if (!Args.empty())
495 52 : Cost += getOperandsScalarizationOverhead(Args,
496 : VecTy->getVectorNumElements());
497 : else
498 : // When no information on arguments is provided, we add the cost
499 0 : // associated with one argument as a heuristic.
500 99 : Cost += getScalarizationOverhead(VecTy, false, true);
501 0 :
502 151 : return Cost;
503 : }
504 0 :
505 0 : unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
506 :
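/// Default arithmetic cost model: a legal (or promoted) operation costs
/// LT.first * OpCost, where OpCost is 2 for floating point and 1 for integer
/// types; a custom-lowered operation costs twice that; vector operations that
/// must be expanded are scalarized.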
507 202289 : unsigned getArithmeticInstrCost(
508 : unsigned Opcode, Type *Ty,
509 : TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
510 0 : TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
511 0 : TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
512 0 : TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
513 0 : ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
514 : // Check if any of the operands are vector operands.
515 0 : const TargetLoweringBase *TLI = getTLI();
516 202289 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
517 0 : assert(ISD && "Invalid opcode");
518 0 :
519 202289 : std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
520 :
521 : bool IsFloat = Ty->isFPOrFPVectorTy();
522 : // Assume that floating point arithmetic operations cost twice as much as
523 : // integer operations.
524 : unsigned OpCost = (IsFloat ? 2 : 1);
525 0 :
526 202289 : if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
527 0 : // The operation is legal. Assume it costs 1.
528 : // TODO: Once we have extract/insert subvector cost we need to use them.
529 199982 : return LT.first * OpCost;
530 : }
531 :
532 : if (!TLI->isOperationExpand(ISD, LT.second)) {
533 : // If the operation is custom lowered, then assume that the code is twice
534 : // as expensive.
535 48 : return LT.first * 2 * OpCost;
536 : }
537 0 :
538 : // Else, assume that we need to scalarize this op.
539 0 : // TODO: If one of the types get legalized by splitting, handle this
540 : // similarly to what getCastInstrCost() does.
541 2259 : if (Ty->isVectorTy()) {
542 : unsigned Num = Ty->getVectorNumElements();
543 270 : unsigned Cost = static_cast<T *>(this)
544 : ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
545 0 : // Return the cost of multiple scalar invocation plus the cost of
546 : // inserting and extracting the values.
547 135 : return getScalarizationOverhead(Ty, Args) + Num * Cost;
548 : }
549 0 :
550 : // We don't know anything about this scalar instruction.
551 0 : return OpCost;
552 : }
553 :
554 0 : unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
555 0 : Type *SubTp) {
556 0 : switch (Kind) {
557 11 : case TTI::SK_Select:
558 0 : case TTI::SK_Transpose:
559 : case TTI::SK_PermuteSingleSrc:
560 0 : case TTI::SK_PermuteTwoSrc:
561 13 : return getPermuteShuffleOverhead(Tp);
562 : default:
563 : return 1;
564 12 : }
565 : }
566 :
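/// Casts that legalize to no-ops (same-size bitcasts and free truncates,
/// zexts, address space casts, and extends folded into loads) cost 0; legal
/// or promoted casts cost 1; illegal vector casts are split or scalarized,
/// and illegal scalar casts are assumed to cost 4.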
567 5217 : unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
568 12 : const Instruction *I = nullptr) {
569 12 : const TargetLoweringBase *TLI = getTLI();
570 5229 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
571 : assert(ISD && "Invalid opcode");
572 5217 : std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
573 5217 : std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
574 :
575 : // Check for NOOP conversions.
576 5217 : if (SrcLT.first == DstLT.first &&
577 4535 : SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
578 0 :
579 0 : // Bitcast between types that are legalized to the same type are free.
580 2415 : if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
581 : return 0;
582 12 : }
583 :
584 5352 : if (Opcode == Instruction::Trunc &&
585 878 : TLI->isTruncateFree(SrcLT.second, DstLT.second))
586 330 : return 0;
587 12 :
588 5010 : if (Opcode == Instruction::ZExt &&
589 806 : TLI->isZExtFree(SrcLT.second, DstLT.second))
590 26 : return 0;
591 :
592 10159 : if (Opcode == Instruction::AddrSpaceCast &&
593 0 : TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
594 5590 : Dst->getPointerAddressSpace()))
595 : return 0;
596 39 :
597 39 : // If this is a zext/sext of a load, return 0 if the corresponding
598 33 : // extending load exists on target.
599 681 : if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
600 5103 : I && isa<LoadInst>(I->getOperand(0))) {
601 281 : EVT ExtVT = EVT::getEVT(Dst);
602 281 : EVT LoadVT = EVT::getEVT(Src);
603 281 : unsigned LType =
604 : ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
605 562 : if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
606 5805 : return 0;
607 : }
608 314 :
609 : // If the cast is marked as legal (or promote) then assume low cost.
610 8269 : if (SrcLT.first == DstLT.first &&
611 5724 : TLI->isOperationLegalOrPromote(ISD, DstLT.second))
612 2231 : return 1;
613 0 :
614 0 : // Handle scalar conversions.
615 2084 : if (!Src->isVectorTy() && !Dst->isVectorTy()) {
616 0 : // Scalar bitcasts are usually free.
617 621 : if (Opcode == Instruction::BitCast)
618 0 : return 0;
619 :
620 : // Just check the op cost. If the operation is legal then assume it costs
621 : // 1.
622 933 : if (!TLI->isOperationExpand(ISD, DstLT.second))
623 384 : return 1;
624 5276 :
625 : // Assume that illegal scalar instruction are expensive.
626 5511 : return 4;
627 : }
628 39 :
629 39 : // Check vector-to-vector casts.
630 1496 : if (Dst->isVectorTy() && Src->isVectorTy()) {
631 : // If the cast is between same-sized registers, then the check is simple.
632 1463 : if (SrcLT.first == DstLT.first &&
633 843 : SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
634 0 :
635 : // Assume that Zext is done using AND.
636 648 : if (Opcode == Instruction::ZExt)
637 : return 1;
638 5237 :
639 : // Assume that sext is done using SHL and SRA.
640 591 : if (Opcode == Instruction::SExt)
641 0 : return 2;
642 :
643 : // Just check the op cost. If the operation is legal then assume it costs 1,
644 : // multiplied by the type-legalization overhead of the source type.
645 :
646 559 : if (!TLI->isOperationExpand(ISD, DstLT.second))
647 12 : return SrcLT.first * 1;
648 : }
649 :
650 : // If we are legalizing by splitting, query the concrete TTI for the cost
651 : // of casting the original vector twice. We also need to factor in the
652 : // cost of the split itself. Count that as 1, to be consistent with
653 : // TLI->getTypeLegalizationCost().
654 1362 : if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
655 1362 : TargetLowering::TypeSplitVector) ||
656 864 : (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
657 : TargetLowering::TypeSplitVector)) {
658 1372 : Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
659 : Dst->getVectorNumElements() / 2);
660 1372 : Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
661 : Src->getVectorNumElements() / 2);
662 : T *TTI = static_cast<T *>(this);
663 0 : return TTI->getVectorSplitCost() +
664 686 : (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
665 : }
666 :
667 : // In other cases where the source or destination are illegal, assume
668 : // the operation will get scalarized.
669 : unsigned Num = Dst->getVectorNumElements();
670 676 : unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
671 : Opcode, Dst->getScalarType(), Src->getScalarType(), I);
672 :
673 : // Return the cost of multiple scalar invocation plus the cost of
674 : // inserting and extracting the values.
675 676 : return getScalarizationOverhead(Dst, true, true) + Num * Cost;
676 : }
677 :
678 : // We already handled vector-to-vector and scalar-to-scalar conversions.
679 : // This is where we handle bitcasts between vectors and scalars. We need to
680 : // assume that the conversion is scalarized in one way or another, so it is
681 : // charged the corresponding scalarization overhead.
682 0 : if (Opcode == Instruction::BitCast)
683 : // Illegal bitcasts are done by storing and loading from a stack slot.
684 0 : return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
685 : : 0) +
686 0 : (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
687 0 : : 0);
688 :
689 0 : llvm_unreachable("Unhandled cast");
690 : }
691 :
692 8 : unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
693 : VectorType *VecTy, unsigned Index) {
694 : return static_cast<T *>(this)->getVectorInstrCost(
695 8 : Instruction::ExtractElement, VecTy, Index) +
696 : static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
697 8 : VecTy->getElementType());
698 : }
699 :
700 0 : unsigned getCFInstrCost(unsigned Opcode) {
701 : // Branches are assumed to be predicted.
702 0 : return 0;
703 : }
704 0 :
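/// A compare or select whose legalized operation is supported costs LT.first;
/// vector compares and selects that must be expanded are scalarized per
/// element (a select with a vector condition is costed as VSELECT).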
705 8931 : unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
706 0 : const Instruction *I) {
707 : const TargetLoweringBase *TLI = getTLI();
708 8931 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
709 0 : assert(ISD && "Invalid opcode");
710 0 :
711 : // Selects on vectors are actually vector selects.
712 8931 : if (ISD == ISD::SELECT) {
713 207 : assert(CondTy && "CondTy must exist");
714 3346 : if (CondTy->isVectorTy())
715 : ISD = ISD::VSELECT;
716 207 : }
717 8931 : std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
718 :
719 8931 : if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
720 9135 : !TLI->isOperationExpand(ISD, LT.second)) {
721 : // The operation is legal. Assume it costs 1. Multiply
722 119 : // by the type-legalization overhead.
723 7204 : return LT.first * 1;
724 : }
725 207 :
726 85 : // Otherwise, assume that the cast is scalarized.
727 207 : // TODO: If one of the types get legalized by splitting, handle this
728 207 : // similarly to what getCastInstrCost() does.
729 1727 : if (ValTy->isVectorTy()) {
730 296 : unsigned Num = ValTy->getVectorNumElements();
731 2016 : if (CondTy)
732 100 : CondTy = CondTy->getScalarType();
733 1686 : unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
734 211 : Opcode, ValTy->getScalarType(), CondTy, I);
735 115 :
736 : // Return the cost of multiple scalar invocation plus the cost of
737 88 : // inserting and extracting the values.
738 1686 : return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
739 173 : }
740 :
741 176 : // Unknown scalar opcode.
742 : return 1;
743 : }
744 :
745 0 : unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
746 137115 : std::pair<unsigned, MVT> LT =
747 0 : getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
748 :
749 0 : return LT.first;
750 0 : }
751 :
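/// Loads and stores of legal types cost LT.first; a vector access that
/// legalizes to a wider type without a matching extending load or truncating
/// store additionally pays the scalarization overhead.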
752 191 : unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
753 0 : unsigned AddressSpace, const Instruction *I = nullptr) {
754 612 : assert(!Src->isVoidTy() && "Invalid type");
755 191 : std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
756 85 :
757 0 : // Assuming that all loads of legal types cost 1.
758 : unsigned Cost = LT.first;
759 0 :
760 487 : if (Src->isVectorTy() &&
761 316 : Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
762 100 : // This is a vector load that legalizes to a larger type than the vector
763 0 : // itself. Unless the corresponding extending load or truncating store is
764 211 : // legal, then this will scalarize.
765 115 : TargetLowering::LegalizeAction LA = TargetLowering::Expand;
766 23 : EVT MemVT = getTLI()->getValueType(DL, Src);
767 23 : if (Opcode == Instruction::Store)
768 : LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
769 85 : else
770 : LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
771 :
772 23 : if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
773 : // This is a vector load/store for some illegal type that is scalarized.
774 : // We must account for the cost of building or decomposing the vector.
775 38 : Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
776 : Opcode == Instruction::Store);
777 : }
778 : }
779 48 :
780 221 : return Cost;
781 : }
782 60 :
783 20 : unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
784 : unsigned Factor,
785 : ArrayRef<unsigned> Indices,
786 : unsigned Alignment, unsigned AddressSpace,
787 : bool IsMasked = false) {
788 : VectorType *VT = dyn_cast<VectorType>(VecTy);
789 24 : assert(VT && "Expect a vector type for interleaved memory op");
790 :
791 50 : unsigned NumElts = VT->getNumElements();
792 : assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
793 :
794 20 : unsigned NumSubElts = NumElts / Factor;
795 38 : VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
796 :
797 0 : // Firstly, the cost of load/store operation.
798 : unsigned Cost;
799 20 : if (IsMasked)
800 3 : Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
801 0 : Opcode, VecTy, Alignment, AddressSpace);
802 0 : else
803 17 : Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
804 0 : AddressSpace);
805 :
806 : // Legalize the vector type, and get the legalized and unlegalized type
807 : // sizes.
808 20 : MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
809 20 : unsigned VecTySize =
810 20 : static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
811 0 : unsigned VecTyLTSize = VecTyLT.getStoreSize();
812 :
813 0 : // Return the ceiling of dividing A by B.
814 5 : auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
815 :
816 : // Scale the cost of the memory operation by the fraction of legalized
817 0 : // instructions that will actually be used. We shouldn't account for the
818 : // cost of dead instructions since they will be removed.
819 18 : //
820 : // E.g., An interleaved load of factor 8:
821 : // %vec = load <16 x i64>, <16 x i64>* %ptr
822 : // %v0 = shufflevector %vec, undef, <0, 8>
823 48 : //
824 30 : // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
825 : // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
826 60 : // type). The other loads are unused.
827 : //
828 : // We only scale the cost of loads since interleaved store groups aren't
829 : // allowed to have gaps.
830 20 : if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
831 : // The number of loads of a legal type it will take to represent a load
832 : // of the unlegalized vector type.
833 24 : unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);
834 :
835 30 : // The number of elements of the unlegalized type that correspond to a
836 : // single legal instruction.
837 : unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);
838 :
839 18 : // Determine which legal instructions will be used.
840 5 : BitVector UsedInsts(NumLegalInsts, false);
841 14 : for (unsigned Index : Indices)
842 35 : for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
843 26 : UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
844 :
845 : // Scale the cost of the load by the fraction of legal instructions that
846 : // will be used.
847 5 : Cost *= UsedInsts.count() / NumLegalInsts;
848 0 : }
849 0 :
850 : // Then plus the cost of interleave operation.
851 20 : if (Opcode == Instruction::Load) {
852 : // The interleave cost is similar to extract sub vectors' elements
853 : // from the wide vector, and insert them into sub vectors.
854 0 : //
855 : // E.g. An interleaved load of factor 2 (with one member of index 0):
856 0 : // %vec = load <8 x i32>, <8 x i32>* %ptr
857 : // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
858 : // The cost is estimated as extract elements at 0, 2, 4, 6 from the
859 : // <8 x i32> vector and insert them into a <4 x i32> vector.
860 :
861 0 : assert(Indices.size() <= Factor &&
862 : "Interleaved memory op has too many members");
863 :
864 34 : for (unsigned Index : Indices) {
865 : assert(Index < Factor && "Invalid index for interleaved memory op");
866 :
867 : // Extract elements from loaded vector for each sub vector.
868 108 : for (unsigned i = 0; i < NumSubElts; i++)
869 86 : Cost += static_cast<T *>(this)->getVectorInstrCost(
870 86 : Instruction::ExtractElement, VT, Index + i * Factor);
871 : }
872 :
873 0 : unsigned InsSubCost = 0;
874 60 : for (unsigned i = 0; i < NumSubElts; i++)
875 48 : InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
876 : Instruction::InsertElement, SubVT, i);
877 :
878 12 : Cost += Indices.size() * InsSubCost;
879 : } else {
880 0 : // The interleave cost is extract all elements from sub vectors, and
881 : // insert them into the wide vector.
882 : //
883 0 : // E.g. An interleaved store of factor 2:
884 : // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
885 : // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
886 : // The cost is estimated as extract all elements from both <4 x i32>
887 : // vectors and insert into the <8 x i32> vector.
888 :
889 0 : unsigned ExtSubCost = 0;
890 84 : for (unsigned i = 0; i < NumSubElts; i++)
891 76 : ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
892 : Instruction::ExtractElement, SubVT, i);
893 8 : Cost += ExtSubCost * Factor;
894 :
895 166 : for (unsigned i = 0; i < NumElts; i++)
896 158 : Cost += static_cast<T *>(this)
897 0 : ->getVectorInstrCost(Instruction::InsertElement, VT, i);
898 : }
899 :
900 20 : if (!IsMasked)
901 0 : return Cost;
902 :
903 3 : Type *I8Type = Type::getInt8Ty(VT->getContext());
904 3 : VectorType *MaskVT = VectorType::get(I8Type, NumElts);
905 3 : SubVT = VectorType::get(I8Type, NumSubElts);
906 :
907 : // The Mask shuffling cost is extract all the elements of the Mask
908 0 : // and insert each of them Factor times into the wide vector:
909 : //
910 0 : // E.g. an interleaved group with factor 3:
911 0 : // %mask = icmp ult <8 x i32> %vec1, %vec2
912 : // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
913 : // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
914 : // The cost is estimated as extract all mask elements from the <8xi1> mask
915 0 : // vector and insert them factor times into the <24xi1> shuffled mask
916 : // vector.
917 27 : for (unsigned i = 0; i < NumSubElts; i++)
918 24 : Cost += static_cast<T *>(this)->getVectorInstrCost(
919 : Instruction::ExtractElement, SubVT, i);
920 :
921 63 : for (unsigned i = 0; i < NumElts; i++)
922 48 : Cost += static_cast<T *>(this)->getVectorInstrCost(
923 : Instruction::InsertElement, MaskVT, i);
924 16 :
925 : return Cost;
926 16 : }
927 16 :
928 : /// Get intrinsic cost based on arguments.
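/// In the default case, scalar argument types are widened by VF, the
/// scalarization overhead of the operands and return value is computed, and
/// the query is forwarded to the type-based overload; masked gathers and
/// scatters and the vector reduction intrinsics go to their dedicated hooks.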
929 2695 : unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
930 16 : ArrayRef<Value *> Args, FastMathFlags FMF,
931 16 : unsigned VF = 1) {
932 2695 : unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
933 : assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
934 12 :
935 2695 : switch (IID) {
936 : default: {
937 : // Assume that we need to scalarize this intrinsic.
938 18 : SmallVector<Type *, 4> Types;
939 6565 : for (Value *Op : Args) {
940 3944 : Type *OpTy = Op->getType();
941 : assert(VF == 1 || !OpTy->isVectorTy());
942 3958 : Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
943 4 : }
944 2 :
945 2619 : if (VF > 1 && !RetTy->isVoidTy())
946 1095 : RetTy = VectorType::get(RetTy, VF);
947 24 :
948 : // Compute the scalarization overhead based on Args for a vector
949 : // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
950 : // CostModel will pass a vector RetTy and VF is 1.
951 : unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
952 2619 : if (RetVF > 1 || VF > 1) {
953 0 : ScalarizationCost = 0;
954 2240 : if (!RetTy->isVoidTy())
955 2227 : ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
956 2231 : ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
957 0 : }
958 :
959 0 : return static_cast<T *>(this)->
960 2633 : getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
961 : }
962 12 : case Intrinsic::masked_scatter: {
963 : assert(VF == 1 && "Can't vectorize types here.");
964 30 : Value *Mask = Args[3];
965 30 : bool VarMask = !isa<Constant>(Mask);
966 12 : unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
967 : return
968 : static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
969 9 : Args[0]->getType(),
970 : Args[1], VarMask,
971 15 : Alignment);
972 : }
973 37 : case Intrinsic::masked_gather: {
974 : assert(VF == 1 && "Can't vectorize types here.");
975 37 : Value *Mask = Args[2];
976 40 : bool VarMask = !isa<Constant>(Mask);
977 40 : unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
978 : return
979 : static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
980 0 : RetTy, Args[0], VarMask,
981 37 : Alignment);
982 : }
983 27 : case Intrinsic::experimental_vector_reduce_add:
984 6 : case Intrinsic::experimental_vector_reduce_mul:
985 : case Intrinsic::experimental_vector_reduce_and:
986 6 : case Intrinsic::experimental_vector_reduce_or:
987 6 : case Intrinsic::experimental_vector_reduce_xor:
988 : case Intrinsic::experimental_vector_reduce_fadd:
989 : case Intrinsic::experimental_vector_reduce_fmul:
990 6 : case Intrinsic::experimental_vector_reduce_smax:
991 : case Intrinsic::experimental_vector_reduce_smin:
992 : case Intrinsic::experimental_vector_reduce_fmax:
993 : case Intrinsic::experimental_vector_reduce_fmin:
994 6 : case Intrinsic::experimental_vector_reduce_umax:
995 : case Intrinsic::experimental_vector_reduce_umin:
996 54 : return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
997 : }
998 : }
999 :
1000 6 : /// Get intrinsic cost based on argument types.
1001 0 : /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1002 : /// cost of scalarizing the arguments and the return value will be computed
1003 : /// based on types.
1004 2761 : unsigned getIntrinsicInstrCost(
1005 : Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
1006 : unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
1007 : SmallVector<unsigned, 2> ISDs;
1008 6 : unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
1009 2767 : switch (IID) {
1010 395 : default: {
1011 : // Assume that we need to scalarize this intrinsic.
1012 8 : unsigned ScalarizationCost = ScalarizationCostPassed;
1013 393 : unsigned ScalarCalls = 1;
1014 8 : Type *ScalarRetTy = RetTy;
1015 393 : if (RetTy->isVectorTy()) {
1016 7 : if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1017 0 : ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
1018 18 : ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
1019 : ScalarRetTy = RetTy->getScalarType();
1020 : }
1021 : SmallVector<Type *, 4> ScalarTys;
1022 1142 : for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1023 1498 : Type *Ty = Tys[i];
1024 753 : if (Ty->isVectorTy()) {
1025 9 : if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1026 0 : ScalarizationCost += getScalarizationOverhead(Ty, false, true);
1027 18 : ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
1028 9 : Ty = Ty->getScalarType();
1029 2 : }
1030 749 : ScalarTys.push_back(Ty);
1031 : }
1032 393 : if (ScalarCalls == 1)
1033 : return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1034 :
1035 12 : unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
1036 0 : IID, ScalarRetTy, ScalarTys, FMF);
1037 :
1038 7 : return ScalarCalls * ScalarCost + ScalarizationCost;
1039 : }
1040 0 : // Look for intrinsics that can be lowered directly or turned into a scalar
1041 0 : // intrinsic call.
1042 6 : case Intrinsic::sqrt:
1043 6 : ISDs.push_back(ISD::FSQRT);
1044 6 : break;
1045 38 : case Intrinsic::sin:
1046 38 : ISDs.push_back(ISD::FSIN);
1047 38 : break;
1048 38 : case Intrinsic::cos:
1049 38 : ISDs.push_back(ISD::FCOS);
1050 38 : break;
1051 46 : case Intrinsic::exp:
1052 46 : ISDs.push_back(ISD::FEXP);
1053 46 : break;
1054 3 : case Intrinsic::exp2:
1055 3 : ISDs.push_back(ISD::FEXP2);
1056 3 : break;
1057 35 : case Intrinsic::log:
1058 35 : ISDs.push_back(ISD::FLOG);
1059 35 : break;
1060 11 : case Intrinsic::log10:
1061 11 : ISDs.push_back(ISD::FLOG10);
1062 11 : break;
1063 0 : case Intrinsic::log2:
1064 0 : ISDs.push_back(ISD::FLOG2);
1065 0 : break;
1066 174 : case Intrinsic::fabs:
1067 174 : ISDs.push_back(ISD::FABS);
1068 174 : break;
1069 0 : case Intrinsic::canonicalize:
1070 0 : ISDs.push_back(ISD::FCANONICALIZE);
1071 0 : break;
1072 0 : case Intrinsic::minnum:
1073 0 : ISDs.push_back(ISD::FMINNUM);
1074 0 : if (FMF.noNaNs())
1075 0 : ISDs.push_back(ISD::FMINNAN);
1076 : break;
1077 0 : case Intrinsic::maxnum:
1078 0 : ISDs.push_back(ISD::FMAXNUM);
1079 0 : if (FMF.noNaNs())
1080 0 : ISDs.push_back(ISD::FMAXNAN);
1081 0 : break;
1082 162 : case Intrinsic::copysign:
1083 162 : ISDs.push_back(ISD::FCOPYSIGN);
1084 162 : break;
1085 275 : case Intrinsic::floor:
1086 275 : ISDs.push_back(ISD::FFLOOR);
1087 275 : break;
1088 254 : case Intrinsic::ceil:
1089 254 : ISDs.push_back(ISD::FCEIL);
1090 254 : break;
1091 240 : case Intrinsic::trunc:
1092 240 : ISDs.push_back(ISD::FTRUNC);
1093 240 : break;
1094 243 : case Intrinsic::nearbyint:
1095 243 : ISDs.push_back(ISD::FNEARBYINT);
1096 243 : break;
1097 240 : case Intrinsic::rint:
1098 240 : ISDs.push_back(ISD::FRINT);
1099 240 : break;
1100 0 : case Intrinsic::round:
1101 0 : ISDs.push_back(ISD::FROUND);
1102 0 : break;
1103 27 : case Intrinsic::pow:
1104 27 : ISDs.push_back(ISD::FPOW);
1105 27 : break;
1106 426 : case Intrinsic::fma:
1107 426 : ISDs.push_back(ISD::FMA);
1108 426 : break;
1109 2 : case Intrinsic::fmuladd:
1110 2 : ISDs.push_back(ISD::FMA);
1111 2 : break;
1112 : // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1113 : case Intrinsic::lifetime_start:
1114 0 : case Intrinsic::lifetime_end:
1115 : case Intrinsic::sideeffect:
1116 : return 0;
1117 12 : case Intrinsic::masked_store:
1118 0 : return static_cast<T *>(this)
1119 12 : ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
1120 20 : case Intrinsic::masked_load:
1121 : return static_cast<T *>(this)
1122 20 : ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
1123 5 : case Intrinsic::experimental_vector_reduce_add:
1124 0 : return static_cast<T *>(this)->getArithmeticReductionCost(
1125 5 : Instruction::Add, Tys[0], /*IsPairwiseForm=*/false);
1126 0 : case Intrinsic::experimental_vector_reduce_mul:
1127 : return static_cast<T *>(this)->getArithmeticReductionCost(
1128 0 : Instruction::Mul, Tys[0], /*IsPairwiseForm=*/false);
1129 0 : case Intrinsic::experimental_vector_reduce_and:
1130 : return static_cast<T *>(this)->getArithmeticReductionCost(
1131 0 : Instruction::And, Tys[0], /*IsPairwiseForm=*/false);
1132 0 : case Intrinsic::experimental_vector_reduce_or:
1133 0 : return static_cast<T *>(this)->getArithmeticReductionCost(
1134 0 : Instruction::Or, Tys[0], /*IsPairwiseForm=*/false);
1135 0 : case Intrinsic::experimental_vector_reduce_xor:
1136 0 : return static_cast<T *>(this)->getArithmeticReductionCost(
1137 0 : Instruction::Xor, Tys[0], /*IsPairwiseForm=*/false);
1138 0 : case Intrinsic::experimental_vector_reduce_fadd:
1139 : return static_cast<T *>(this)->getArithmeticReductionCost(
1140 0 : Instruction::FAdd, Tys[0], /*IsPairwiseForm=*/false);
1141 0 : case Intrinsic::experimental_vector_reduce_fmul:
1142 0 : return static_cast<T *>(this)->getArithmeticReductionCost(
1143 0 : Instruction::FMul, Tys[0], /*IsPairwiseForm=*/false);
1144 12 : case Intrinsic::experimental_vector_reduce_smax:
1145 : case Intrinsic::experimental_vector_reduce_smin:
1146 : case Intrinsic::experimental_vector_reduce_fmax:
1147 : case Intrinsic::experimental_vector_reduce_fmin:
1148 0 : return static_cast<T *>(this)->getMinMaxReductionCost(
1149 : Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
1150 12 : /*IsSigned=*/true);
1151 10 : case Intrinsic::experimental_vector_reduce_umax:
1152 : case Intrinsic::experimental_vector_reduce_umin:
1153 0 : return static_cast<T *>(this)->getMinMaxReductionCost(
1154 : Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
1155 10 : /*IsSigned=*/false);
1156 84 : case Intrinsic::ctpop:
1157 84 : ISDs.push_back(ISD::CTPOP);
1158 : // In case of legalization use TCC_Expensive. This is cheaper than a
1159 : // library call but still not a cheap instruction.
1160 0 : SingleCallCost = TargetTransformInfo::TCC_Expensive;
1161 84 : break;
1162 0 : // FIXME: ctlz, cttz, ...
1163 : }
1164 0 :
1165 0 : const TargetLoweringBase *TLI = getTLI();
1166 2304 : std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1167 0 :
1168 : SmallVector<unsigned, 2> LegalCost;
1169 12 : SmallVector<unsigned, 2> CustomCost;
1170 4608 : for (unsigned ISD : ISDs) {
1171 2304 : if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1172 873 : if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1173 0 : TLI->isFAbsFree(LT.second)) {
1174 16 : return 0;
1175 16 : }
1176 :
1177 : // The operation is legal. Assume it costs 1.
1178 16 : // If the type is split to multiple registers, assume that there is some
1179 16 : // overhead to this.
1180 : // TODO: Once we have extract/insert subvector cost we need to use them.
1181 857 : if (LT.first > 1)
1182 110 : LegalCost.push_back(LT.first * 2);
1183 : else
1184 759 : LegalCost.push_back(LT.first * 1);
1185 : } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1186 18 : // If the operation is custom lowered then assume
1187 4 : // that the code is twice as expensive.
1188 336 : CustomCost.push_back(LT.first * 2);
1189 : }
1190 16 : }
1191 4 :
1192 2 : auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1193 2304 : if (MinLegalCostI != LegalCost.end())
1194 881 : return *MinLegalCostI;
1195 24 :
1196 : auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
1197 1447 : if (MinCustomCostI != CustomCost.end())
1198 334 : return *MinCustomCostI;
1199 :
1200 : // If we can't lower fmuladd into an FMA estimate the cost as a floating
1201 0 : // point mul followed by an add.
1202 1122 : if (IID == Intrinsic::fmuladd)
1203 0 : return static_cast<T *>(this)
1204 2 : ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
1205 0 : static_cast<T *>(this)
1206 2 : ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
1207 0 :
1208 0 : // Else, assume that we need to scalarize this intrinsic. For math builtins
1209 : // this will emit a costly libcall, adding call overhead and spills. Make it
1210 : // very expensive.
1211 1111 : if (RetTy->isVectorTy()) {
1212 18 : unsigned ScalarizationCost =
1213 18 : ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
1214 415 : ? ScalarizationCostPassed
1215 : : getScalarizationOverhead(RetTy, true, false));
1216 415 : unsigned ScalarCalls = RetTy->getVectorNumElements();
1217 9 : SmallVector<Type *, 4> ScalarTys;
1218 1079 : for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1219 1331 : Type *Ty = Tys[i];
1220 664 : if (Ty->isVectorTy())
1221 664 : Ty = Ty->getScalarType();
1222 664 : ScalarTys.push_back(Ty);
1223 : }
1224 418 : unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
1225 3 : IID, RetTy->getScalarType(), ScalarTys, FMF);
1226 1079 : for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1227 1992 : if (Tys[i]->isVectorTy()) {
1228 664 : if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
1229 0 : ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
1230 1328 : ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
1231 : }
1232 6 : }
1233 :
1234 421 : return ScalarCalls * ScalarCost + ScalarizationCost;
1235 6 : }
1236 :
1237 : // This is going to be turned into a library call, make it expensive.
1238 6 : return SingleCallCost;
1239 : }
1240 :
1241 : /// Compute a cost of the given call instruction.
1242 6 : ///
1243 : /// Compute the cost of calling function F with return type RetTy and
1244 : /// argument types Tys. F might be nullptr, in this case the cost of an
1245 : /// arbitrary call with the specified signature will be returned.
1246 : /// This is used, for instance, when we estimate call of a vector
1247 : /// counterpart of the given function.
1248 6 : /// \param F Called function, might be nullptr.
1249 0 : /// \param RetTy Return value types.
1250 : /// \param Tys Argument types.
1251 : /// \returns The cost of Call instruction.
1252 0 : unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
1253 0 : return 10;
1254 : }
1255 :
1256 6 : unsigned getNumberOfParts(Type *Tp) {
1257 20450 : std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
1258 2 : return LT.first;
1259 : }
1260 8 :
1261 0 : unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
1262 8 : const SCEV *) {
1263 0 : return 0;
1264 : }
1265 :
1266 4 : /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1267 : /// We're assuming that reduction operation are performing the following way:
1268 : /// 1. Non-pairwise reduction
1269 : /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1270 : /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1271 : /// \----------------v-------------/ \----------v------------/
1272 4 : /// n/2 elements n/2 elements
1273 : /// %red1 = op <n x t> %val, <n x t> val1
1274 : /// After this operation we have a vector %red1 where only the first n/2
1275 : /// elements are meaningful, the second n/2 elements are undefined and can be
1276 : /// dropped. All other operations are actually working with the vector of
1277 2 : /// length n/2, not n, though the real vector length is still n.
1278 : /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1279 : /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1280 : /// \----------------v-------------/ \----------v------------/
1281 : /// n/4 elements 3*n/4 elements
1282 : /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
1283 : /// length n/2, the resulting vector has length n/4 etc.
1284 0 : /// 2. Pairwise reduction:
1285 : /// Everything is the same except for an additional shuffle operation which
1286 0 : /// is used to produce operands for the pairwise kind of reductions.
1287 : /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1288 0 : /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1289 0 : /// \-------------v----------/ \----------v------------/
1290 : /// n/2 elements n/2 elements
1291 0 : /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1292 : /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1293 : /// \-------------v----------/ \----------v------------/
1294 0 : /// n/2 elements n/2 elements
1295 : /// %red1 = op <n x t> %val1, <n x t> val2
1296 : /// Again, the operation is performed on <n x t> vector, but the resulting
1297 0 : /// vector %red1 is <n/2 x t> vector.
1298 : ///
1299 0 : /// The cost model should take into account that the actual length of the
1300 : /// vector is reduced on each iteration.
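/// For example, a non-pairwise reduction of <8 x i32> on a target whose
/// widest legal vector is <4 x i32> is modeled as one shuffle+op level at the
/// full width, two more levels at the legal width, plus the element-extraction
/// overhead of the final legal-width vector.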
1301 64 : unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1302 : bool IsPairwise) {
1303 : assert(Ty->isVectorTy() && "Expect a vector type");
1304 64 : Type *ScalarTy = Ty->getVectorElementType();
1305 : unsigned NumVecElts = Ty->getVectorNumElements();
1306 0 : unsigned NumReduxLevels = Log2_32(NumVecElts);
1307 : unsigned ArithCost = 0;
1308 0 : unsigned ShuffleCost = 0;
1309 : auto *ConcreteTTI = static_cast<T *>(this);
1310 64 : std::pair<unsigned, MVT> LT =
1311 64 : ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
1312 : unsigned LongVectorCount = 0;
1313 64 : unsigned MVTLen =
1314 : LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1315 88 : while (NumVecElts > MVTLen) {
1316 24 : NumVecElts /= 2;
1317 : // Assume the pairwise shuffles add a cost.
1318 24 : ShuffleCost += (IsPairwise + 1) *
1319 : ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
1320 : NumVecElts, Ty);
1321 92 : ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
1322 24 : Ty = VectorType::get(ScalarTy, NumVecElts);
1323 24 : ++LongVectorCount;
1324 68 : }
1325 : // The minimal length of the vector is limited by the real length of vector
1326 : // operations performed on the current platform. That's why several final
1327 : // reduction operations are performed on the vectors with the same
1328 49 : // architecture-dependent length.
1329 64 : ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
1330 38 : ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
1331 19 : NumVecElts, Ty);
1332 64 : ArithCost += (NumReduxLevels - LongVectorCount) *
1333 68 : ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
1334 64 : return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
1335 80 : }
1336 61 :
1337 : /// Try to calculate op costs for min/max reduction operations.
1338 12 : /// \param CondTy Conditional type for the Select instruction.
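/// Each reduction level is charged as a shuffle plus a vector compare and a
/// vector select; the final scalarization adds three times the element
/// extraction overhead and one scalar select.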
1339 903 : unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
1340 : bool) {
1341 12 : assert(Ty->isVectorTy() && "Expect a vector type");
1342 866 : Type *ScalarTy = Ty->getVectorElementType();
1343 866 : Type *ScalarCondTy = CondTy->getVectorElementType();
1344 : unsigned NumVecElts = Ty->getVectorNumElements();
1345 0 : unsigned NumReduxLevels = Log2_32(NumVecElts);
1346 : unsigned CmpOpcode;
1347 0 : if (Ty->isFPOrFPVectorTy()) {
1348 : CmpOpcode = Instruction::FCmp;
1349 19 : } else {
1350 : assert(Ty->isIntOrIntVectorTy() &&
1351 : "expecting floating point or integer type for min/max reduction");
1352 19 : CmpOpcode = Instruction::ICmp;
1353 : }
1354 19 : unsigned MinMaxCost = 0;
1355 : unsigned ShuffleCost = 0;
1356 : auto *ConcreteTTI = static_cast<T *>(this);
1357 854 : std::pair<unsigned, MVT> LT =
1358 854 : ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
1359 31 : unsigned LongVectorCount = 0;
1360 854 : unsigned MVTLen =
1361 : LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1362 1737 : while (NumVecElts > MVTLen) {
1363 883 : NumVecElts /= 2;
1364 : // Assume the pairwise shuffles add a cost.
1365 852 : ShuffleCost += (IsPairwise + 1) *
1366 : ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
1367 0 : NumVecElts, Ty);
1368 852 : MinMaxCost +=
1369 852 : ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
1370 : ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
1371 : nullptr);
1372 852 : Ty = VectorType::get(ScalarTy, NumVecElts);
1373 852 : CondTy = VectorType::get(ScalarCondTy, NumVecElts);
1374 852 : ++LongVectorCount;
1375 0 : }
1376 : // The minimal length of the vector is limited by the real length of vector
1377 31 : // reduction operations are performed on the vectors with the same
1378 31 : // reduction opertions are perfomed on the vectors with the same
1379 : // architecture-dependent length.
1380 885 : ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
1381 : ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
1382 44 : NumVecElts, Ty);
1383 867 : MinMaxCost +=
1384 854 : (NumReduxLevels - LongVectorCount) *
1385 867 : (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
1386 0 : ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
1387 : nullptr));
1388 13 : // Need 3 extractelement instructions for scalarization + an additional
1389 13 : // scalar select instruction.
1390 1708 : return ShuffleCost + MinMaxCost +
1391 854 : 3 * getScalarizationOverhead(Ty, /*Insert=*/false,
1392 867 : /*Extract=*/true) +
1393 13 : ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
1394 867 : ScalarCondTy, nullptr);
1395 : }
1396 :
1397 0 : unsigned getVectorSplitCost() { return 1; }
1398 :
1399 49 : /// @}
1400 31 : };
1401 :
1402 49 : /// Concrete BasicTTIImpl that can be used if no further customization
1403 31 : /// is needed.
1404 94858 : class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
1405 31 : using BaseT = BasicTTIImplBase<BasicTTIImpl>;
1406 49 :
1407 : friend class BasicTTIImplBase<BasicTTIImpl>;
1408 19 :
1409 : const TargetSubtargetInfo *ST;
1410 62 : const TargetLoweringBase *TLI;
1411 80 :
1412 31 : const TargetSubtargetInfo *getST() const { return ST; }
1413 49 : const TargetLoweringBase *getTLI() const { return TLI; }
1414 80 :
1415 : public:
1416 : explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
1417 49 : };
1418 :
1419 : } // end namespace llvm
1420 :
1421 : #endif // LLVM_CODEGEN_BASICTTIIMPL_H
|