BasicTTIImpl.h
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/BitVector.h"
33#include "llvm/IR/BasicBlock.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Operator.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Value.h"
52#include <algorithm>
53#include <cassert>
54#include <cstdint>
55#include <limits>
56#include <optional>
57#include <utility>
58
59namespace llvm {
60
61class Function;
62class GlobalValue;
63class LLVMContext;
64class ScalarEvolution;
65class SCEV;
66class TargetMachine;
67
68extern cl::opt<unsigned> PartialUnrollingThreshold;
69
70/// Base class which can be used to help build a TTI implementation.
71///
72/// This class provides as much implementation of the TTI interface as is
73/// possible using the target independent parts of the code generator.
74///
75/// In order to subclass it, your class must implement a getST() method to
76/// return the subtarget, and a getTLI() method to return the target lowering.
77/// We need these methods implemented in the derived class so that this class
78/// doesn't have to duplicate storage for them.
79template <typename T>
80class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
81private:
82 using BaseT = TargetTransformInfoImplCRTPBase<T>;
83 using TTI = TargetTransformInfo;
84
85 /// Helper function to access this as a T.
86 T *thisT() { return static_cast<T *>(this); }
87
88 /// Estimate a cost of Broadcast as an extract and sequence of insert
89 /// operations.
90 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
91 TTI::TargetCostKind CostKind) {
92 InstructionCost Cost = 0;
93 // Broadcast cost is equal to the cost of extracting the zeroth element
94 // plus the cost of inserting it into every element of the result vector.
95 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
96 CostKind, 0, nullptr, nullptr);
97
98 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
99 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
100 CostKind, i, nullptr, nullptr);
101 }
102 return Cost;
103 }
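// Illustrative example (editorial sketch, not part of the upstream header):
// for a hypothetical <4 x float> broadcast, assuming unit costs for the
// vector-instruction queries above, the estimate is
//   1 (extractelement at index 0) + 4 * 1 (insertelement into lanes 0..3) = 5.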
104
105 /// Estimate a cost of shuffle as a sequence of extract and insert
106 /// operations.
107 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
108 TTI::TargetCostKind CostKind) {
109 InstructionCost Cost = 0;
110 // Shuffle cost is equal to the cost of extracting elements from the
111 // arguments plus the cost of inserting them into the result vector.
112
113 // E.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
114 // index 0 of the first vector, index 1 of the second vector, index 2 of
115 // the first vector, and finally index 3 of the second vector, and insert
116 // them at indices <0,1,2,3> of the result vector.
117 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
118 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
119 CostKind, i, nullptr, nullptr);
120 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
121 CostKind, i, nullptr, nullptr);
122 }
123 return Cost;
124 }
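// Illustrative example (editorial sketch): for the <0,5,2,7> mask mentioned
// above on <4 x float> inputs, assuming unit costs, the estimate is
//   4 extracts + 4 inserts = 8.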
125
126 /// Estimate a cost of subvector extraction as a sequence of extract and
127 /// insert operations.
128 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
129 TTI::TargetCostKind CostKind,
130 int Index,
131 FixedVectorType *SubVTy) {
132 assert(VTy && SubVTy &&
133 "Can only extract subvectors from vectors");
134 int NumSubElts = SubVTy->getNumElements();
135 assert((!isa<FixedVectorType>(VTy) ||
136 (Index + NumSubElts) <=
137 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
138 "SK_ExtractSubvector index out of range");
139
140 InstructionCost Cost = 0;
141 // Subvector extraction cost is equal to the cost of extracting elements
142 // from the source type plus the cost of inserting them into the result
143 // vector type.
144 for (int i = 0; i != NumSubElts; ++i) {
145 Cost +=
146 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
147 CostKind, i + Index, nullptr, nullptr);
148 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
149 CostKind, i, nullptr, nullptr);
150 }
151 return Cost;
152 }
153
154 /// Estimate a cost of subvector insertion as a sequence of extract and
155 /// insert operations.
156 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
157 TTI::TargetCostKind CostKind,
158 int Index,
159 FixedVectorType *SubVTy) {
160 assert(VTy && SubVTy &&
161 "Can only insert subvectors into vectors");
162 int NumSubElts = SubVTy->getNumElements();
163 assert((!isa<FixedVectorType>(VTy) ||
164 (Index + NumSubElts) <=
165 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
166 "SK_InsertSubvector index out of range");
167
168 InstructionCost Cost = 0;
169 // Subvector insertion cost is equal to the cost of extracting elements
170 // from the subvector type plus the cost of inserting them into the result
171 // vector type.
172 for (int i = 0; i != NumSubElts; ++i) {
173 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
174 CostKind, i, nullptr, nullptr);
175 Cost +=
176 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
177 i + Index, nullptr, nullptr);
178 }
179 return Cost;
180 }
181
182 /// Local query method delegates up to T which *must* implement this!
183 const TargetSubtargetInfo *getST() const {
184 return static_cast<const T *>(this)->getST();
185 }
186
187 /// Local query method delegates up to T which *must* implement this!
188 const TargetLoweringBase *getTLI() const {
189 return static_cast<const T *>(this)->getTLI();
190 }
191
192 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
193 switch (M) {
194 case TTI::MIM_Unindexed:
195 return ISD::UNINDEXED;
196 case TTI::MIM_PreInc:
197 return ISD::PRE_INC;
198 case TTI::MIM_PreDec:
199 return ISD::PRE_DEC;
200 case TTI::MIM_PostInc:
201 return ISD::POST_INC;
202 case TTI::MIM_PostDec:
203 return ISD::POST_DEC;
204 }
205 llvm_unreachable("Unexpected MemIndexedMode");
206 }
207
208 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
209 Align Alignment,
210 bool VariableMask,
211 bool IsGatherScatter,
212 TTI::TargetCostKind CostKind,
213 unsigned AddressSpace = 0) {
214 // We cannot scalarize scalable vectors, so return Invalid.
215 if (isa<ScalableVectorType>(DataTy))
216 return InstructionCost::getInvalid();
217
218 auto *VT = cast<FixedVectorType>(DataTy);
219 unsigned VF = VT->getNumElements();
220
221 // Assume the target does not have support for gather/scatter operations
222 // and provide a rough estimate.
223 //
224 // First, compute the cost of the individual memory operations.
225 InstructionCost AddrExtractCost =
226 IsGatherScatter
227 ? getScalarizationOverhead(
228 FixedVectorType::get(
229 PointerType::get(VT->getElementType(), 0), VF),
230 /*Insert=*/false, /*Extract=*/true, CostKind)
231 : 0;
232
233 // The cost of the scalar loads/stores.
234 InstructionCost MemoryOpCost =
235 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
236 AddressSpace, CostKind);
237
238 // Next, compute the cost of packing the result in a vector.
239 InstructionCost PackingCost =
240 getScalarizationOverhead(VT, Opcode != Instruction::Store,
241 Opcode == Instruction::Store, CostKind);
242
243 InstructionCost ConditionalCost = 0;
244 if (VariableMask) {
245 // Compute the cost of conditionally executing the memory operations with
246 // variable masks. This includes extracting the individual conditions, the
247 // branches, and the PHIs needed to combine the results.
248 // NOTE: Estimating the cost of conditionally executing the memory
249 // operations accurately is quite difficult and the current solution
250 // provides a very rough estimate only.
251 ConditionalCost =
252 getScalarizationOverhead(
253 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
254 /*Insert=*/false, /*Extract=*/true, CostKind) +
255 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
256 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
257 }
258
259 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
260 }
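// Illustrative breakdown (editorial sketch, assuming unit costs): a masked
// gather of <4 x i32> with a variable mask is modelled roughly as
//   AddrExtractCost : 4 pointer extracts
//   MemoryOpCost    : 4 scalar i32 loads
//   PackingCost     : 4 inserts to rebuild the result vector
//   ConditionalCost : 4 mask extracts + 4 * (br + phi)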
261
262protected:
263 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
264 : BaseT(DL) {}
265 virtual ~BasicTTIImplBase() = default;
266
268
269public:
270 /// \name Scalar TTI Implementations
271 /// @{
272 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
273 unsigned AddressSpace, Align Alignment,
274 unsigned *Fast) const {
275 EVT E = EVT::getIntegerVT(Context, BitWidth);
276 return getTLI()->allowsMisalignedMemoryAccesses(
277 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
278 }
279
280 bool areInlineCompatible(const Function *Caller,
281 const Function *Callee) const {
282 const TargetMachine &TM = getTLI()->getTargetMachine();
283
284 const FeatureBitset &CallerBits =
285 TM.getSubtargetImpl(*Caller)->getFeatureBits();
286 const FeatureBitset &CalleeBits =
287 TM.getSubtargetImpl(*Callee)->getFeatureBits();
288
289 // Inline a callee if its target-features are a subset of the caller's
290 // target-features.
291 return (CallerBits & CalleeBits) == CalleeBits;
292 }
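// Illustrative example (editorial sketch): with hypothetical feature sets
// Caller = {+sse2, +avx} and Callee = {+sse2}, CalleeBits is a subset of
// CallerBits, so (CallerBits & CalleeBits) == CalleeBits and inlining is
// considered compatible; the reverse direction would not be.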
293
294 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
295
296 bool isSourceOfDivergence(const Value *V) { return false; }
297
298 bool isAlwaysUniform(const Value *V) { return false; }
299
300 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
301 return false;
302 }
303
304 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
305 return true;
306 }
307
307
308 unsigned getFlatAddressSpace() {
309 // Return an invalid address space.
310 return -1;
311 }
312
313 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
314 Intrinsic::ID IID) const {
315 return false;
316 }
317
318 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
319 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
320 }
321
322 unsigned getAssumedAddrSpace(const Value *V) const {
323 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
324 }
325
326 bool isSingleThreaded() const {
327 return getTLI()->getTargetMachine().Options.ThreadModel ==
328 ThreadModel::Single;
329 }
330
331 std::pair<const Value *, unsigned>
332 getPredicatedAddrSpace(const Value *V) const {
333 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
334 }
335
336 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
337 Value *NewV) const {
338 return nullptr;
339 }
340
341 bool isLegalAddImmediate(int64_t imm) {
342 return getTLI()->isLegalAddImmediate(imm);
343 }
344
345 bool isLegalAddScalableImmediate(int64_t Imm) {
346 return getTLI()->isLegalAddScalableImmediate(Imm);
347 }
348
349 bool isLegalICmpImmediate(int64_t imm) {
350 return getTLI()->isLegalICmpImmediate(imm);
351 }
352
353 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
354 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
355 Instruction *I = nullptr,
356 int64_t ScalableOffset = 0) {
357 TargetLoweringBase::AddrMode AM;
358 AM.BaseGV = BaseGV;
359 AM.BaseOffs = BaseOffset;
360 AM.HasBaseReg = HasBaseReg;
361 AM.Scale = Scale;
362 AM.ScalableOffset = ScalableOffset;
363 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
364 }
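// Illustrative example (editorial sketch): a query for an access of the form
// BaseReg + 4 * IndexReg + 16 maps to AM = {BaseGV=nullptr, BaseOffs=16,
// HasBaseReg=true, Scale=4}; the target's TargetLowering then decides whether
// that addressing mode is natively supported for the given type.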
365
366 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
367 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
368 }
369
370 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
371 Type *ScalarValTy) const {
372 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
373 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
374 EVT VT = getTLI()->getValueType(DL, SrcTy);
375 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
376 getTLI()->isOperationCustom(ISD::STORE, VT))
377 return true;
378
379 EVT ValVT =
380 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
381 EVT LegalizedVT =
382 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
383 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
384 };
385 while (VF > 2 && IsSupportedByTarget(VF))
386 VF /= 2;
387 return VF;
388 }
389
390 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
391 const DataLayout &DL) const {
392 EVT VT = getTLI()->getValueType(DL, Ty);
393 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
394 }
395
396 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
397 const DataLayout &DL) const {
398 EVT VT = getTLI()->getValueType(DL, Ty);
399 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
400 }
401
404 }
405
408 }
409
412 }
413
416 }
417
418 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
419 StackOffset BaseOffset, bool HasBaseReg,
420 int64_t Scale, unsigned AddrSpace) {
421 TargetLoweringBase::AddrMode AM;
422 AM.BaseGV = BaseGV;
423 AM.BaseOffs = BaseOffset.getFixed();
424 AM.HasBaseReg = HasBaseReg;
425 AM.Scale = Scale;
426 AM.ScalableOffset = BaseOffset.getScalable();
427 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
428 return 0;
429 return -1;
430 }
431
432 bool isTruncateFree(Type *Ty1, Type *Ty2) {
433 return getTLI()->isTruncateFree(Ty1, Ty2);
434 }
435
436 bool isProfitableToHoist(Instruction *I) {
437 return getTLI()->isProfitableToHoist(I);
438 }
439
440 bool useAA() const { return getST()->useAA(); }
441
442 bool isTypeLegal(Type *Ty) {
443 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
444 return getTLI()->isTypeLegal(VT);
445 }
446
447 unsigned getRegUsageForType(Type *Ty) {
448 EVT ETy = getTLI()->getValueType(DL, Ty);
449 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
450 }
451
452 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
453 ArrayRef<const Value *> Operands, Type *AccessType,
454 TTI::TargetCostKind CostKind) {
455 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
456 }
457
458 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
459 unsigned &JumpTableSize,
460 ProfileSummaryInfo *PSI,
461 BlockFrequencyInfo *BFI) {
462 /// Try to find the estimated number of clusters. Note that the number of
463 /// clusters identified in this function could be different from the actual
464 /// numbers found in lowering. This function ignores switches that are
465 /// lowered with a mix of jump table / bit test / BTree. This function was
466 /// initially intended to be used when estimating the cost of a switch in
467 /// the inline cost heuristic, but it's a generic cost model used in other
468 /// places (e.g., in loop unrolling).
469 unsigned N = SI.getNumCases();
470 const TargetLoweringBase *TLI = getTLI();
471 const DataLayout &DL = this->getDataLayout();
472
473 JumpTableSize = 0;
474 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
475
476 // Early exit if both a jump table and bit test are not allowed.
477 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
478 return N;
479
480 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
481 APInt MinCaseVal = MaxCaseVal;
482 for (auto CI : SI.cases()) {
483 const APInt &CaseVal = CI.getCaseValue()->getValue();
484 if (CaseVal.sgt(MaxCaseVal))
485 MaxCaseVal = CaseVal;
486 if (CaseVal.slt(MinCaseVal))
487 MinCaseVal = CaseVal;
488 }
489
490 // Check if suitable for a bit test
491 if (N <= DL.getIndexSizeInBits(0u)) {
493 for (auto I : SI.cases())
494 Dests.insert(I.getCaseSuccessor());
495
496 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
497 DL))
498 return 1;
499 }
500
501 // Check if suitable for a jump table.
502 if (IsJTAllowed) {
503 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
504 return N;
506 (MaxCaseVal - MinCaseVal)
507 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
508 // Check whether a range of clusters is dense enough for a jump table
509 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
510 JumpTableSize = Range;
511 return 1;
512 }
513 }
514 return N;
515 }
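// Illustrative example (editorial sketch): a switch with cases {0,1,...,9}
// and distinct destinations has Range = 10; if the target allows jump tables
// and considers that range dense enough, this returns 1 cluster and sets
// JumpTableSize = 10, otherwise it falls back to returning N = 10.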
516
517 bool shouldBuildLookupTables() {
518 const TargetLoweringBase *TLI = getTLI();
519 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
520 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
521 }
522
523 bool shouldBuildRelLookupTables() const {
524 const TargetMachine &TM = getTLI()->getTargetMachine();
525 // If non-PIC mode, do not generate a relative lookup table.
526 if (!TM.isPositionIndependent())
527 return false;
528
529 /// Relative lookup table entries consist of 32-bit offsets.
530 /// Do not generate relative lookup tables for large code models
531 /// in 64-bit architectures where 32-bit offsets might not be enough.
532 if (TM.getCodeModel() == CodeModel::Medium ||
533 TM.getCodeModel() == CodeModel::Large)
534 return false;
535
536 const Triple &TargetTriple = TM.getTargetTriple();
537 if (!TargetTriple.isArch64Bit())
538 return false;
539
540 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
541 // there.
542 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
543 return false;
544
545 return true;
546 }
547
548 bool haveFastSqrt(Type *Ty) {
549 const TargetLoweringBase *TLI = getTLI();
550 EVT VT = TLI->getValueType(DL, Ty);
551 return TLI->isTypeLegal(VT) &&
552 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
553 }
554
556 return true;
557 }
558
560 // Check whether FADD is available, as a proxy for floating-point in
561 // general.
562 const TargetLoweringBase *TLI = getTLI();
563 EVT VT = TLI->getValueType(DL, Ty);
567 }
568
570 const Function &Fn) const {
571 switch (Inst.getOpcode()) {
572 default:
573 break;
574 case Instruction::SDiv:
575 case Instruction::SRem:
576 case Instruction::UDiv:
577 case Instruction::URem: {
578 if (!isa<ConstantInt>(Inst.getOperand(1)))
579 return false;
580 EVT VT = getTLI()->getValueType(DL, Inst.getType());
581 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
582 }
583 };
584
585 return false;
586 }
587
588 unsigned getInliningThresholdMultiplier() const { return 1; }
589 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
590 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
591 return 0;
592 }
593
594 int getInlinerVectorBonusPercent() const { return 150; }
595
596 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
597 TTI::UnrollingPreferences &UP,
598 OptimizationRemarkEmitter *ORE) {
599 // This unrolling functionality is target independent, but to provide some
600 // motivation for its intended use, for x86:
601
602 // According to the Intel 64 and IA-32 Architectures Optimization Reference
603 // Manual, Intel Core models and later have a loop stream detector (and
604 // associated uop queue) that can benefit from partial unrolling.
605 // The relevant requirements are:
606 // - The loop must have no more than 4 (8 for Nehalem and later) branches
607 // taken, and none of them may be calls.
608 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
609
610 // According to the Software Optimization Guide for AMD Family 15h
611 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
612 // and loop buffer which can benefit from partial unrolling.
613 // The relevant requirements are:
614 // - The loop must have fewer than 16 branches
615 // - The loop must have less than 40 uops in all executed loop branches
616
617 // The number of taken branches in a loop is hard to estimate here, and
618 // benchmarking has revealed that it is better not to be conservative when
619 // estimating the branch count. As a result, we'll ignore the branch limits
620 // until someone finds a case where it matters in practice.
621
622 unsigned MaxOps;
623 const TargetSubtargetInfo *ST = getST();
624 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
625 MaxOps = PartialUnrollingThreshold;
626 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
627 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
628 else
629 return;
630
631 // Scan the loop: don't unroll loops with calls.
632 for (BasicBlock *BB : L->blocks()) {
633 for (Instruction &I : *BB) {
634 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
635 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
636 if (!thisT()->isLoweredToCall(F))
637 continue;
638 }
639
640 if (ORE) {
641 ORE->emit([&]() {
642 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
643 L->getHeader())
644 << "advising against unrolling the loop because it "
645 "contains a "
646 << ore::NV("Call", &I);
647 });
648 }
649 return;
650 }
651 }
652 }
653
654 // Enable runtime and partial unrolling up to the specified size.
655 // Enable using trip count upper bound to unroll loops.
656 UP.Partial = UP.Runtime = UP.UpperBound = true;
657 UP.PartialThreshold = MaxOps;
658
659 // Avoid unrolling when optimizing for size.
660 UP.OptSizeThreshold = 0;
661 UP.PartialOptSizeThreshold = 0;
662
663 // Set number of instructions optimized when "back edge"
664 // becomes "fall through" to default value of 2.
665 UP.BEInsns = 2;
666 }
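// Illustrative example (editorial sketch): on a hypothetical subtarget whose
// scheduling model reports LoopMicroOpBufferSize = 28, and for a call-free
// loop, the code above sets UP.Partial = UP.Runtime = UP.UpperBound = true
// and UP.PartialThreshold = 28, so partial unrolling is capped at 28 ops.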
667
668 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
669 TTI::PeelingPreferences &PP) {
670 PP.PeelCount = 0;
671 PP.AllowPeeling = true;
672 PP.AllowLoopNestsPeeling = false;
673 PP.PeelProfiledIterations = true;
674 }
675
676 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
677 AssumptionCache &AC,
678 TargetLibraryInfo *LibInfo,
679 HardwareLoopInfo &HWLoopInfo) {
680 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
681 }
682
685 }
686
689 }
690
691 TailFoldingStyle
692 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
693 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
694 }
695
696 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
697 IntrinsicInst &II) {
698 return BaseT::instCombineIntrinsic(IC, II);
699 }
700
701 std::optional<Value *>
702 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
703 APInt DemandedMask, KnownBits &Known,
704 bool &KnownBitsComputed) {
705 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
706 KnownBitsComputed);
707 }
708
709 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
710 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
711 APInt &UndefElts2, APInt &UndefElts3,
712 std::function<void(Instruction *, unsigned, APInt, APInt &)>
713 SimplifyAndSetOp) {
714 return BaseT::simplifyDemandedVectorEltsIntrinsic(
715 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
716 SimplifyAndSetOp);
717 }
718
719 virtual std::optional<unsigned>
720 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
721 return std::optional<unsigned>(
722 getST()->getCacheSize(static_cast<unsigned>(Level)));
723 }
724
725 virtual std::optional<unsigned>
726 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
727 std::optional<unsigned> TargetResult =
728 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
729
730 if (TargetResult)
731 return TargetResult;
732
733 return BaseT::getCacheAssociativity(Level);
734 }
735
736 virtual unsigned getCacheLineSize() const {
737 return getST()->getCacheLineSize();
738 }
739
740 virtual unsigned getPrefetchDistance() const {
741 return getST()->getPrefetchDistance();
742 }
743
744 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
745 unsigned NumStridedMemAccesses,
746 unsigned NumPrefetches,
747 bool HasCall) const {
748 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
749 NumPrefetches, HasCall);
750 }
751
752 virtual unsigned getMaxPrefetchIterationsAhead() const {
753 return getST()->getMaxPrefetchIterationsAhead();
754 }
755
756 virtual bool enableWritePrefetching() const {
757 return getST()->enableWritePrefetching();
758 }
759
760 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
761 return getST()->shouldPrefetchAddressSpace(AS);
762 }
763
764 /// @}
765
766 /// \name Vector TTI Implementations
767 /// @{
768
769 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
770 return TypeSize::getFixed(32);
771 }
772
773 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
774 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
775 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
776
777 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
778 /// are set if the demanded result elements need to be inserted and/or
779 /// extracted from vectors.
780 InstructionCost getScalarizationOverhead(VectorType *InTy,
781 const APInt &DemandedElts,
782 bool Insert, bool Extract,
783 TTI::TargetCostKind CostKind,
784 ArrayRef<Value *> VL = {}) {
785 /// FIXME: a bitfield is not a reasonable abstraction for talking about
786 /// which elements are needed from a scalable vector
787 if (isa<ScalableVectorType>(InTy))
788 return InstructionCost::getInvalid();
789 auto *Ty = cast<FixedVectorType>(InTy);
790
791 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
792 (VL.empty() || VL.size() == Ty->getNumElements()) &&
793 "Vector size mismatch");
794
795 InstructionCost Cost = 0;
796
797 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
798 if (!DemandedElts[i])
799 continue;
800 if (Insert) {
801 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
802 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
803 CostKind, i, nullptr, InsertedVal);
804 }
805 if (Extract)
806 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
807 CostKind, i, nullptr, nullptr);
808 }
809
810 return Cost;
811 }
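// Illustrative example (editorial sketch): for <4 x i32> with
// DemandedElts = 0b0101 (lanes 0 and 2 demanded), Insert = true and
// Extract = true, the loop above charges one insertelement and one
// extractelement per demanded lane, i.e. 2 + 2 = 4 with unit costs.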
812
814 return false;
815 }
816
818 unsigned ScalarOpdIdx) const {
819 return false;
820 }
821
823 int OpdIdx) const {
824 return OpdIdx == -1;
825 }
826
828 int RetIdx) const {
829 return RetIdx == 0;
830 }
831
832 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
833 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
834 bool Extract,
835 TTI::TargetCostKind CostKind) {
836 if (isa<ScalableVectorType>(InTy))
837 return InstructionCost::getInvalid();
838 auto *Ty = cast<FixedVectorType>(InTy);
839
840 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
841 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
842 CostKind);
843 }
844
845 /// Estimate the overhead of scalarizing an instruction's unique
846 /// non-constant operands. The (potentially vector) types to use for each
847 /// argument are passed via Tys.
848 InstructionCost
849 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
850 ArrayRef<Type *> Tys,
851 TTI::TargetCostKind CostKind) {
852 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
853
854 InstructionCost Cost = 0;
855 SmallPtrSet<const Value*, 4> UniqueOperands;
856 for (int I = 0, E = Args.size(); I != E; I++) {
857 // Disregard things like metadata arguments.
858 const Value *A = Args[I];
859 Type *Ty = Tys[I];
860 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
861 !Ty->isPtrOrPtrVectorTy())
862 continue;
863
864 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
865 if (auto *VecTy = dyn_cast<VectorType>(Ty))
866 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
867 /*Extract*/ true, CostKind);
868 }
869 }
870
871 return Cost;
872 }
873
874 /// Estimate the overhead of scalarizing the inputs and outputs of an
875 /// instruction, with return type RetTy and arguments Args of type Tys. If
876 /// Args are unknown (empty), then the cost associated with one argument is
877 /// added as a heuristic.
878 InstructionCost getScalarizationOverhead(VectorType *RetTy,
879 ArrayRef<const Value *> Args,
880 ArrayRef<Type *> Tys,
881 TTI::TargetCostKind CostKind) {
882 InstructionCost Cost = getScalarizationOverhead(
883 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
884 if (!Args.empty())
885 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
886 else
887 // When no information on arguments is provided, we add the cost
888 // associated with one argument as a heuristic.
889 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
890 /*Extract*/ true, CostKind);
891
892 return Cost;
893 }
894
895 /// Estimate the cost of type-legalization and the legalized type.
896 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
897 LLVMContext &C = Ty->getContext();
898 EVT MTy = getTLI()->getValueType(DL, Ty);
899
900 InstructionCost Cost = 1;
901 // We keep legalizing the type until we find a legal kind. We assume that
902 // the only operation that costs anything is the split. After splitting
903 // we need to handle two types.
904 while (true) {
905 LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
906
907 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
908 // Ensure we return a sensible simple VT here, since many callers of
909 // this function require it.
910 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
911 return std::make_pair(InstructionCost::getInvalid(), VT);
912 }
913
914 if (LK.first == TargetLoweringBase::TypeLegal)
915 return std::make_pair(Cost, MTy.getSimpleVT());
916
917 if (LK.first == TargetLoweringBase::TypeSplitVector ||
918 LK.first == TargetLoweringBase::TypeExpandInteger)
919 Cost *= 2;
920
921 // Do not loop with f128 type.
922 if (MTy == LK.second)
923 return std::make_pair(Cost, MTy.getSimpleVT());
924
925 // Keep legalizing the type.
926 MTy = LK.second;
927 }
928 }
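// Illustrative example (editorial sketch): on a hypothetical target with
// 128-bit vector registers, <8 x i64> is split twice (v8i64 -> 2 x v4i64 ->
// 4 x v2i64), so this returns {Cost = 4, MVT = v2i64}; a type that is already
// legal returns {Cost = 1, its simple MVT}.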
929
930 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
931
932 InstructionCost getArithmeticInstrCost(
933 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
934 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
935 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
936 ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) {
937 // Check if any of the operands are vector operands.
938 const TargetLoweringBase *TLI = getTLI();
939 int ISD = TLI->InstructionOpcodeToISD(Opcode);
940 assert(ISD && "Invalid opcode");
941
942 // TODO: Handle more cost kinds.
943 if (CostKind != TTI::TCK_RecipThroughput)
944 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
945 Opd1Info, Opd2Info,
946 Args, CxtI);
947
948 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
949
950 bool IsFloat = Ty->isFPOrFPVectorTy();
951 // Assume that floating point arithmetic operations cost twice as much as
952 // integer operations.
953 InstructionCost OpCost = (IsFloat ? 2 : 1);
954
955 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
956 // The operation is legal. Assume it costs 1.
957 // TODO: Once we have extract/insert subvector cost we need to use them.
958 return LT.first * OpCost;
959 }
960
961 if (!TLI->isOperationExpand(ISD, LT.second)) {
962 // If the operation is custom lowered, then assume that the code is twice
963 // as expensive.
964 return LT.first * 2 * OpCost;
965 }
966
967 // An 'Expand' of URem and SRem is special because it may default
968 // to expanding the operation into a sequence of sub-operations
969 // i.e. X % Y -> X-(X/Y)*Y.
970 if (ISD == ISD::UREM || ISD == ISD::SREM) {
971 bool IsSigned = ISD == ISD::SREM;
972 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
973 LT.second) ||
974 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
975 LT.second)) {
976 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
977 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
978 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
979 InstructionCost MulCost =
980 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
981 InstructionCost SubCost =
982 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
983 return DivCost + MulCost + SubCost;
984 }
985 }
986
987 // We cannot scalarize scalable vectors, so return Invalid.
988 if (isa<ScalableVectorType>(Ty))
989 return InstructionCost::getInvalid();
990
991 // Else, assume that we need to scalarize this op.
992 // TODO: If one of the types get legalized by splitting, handle this
993 // similarly to what getCastInstrCost() does.
994 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
995 InstructionCost Cost = thisT()->getArithmeticInstrCost(
996 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
997 Args, CxtI);
998 // Return the cost of multiple scalar invocation plus the cost of
999 // inserting and extracting the values.
1000 SmallVector<Type *> Tys(Args.size(), Ty);
1001 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1002 VTy->getNumElements() * Cost;
1003 }
1004
1005 // We don't know anything about this scalar instruction.
1006 return OpCost;
1007 }
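// Illustrative example (editorial sketch): for a urem whose legalized type
// only supports udiv, the expansion path above models X % Y as
// X - (X / Y) * Y, i.e. DivCost + MulCost + SubCost; with unit costs that is
// an estimate of 3 rather than the single-instruction cost of 1.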
1008
1010 ArrayRef<int> Mask,
1011 VectorType *Ty, int &Index,
1012 VectorType *&SubTy) const {
1013 if (Mask.empty())
1014 return Kind;
1015 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
1016 switch (Kind) {
1018 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
1019 return TTI::SK_Reverse;
1020 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
1021 return TTI::SK_Broadcast;
1022 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
1023 (Index + Mask.size()) <= (size_t)NumSrcElts) {
1024 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
1026 }
1027 break;
1028 case TTI::SK_PermuteTwoSrc: {
1029 int NumSubElts;
1030 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
1031 Mask, NumSrcElts, NumSubElts, Index)) {
1032 if (Index + NumSubElts > NumSrcElts)
1033 return Kind;
1034 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
1036 }
1037 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1038 return TTI::SK_Select;
1039 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1040 return TTI::SK_Transpose;
1041 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1042 return TTI::SK_Splice;
1043 break;
1044 }
1045 case TTI::SK_Select:
1046 case TTI::SK_Reverse:
1047 case TTI::SK_Broadcast:
1048 case TTI::SK_Transpose:
1051 case TTI::SK_Splice:
1052 break;
1053 }
1054 return Kind;
1055 }
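// Illustrative examples (editorial sketch) of how masks are reclassified for
// a <4 x i32> source: <3,2,1,0> -> SK_Reverse, <0,0,0,0> -> SK_Broadcast,
// <2,3> -> SK_ExtractSubvector at Index 2, and <0,4,2,6> -> SK_Transpose,
// which lets targets price these cheaper than a generic permute.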
1056
1058 ArrayRef<int> Mask,
1060 VectorType *SubTp,
1061 ArrayRef<const Value *> Args = {},
1062 const Instruction *CxtI = nullptr) {
1063 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1064 case TTI::SK_Broadcast:
1065 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1066 return getBroadcastShuffleOverhead(FVT, CostKind);
1068 case TTI::SK_Select:
1069 case TTI::SK_Splice:
1070 case TTI::SK_Reverse:
1071 case TTI::SK_Transpose:
1074 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1075 return getPermuteShuffleOverhead(FVT, CostKind);
1078 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1079 cast<FixedVectorType>(SubTp));
1081 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1082 cast<FixedVectorType>(SubTp));
1083 }
1084 llvm_unreachable("Unknown TTI::ShuffleKind");
1085 }
1086
1087 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1090 const Instruction *I = nullptr) {
1091 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1092 return 0;
1093
1094 const TargetLoweringBase *TLI = getTLI();
1095 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1096 assert(ISD && "Invalid opcode");
1097 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1098 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1099
1100 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1101 TypeSize DstSize = DstLT.second.getSizeInBits();
1102 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1103 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1104
1105 switch (Opcode) {
1106 default:
1107 break;
1108 case Instruction::Trunc:
1109 // Check for NOOP conversions.
1110 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1111 return 0;
1112 [[fallthrough]];
1113 case Instruction::BitCast:
1114 // Bitcasts between types that are legalized to the same type are free, and
1115 // we assume an int to/from ptr cast of the same size is also free.
1116 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1117 SrcSize == DstSize)
1118 return 0;
1119 break;
1120 case Instruction::FPExt:
1121 if (I && getTLI()->isExtFree(I))
1122 return 0;
1123 break;
1124 case Instruction::ZExt:
1125 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1126 return 0;
1127 [[fallthrough]];
1128 case Instruction::SExt:
1129 if (I && getTLI()->isExtFree(I))
1130 return 0;
1131
1132 // If this is a zext/sext of a load, return 0 if the corresponding
1133 // extending load exists on target and the result type is legal.
1134 if (CCH == TTI::CastContextHint::Normal) {
1135 EVT ExtVT = EVT::getEVT(Dst);
1136 EVT LoadVT = EVT::getEVT(Src);
1137 unsigned LType =
1138 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1139 if (DstLT.first == SrcLT.first &&
1140 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1141 return 0;
1142 }
1143 break;
1144 case Instruction::AddrSpaceCast:
1145 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1146 Dst->getPointerAddressSpace()))
1147 return 0;
1148 break;
1149 }
1150
1151 auto *SrcVTy = dyn_cast<VectorType>(Src);
1152 auto *DstVTy = dyn_cast<VectorType>(Dst);
1153
1154 // If the cast is marked as legal (or promote) then assume low cost.
1155 if (SrcLT.first == DstLT.first &&
1156 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1157 return SrcLT.first;
1158
1159 // Handle scalar conversions.
1160 if (!SrcVTy && !DstVTy) {
1161 // Just check the op cost. If the operation is legal then assume it costs
1162 // 1.
1163 if (!TLI->isOperationExpand(ISD, DstLT.second))
1164 return 1;
1165
1166 // Assume that illegal scalar instructions are expensive.
1167 return 4;
1168 }
1169
1170 // Check vector-to-vector casts.
1171 if (DstVTy && SrcVTy) {
1172 // If the cast is between same-sized registers, then the check is simple.
1173 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1174
1175 // Assume that Zext is done using AND.
1176 if (Opcode == Instruction::ZExt)
1177 return SrcLT.first;
1178
1179 // Assume that sext is done using SHL and SRA.
1180 if (Opcode == Instruction::SExt)
1181 return SrcLT.first * 2;
1182
1183 // Just check the op cost. If the operation is legal then assume it
1184 // costs 1 and multiply by the type-legalization overhead.
1186 if (!TLI->isOperationExpand(ISD, DstLT.second))
1187 return SrcLT.first * 1;
1188 }
1189
1190 // If we are legalizing by splitting, query the concrete TTI for the cost
1191 // of casting the original vector twice. We also need to factor in the
1192 // cost of the split itself. Count that as 1, to be consistent with
1193 // getTypeLegalizationCost().
1194 bool SplitSrc =
1195 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1197 bool SplitDst =
1198 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1200 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1201 DstVTy->getElementCount().isVector()) {
1202 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1203 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1204 T *TTI = static_cast<T *>(this);
1205 // If both types need to be split then the split is free.
1206 InstructionCost SplitCost =
1207 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1208 return SplitCost +
1209 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1210 CostKind, I));
1211 }
1212
1213 // Scalarization cost is Invalid, can't assume any num elements.
1214 if (isa<ScalableVectorType>(DstVTy))
1215 return InstructionCost::getInvalid();
1216
1217 // In other cases where the source or destination are illegal, assume
1218 // the operation will get scalarized.
1219 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1220 InstructionCost Cost = thisT()->getCastInstrCost(
1221 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1222
1223 // Return the cost of multiple scalar invocation plus the cost of
1224 // inserting and extracting the values.
1225 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1226 CostKind) +
1227 Num * Cost;
1228 }
1229
1230 // We already handled vector-to-vector and scalar-to-scalar conversions.
1231 // This is where we handle bitcasts between vectors and scalars. We need to
1232 // assume that the conversion is scalarized in one way or another.
1234 if (Opcode == Instruction::BitCast) {
1235 // Illegal bitcasts are done by storing and loading from a stack slot.
1236 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1237 /*Extract*/ true, CostKind)
1238 : 0) +
1239 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1240 /*Extract*/ false, CostKind)
1241 : 0);
1242 }
1243
1244 llvm_unreachable("Unhandled cast");
1245 }
1246
1248 VectorType *VecTy, unsigned Index) {
1250 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1251 CostKind, Index, nullptr, nullptr) +
1252 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1254 }
1255
1257 const Instruction *I = nullptr) {
1258 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1259 }
1260
1262 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1266 const Instruction *I = nullptr) {
1267 const TargetLoweringBase *TLI = getTLI();
1268 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1269 assert(ISD && "Invalid opcode");
1270
1271 // TODO: Handle other cost kinds.
1272 if (CostKind != TTI::TCK_RecipThroughput)
1273 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1274 Op1Info, Op2Info, I);
1275
1276 // Selects on vectors are actually vector selects.
1277 if (ISD == ISD::SELECT) {
1278 assert(CondTy && "CondTy must exist");
1279 if (CondTy->isVectorTy())
1280 ISD = ISD::VSELECT;
1281 }
1282 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1283
1284 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1285 !TLI->isOperationExpand(ISD, LT.second)) {
1286 // The operation is legal. Assume it costs 1. Multiply
1287 // by the type-legalization overhead.
1288 return LT.first * 1;
1289 }
1290
1291 // Otherwise, assume that the cast is scalarized.
1292 // TODO: If one of the types get legalized by splitting, handle this
1293 // similarly to what getCastInstrCost() does.
1294 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1295 if (isa<ScalableVectorType>(ValTy))
1296 return InstructionCost::getInvalid();
1297
1298 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1299 if (CondTy)
1300 CondTy = CondTy->getScalarType();
1301 InstructionCost Cost =
1302 thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy,
1303 VecPred, CostKind, Op1Info, Op2Info, I);
1304
1305 // Return the cost of multiple scalar invocation plus the cost of
1306 // inserting and extracting the values.
1307 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1308 /*Extract*/ false, CostKind) +
1309 Num * Cost;
1310 }
1311
1312 // Unknown scalar opcode.
1313 return 1;
1314 }
1315
1318 unsigned Index, Value *Op0, Value *Op1) {
1319 return getRegUsageForType(Val->getScalarType());
1320 }
1321
1322 /// \param ScalarUserAndIdx encodes the information about extracts from a
1323 /// vector with 'Scalar' being the value being extracted, 'User' being the
1324 /// user of the extract (nullptr if the user is not known before
1325 /// vectorization) and
1325 /// 'Idx' being the extract lane.
1327 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1328 Value *Scalar,
1329 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1330 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1331 nullptr);
1332 }
1333
1336 unsigned Index) {
1337 Value *Op0 = nullptr;
1338 Value *Op1 = nullptr;
1339 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1340 Op0 = IE->getOperand(0);
1341 Op1 = IE->getOperand(1);
1342 }
1343 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1344 Op1);
1345 }
1346
1347 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1348 int VF,
1349 const APInt &DemandedDstElts,
1351 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1352 "Unexpected size of DemandedDstElts.");
1353
1355
1356 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1357 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1358
1359 // The Mask shuffling cost is to extract all the elements of the Mask
1360 // and insert each of them Factor times into the wide vector:
1361 //
1362 // E.g. an interleaved group with factor 3:
1363 // %mask = icmp ult <8 x i32> %vec1, %vec2
1364 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1365 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1366 // The cost is estimated as extract all mask elements from the <8xi1> mask
1367 // vector and insert them factor times into the <24xi1> shuffled mask
1368 // vector.
1369 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1370 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1371 /*Insert*/ false,
1372 /*Extract*/ true, CostKind);
1373 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1374 /*Insert*/ true,
1375 /*Extract*/ false, CostKind);
1376
1377 return Cost;
1378 }
1379
1381 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1384 const Instruction *I = nullptr) {
1385 assert(!Src->isVoidTy() && "Invalid type");
1386 // Assume types, such as structs, are expensive.
1387 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1388 return 4;
1389 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1390
1391 // Assuming that all loads of legal types cost 1.
1392 InstructionCost Cost = LT.first;
1393 if (CostKind != TTI::TCK_RecipThroughput)
1394 return Cost;
1395
1396 const DataLayout &DL = this->getDataLayout();
1397 if (Src->isVectorTy() &&
1398 // In practice it's not currently possible to have a change in lane
1399 // length for extending loads or truncating stores so both types should
1400 // have the same scalable property.
1402 LT.second.getSizeInBits())) {
1403 // This is a vector load that legalizes to a larger type than the vector
1404 // itself. Unless the corresponding extending load or truncating store is
1405 // legal, then this will scalarize.
1407 EVT MemVT = getTLI()->getValueType(DL, Src);
1408 if (Opcode == Instruction::Store)
1409 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1410 else
1411 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1412
1413 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1414 // This is a vector load/store for some illegal type that is scalarized.
1415 // We must account for the cost of building or decomposing the vector.
1417 cast<VectorType>(Src), Opcode != Instruction::Store,
1418 Opcode == Instruction::Store, CostKind);
1419 }
1420 }
1421
1422 return Cost;
1423 }
1424
1426 Align Alignment, unsigned AddressSpace,
1428 // TODO: Pass on AddressSpace when we have test coverage.
1429 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1430 CostKind);
1431 }
1432
1434 const Value *Ptr, bool VariableMask,
1435 Align Alignment,
1437 const Instruction *I = nullptr) {
1438 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1439 true, CostKind);
1440 }
1441
1443 const Value *Ptr, bool VariableMask,
1444 Align Alignment,
1446 const Instruction *I) {
1447 // For a target without strided memory operations (or for an illegal
1448 // operation type on one which does), assume we lower to a gather/scatter
1449 // operation. (Which may in turn be scalarized.)
1450 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1451 Alignment, CostKind, I);
1452 }
1453
1455 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1456 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1457 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1458
1459 // We cannot scalarize scalable vectors, so return Invalid.
1460 if (isa<ScalableVectorType>(VecTy))
1461 return InstructionCost::getInvalid();
1462
1463 auto *VT = cast<FixedVectorType>(VecTy);
1464
1465 unsigned NumElts = VT->getNumElements();
1466 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1467
1468 unsigned NumSubElts = NumElts / Factor;
1469 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1470
1471 // Firstly, the cost of load/store operation.
1473 if (UseMaskForCond || UseMaskForGaps)
1474 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1476 else
1477 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1478 CostKind);
1479
1480 // Legalize the vector type, and get the legalized and unlegalized type
1481 // sizes.
1482 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1483 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1484 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1485
1486 // Scale the cost of the memory operation by the fraction of legalized
1487 // instructions that will actually be used. We shouldn't account for the
1488 // cost of dead instructions since they will be removed.
1489 //
1490 // E.g., An interleaved load of factor 8:
1491 // %vec = load <16 x i64>, <16 x i64>* %ptr
1492 // %v0 = shufflevector %vec, undef, <0, 8>
1493 //
1494 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1495 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1496 // type). The other loads are unused.
1497 //
1498 // TODO: Note that legalization can turn masked loads/stores into unmasked
1499 // (legalized) loads/stores. This can be reflected in the cost.
1500 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1501 // The number of loads of a legal type it will take to represent a load
1502 // of the unlegalized vector type.
1503 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1504
1505 // The number of elements of the unlegalized type that correspond to a
1506 // single legal instruction.
1507 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1508
1509 // Determine which legal instructions will be used.
1510 BitVector UsedInsts(NumLegalInsts, false);
1511 for (unsigned Index : Indices)
1512 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1513 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1514
1515 // Scale the cost of the load by the fraction of legal instructions that
1516 // will be used.
1517 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1518 }
1519
1520 // Then add the cost of the interleave operation.
1521 assert(Indices.size() <= Factor &&
1522 "Interleaved memory op has too many members");
1523
1524 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1525 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1526
1527 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1528 for (unsigned Index : Indices) {
1529 assert(Index < Factor && "Invalid index for interleaved memory op");
1530 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1531 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1532 }
1533
1534 if (Opcode == Instruction::Load) {
1535 // The interleave cost is similar to extracting the sub-vectors' elements
1536 // from the wide vector and inserting them into the sub-vectors.
1537 //
1538 // E.g. An interleaved load of factor 2 (with one member of index 0):
1539 // %vec = load <8 x i32>, <8 x i32>* %ptr
1540 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1541 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1542 // <8 x i32> vector and insert them into a <4 x i32> vector.
1543 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1544 SubVT, DemandedAllSubElts,
1545 /*Insert*/ true, /*Extract*/ false, CostKind);
1546 Cost += Indices.size() * InsSubCost;
1547 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1548 /*Insert*/ false,
1549 /*Extract*/ true, CostKind);
1550 } else {
1551 // The interleave cost is extracting the elements from the sub-vectors and
1552 // inserting them into the wide vector.
1553 //
1554 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1555 // (using VF=4):
1556 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1557 // %gaps.mask = <true, true, false, true, true, false,
1558 // true, true, false, true, true, false>
1559 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1560 // i32 Align, <12 x i1> %gaps.mask
1561 // The cost is estimated as extract all elements (of actual members,
1562 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1563 // i32> vector.
1564 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1565 SubVT, DemandedAllSubElts,
1566 /*Insert*/ false, /*Extract*/ true, CostKind);
1567 Cost += ExtSubCost * Indices.size();
1568 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1569 /*Insert*/ true,
1570 /*Extract*/ false, CostKind);
1571 }
1572
1573 if (!UseMaskForCond)
1574 return Cost;
1575
1576 Type *I8Type = Type::getInt8Ty(VT->getContext());
1577
1578 Cost += thisT()->getReplicationShuffleCost(
1579 I8Type, Factor, NumSubElts,
1580 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1581 CostKind);
1582
1583 // The Gaps mask is invariant and created outside the loop, therefore the
1584 // cost of creating it is not accounted for here. However if we have both
1585 // a MaskForGaps and some other mask that guards the execution of the
1586 // memory access, we need to account for the cost of And-ing the two masks
1587 // inside the loop.
1588 if (UseMaskForGaps) {
1589 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1590 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1591 CostKind);
1592 }
1593
1594 return Cost;
1595 }
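// Illustrative example (editorial sketch, unit costs): an interleaved load of
// factor 2 from <8 x i32> with both members used is modelled as the wide load
// cost plus 2 * (4 inserts into each <4 x i32> sub-vector) plus 8 extracts
// from the wide vector, plus the mask-handling costs above when a mask is
// used.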
1596
1597 /// Get intrinsic cost based on arguments.
1600 // Check for generically free intrinsics.
1602 return 0;
1603
1604 // Assume that target intrinsics are cheap.
1605 Intrinsic::ID IID = ICA.getID();
1608
1609 // VP Intrinsics should have the same cost as their non-vp counterpart.
1610 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1611 // counterpart when the vector length argument is smaller than the maximum
1612 // vector length.
1613 // TODO: Support other kinds of VPIntrinsics
1614 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1615 std::optional<unsigned> FOp =
1617 if (FOp) {
1618 if (ICA.getID() == Intrinsic::vp_load) {
1619 Align Alignment;
1620 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1621 Alignment = VPI->getPointerAlignment().valueOrOne();
1622 unsigned AS = 0;
1623 if (ICA.getArgTypes().size() > 1)
1624 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1625 AS = PtrTy->getAddressSpace();
1626 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1627 AS, CostKind);
1628 }
1629 if (ICA.getID() == Intrinsic::vp_store) {
1630 Align Alignment;
1631 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1632 Alignment = VPI->getPointerAlignment().valueOrOne();
1633 unsigned AS = 0;
1634 if (ICA.getArgTypes().size() >= 2)
1635 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1636 AS = PtrTy->getAddressSpace();
1637 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1638 AS, CostKind);
1639 }
1641 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1642 CostKind);
1643 }
1644 if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1645 return thisT()->getCastInstrCost(
1646 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1648 }
1649 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1650 // We can only handle vp_cmp intrinsics with underlying instructions.
1651 if (ICA.getInst()) {
1652 assert(FOp);
1653 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1654 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1655 ICA.getReturnType(),
1656 UI->getPredicate(), CostKind);
1657 }
1658 }
1659 }
1660
1661 std::optional<Intrinsic::ID> FID =
1663 if (FID) {
1664 // Non-vp version will have same arg types except mask and vector
1665 // length.
1666 assert(ICA.getArgTypes().size() >= 2 &&
1667 "Expected VPIntrinsic to have Mask and Vector Length args and "
1668 "types");
1670
1671 // VPReduction intrinsics have a start value argument that their non-vp
1672 // counterparts do not have, except for the fadd and fmul non-vp
1673 // counterpart.
1675 *FID != Intrinsic::vector_reduce_fadd &&
1676 *FID != Intrinsic::vector_reduce_fmul)
1677 NewTys = NewTys.drop_front();
1678
1679 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1680 ICA.getFlags());
1681 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1682 }
1683 }
1684
1685 if (ICA.isTypeBasedOnly())
1687
1688 Type *RetTy = ICA.getReturnType();
1689
1690 ElementCount RetVF =
1691 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1693 const IntrinsicInst *I = ICA.getInst();
1694 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1695 FastMathFlags FMF = ICA.getFlags();
1696 switch (IID) {
1697 default:
1698 break;
1699
1700 case Intrinsic::powi:
1701 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1702 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1703 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1704 ShouldOptForSize)) {
1705 // The cost is modeled on the expansion performed by ExpandPowI in
1706 // SelectionDAGBuilder.
1707 APInt Exponent = RHSC->getValue().abs();
1708 unsigned ActiveBits = Exponent.getActiveBits();
1709 unsigned PopCount = Exponent.popcount();
1710 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1711 thisT()->getArithmeticInstrCost(
1712 Instruction::FMul, RetTy, CostKind);
1713 if (RHSC->isNegative())
1714 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1715 CostKind);
1716 return Cost;
1717 }
1718 }
1719 break;
1720 case Intrinsic::cttz:
1721 // FIXME: If necessary, this should go in target-specific overrides.
1722 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1724 break;
1725
1726 case Intrinsic::ctlz:
1727 // FIXME: If necessary, this should go in target-specific overrides.
1728 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1730 break;
1731
1732 case Intrinsic::memcpy:
1733 return thisT()->getMemcpyCost(ICA.getInst());
1734
1735 case Intrinsic::masked_scatter: {
1736 const Value *Mask = Args[3];
1737 bool VarMask = !isa<Constant>(Mask);
1738 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1739 return thisT()->getGatherScatterOpCost(Instruction::Store,
1740 ICA.getArgTypes()[0], Args[1],
1741 VarMask, Alignment, CostKind, I);
1742 }
1743 case Intrinsic::masked_gather: {
1744 const Value *Mask = Args[2];
1745 bool VarMask = !isa<Constant>(Mask);
1746 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1747 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1748 VarMask, Alignment, CostKind, I);
1749 }
1750 case Intrinsic::experimental_vp_strided_store: {
1751 const Value *Data = Args[0];
1752 const Value *Ptr = Args[1];
1753 const Value *Mask = Args[3];
1754 const Value *EVL = Args[4];
1755 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1756 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1757 Align Alignment =
1758 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1759 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1760 Data->getType(), Ptr, VarMask,
1761 Alignment, CostKind, I);
1762 }
1763 case Intrinsic::experimental_vp_strided_load: {
1764 const Value *Ptr = Args[0];
1765 const Value *Mask = Args[2];
1766 const Value *EVL = Args[3];
1767 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1768 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1769 Align Alignment =
1770 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1771 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1772 VarMask, Alignment, CostKind, I);
1773 }
1774 case Intrinsic::stepvector: {
1775 if (isa<ScalableVectorType>(RetTy))
1776 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1777 // The cost of materialising a constant integer vector.
1778 return TargetTransformInfo::TCC_Basic;
1779 }
1780 case Intrinsic::vector_extract: {
1781 // FIXME: Handle case where a scalable vector is extracted from a scalable
1782 // vector
1783 if (isa<ScalableVectorType>(RetTy))
1784 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1785 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1786 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1787 cast<VectorType>(Args[0]->getType()), {},
1788 CostKind, Index, cast<VectorType>(RetTy));
1789 }
1790 case Intrinsic::vector_insert: {
1791 // FIXME: Handle case where a scalable vector is inserted into a scalable
1792 // vector
1793 if (isa<ScalableVectorType>(Args[1]->getType()))
1794 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1795 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1796 return thisT()->getShuffleCost(
1797 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
1798 CostKind, Index, cast<VectorType>(Args[1]->getType()));
1799 }
1800 case Intrinsic::vector_reverse: {
1801 return thisT()->getShuffleCost(TTI::SK_Reverse,
1802 cast<VectorType>(Args[0]->getType()), {},
1803 CostKind, 0, cast<VectorType>(RetTy));
1804 }
1805 case Intrinsic::vector_splice: {
1806 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1807 return thisT()->getShuffleCost(TTI::SK_Splice,
1808 cast<VectorType>(Args[0]->getType()), {},
1809 CostKind, Index, cast<VectorType>(RetTy));
1810 }
1811 case Intrinsic::vector_reduce_add:
1812 case Intrinsic::vector_reduce_mul:
1813 case Intrinsic::vector_reduce_and:
1814 case Intrinsic::vector_reduce_or:
1815 case Intrinsic::vector_reduce_xor:
1816 case Intrinsic::vector_reduce_smax:
1817 case Intrinsic::vector_reduce_smin:
1818 case Intrinsic::vector_reduce_fmax:
1819 case Intrinsic::vector_reduce_fmin:
1820 case Intrinsic::vector_reduce_fmaximum:
1821 case Intrinsic::vector_reduce_fminimum:
1822 case Intrinsic::vector_reduce_umax:
1823 case Intrinsic::vector_reduce_umin: {
1824 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1825 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1826 }
1827 case Intrinsic::vector_reduce_fadd:
1828 case Intrinsic::vector_reduce_fmul: {
1829 IntrinsicCostAttributes Attrs(
1830 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1831 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1832 }
1833 case Intrinsic::fshl:
1834 case Intrinsic::fshr: {
1835 const Value *X = Args[0];
1836 const Value *Y = Args[1];
1837 const Value *Z = Args[2];
1838 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1839 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1840 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1841 const TTI::OperandValueInfo OpInfoBW =
1842 {TTI::OK_UniformConstantValue,
1843 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1844 : TTI::OP_None};
1845
1846 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1847 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1848 InstructionCost Cost = 0;
1849 Cost +=
1850 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1851 Cost +=
1852 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1853 Cost += thisT()->getArithmeticInstrCost(
1854 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1855 {OpInfoZ.Kind, TTI::OP_None});
1856 Cost += thisT()->getArithmeticInstrCost(
1857 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1858 {OpInfoZ.Kind, TTI::OP_None});
1859 // Non-constant shift amounts requires a modulo.
1860 if (!OpInfoZ.isConstant())
1861 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1862 CostKind, OpInfoZ, OpInfoBW);
1863 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1864 if (X != Y) {
1865 Type *CondTy = RetTy->getWithNewBitWidth(1);
1866 Cost +=
1867 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1868 CmpInst::ICMP_EQ, CostKind);
1869 Cost +=
1870 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1871 CmpInst::ICMP_EQ, CostKind);
1872 }
1873 return Cost;
1874 }
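// Illustrative sketch (not part of this header; helper name is made up): a
// scalar expansion of fshl on 32-bit integers matching the operations costed
// above -- a urem for a variable shift amount, shl/lshr/or/sub for the funnel
// shift itself, and an icmp+select for the shift-by-zero case when X != Y.
#include <cstdint>

static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t BW = 32;
  uint32_t Amt = Z % BW;                 // urem (needed for non-constant Z)
  if (Amt == 0)                          // icmp eq + select: BW - 0 would be
    return X;                            // an out-of-range shift on Y
  return (X << Amt) | (Y >> (BW - Amt)); // shl, lshr, sub, or
}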
1875 case Intrinsic::get_active_lane_mask: {
1876 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1877 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1878
1879 // If we're not expanding the intrinsic then we assume this is cheap
1880 // to implement.
1881 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1882 return getTypeLegalizationCost(RetTy).first;
1883 }
1884
1885 // Create the expanded types that will be used to calculate the uadd_sat
1886 // operation.
1887 Type *ExpRetTy = VectorType::get(
1888 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1889 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1890 InstructionCost Cost =
1891 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1892 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1893 CmpInst::ICMP_ULT, CostKind);
1894 return Cost;
1895 }
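// Illustrative sketch (not part of this header; helper name is made up): the
// expansion costed above computes, per lane, uadd_sat(Base, Lane) and compares
// it unsigned-less-than against the trip count; the saturating add keeps lanes
// near UINT32_MAX from wrapping back under TripCount.
#include <cstdint>

static void activeLaneMask(bool *Mask, unsigned NumLanes, uint32_t Base,
                           uint32_t TripCount) {
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    uint32_t Idx = Base + Lane;
    if (Idx < Base)               // uadd.sat: clamp instead of wrapping
      Idx = UINT32_MAX;
    Mask[Lane] = Idx < TripCount; // icmp ult
  }
}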
1896 case Intrinsic::experimental_cttz_elts: {
1897 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1898
1899 // If we're not expanding the intrinsic then we assume this is cheap
1900 // to implement.
1901 if (!getTLI()->shouldExpandCttzElements(ArgType))
1902 return getTypeLegalizationCost(RetTy).first;
1903
1904 // TODO: The costs below reflect the expansion code in
1905 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1906 // favour of compile time.
1907
1908 // Find the smallest "sensible" element type to use for the expansion.
1909 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1910 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1911 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1912 VScaleRange = getVScaleRange(I->getCaller(), 64);
1913
1914 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1915 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1916 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1917
1918 // Create the new vector type & get the vector length
1919 Type *NewVecTy = VectorType::get(
1920 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1921
1922 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
1923 FMF);
1924 InstructionCost Cost =
1925 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1926
1927 Cost +=
1928 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1929 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1930 Args[0]->getType(),
1931 TTI::CastContextHint::None, CostKind);
1932 Cost +=
1933 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1934
1935 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1936 NewEltTy, NewVecTy, FMF, I, 1);
1937 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1938 Cost +=
1939 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1940
1941 return Cost;
1942 }
1943 case Intrinsic::experimental_vector_match:
1944 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1945 }
1946
1947 // Assume that we need to scalarize this intrinsic.
1948 // Compute the scalarization overhead based on Args for a vector
1949 // intrinsic.
1950 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1951 if (RetVF.isVector() && !RetVF.isScalable()) {
1952 ScalarizationCost = 0;
1953 if (!RetTy->isVoidTy())
1954 ScalarizationCost += getScalarizationOverhead(
1955 cast<VectorType>(RetTy),
1956 /*Insert*/ true, /*Extract*/ false, CostKind);
1957 ScalarizationCost +=
1958 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1959 }
1960
1961 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1962 ScalarizationCost);
1963 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1964 }
1965
1966 /// Get intrinsic cost based on argument types.
1967 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1968 /// cost of scalarizing the arguments and the return value will be computed
1969 /// based on types.
1970 InstructionCost
1971 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1972 TTI::TargetCostKind CostKind) {
1973 Intrinsic::ID IID = ICA.getID();
1974 Type *RetTy = ICA.getReturnType();
1975 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1976 FastMathFlags FMF = ICA.getFlags();
1977 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1978 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1979
1980 VectorType *VecOpTy = nullptr;
1981 if (!Tys.empty()) {
1982 // The vector reduction operand is operand 0 except for fadd/fmul.
1983 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1984 unsigned VecTyIndex = 0;
1985 if (IID == Intrinsic::vector_reduce_fadd ||
1986 IID == Intrinsic::vector_reduce_fmul)
1987 VecTyIndex = 1;
1988 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1989 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1990 }
1991
1992 // Library call cost - other than size, make it expensive.
1993 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1994 unsigned ISD = 0;
1995 switch (IID) {
1996 default: {
1997 // Scalable vectors cannot be scalarized, so return Invalid.
1998 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1999 return isa<ScalableVectorType>(Ty);
2000 }))
2001 return InstructionCost::getInvalid();
2002
2003 // Assume that we need to scalarize this intrinsic.
2004 InstructionCost ScalarizationCost =
2005 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2006 unsigned ScalarCalls = 1;
2007 Type *ScalarRetTy = RetTy;
2008 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2009 if (!SkipScalarizationCost)
2010 ScalarizationCost = getScalarizationOverhead(
2011 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2012 ScalarCalls = std::max(ScalarCalls,
2013 cast<FixedVectorType>(RetVTy)->getNumElements());
2014 ScalarRetTy = RetTy->getScalarType();
2015 }
2016 SmallVector<Type *, 4> ScalarTys;
2017 for (Type *Ty : Tys) {
2018 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2019 if (!SkipScalarizationCost)
2020 ScalarizationCost += getScalarizationOverhead(
2021 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2022 ScalarCalls = std::max(ScalarCalls,
2023 cast<FixedVectorType>(VTy)->getNumElements());
2024 Ty = Ty->getScalarType();
2025 }
2026 ScalarTys.push_back(Ty);
2027 }
2028 if (ScalarCalls == 1)
2029 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2030
2031 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2032 InstructionCost ScalarCost =
2033 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2034
2035 return ScalarCalls * ScalarCost + ScalarizationCost;
2036 }
2037 // Look for intrinsics that can be lowered directly or turned into a scalar
2038 // intrinsic call.
2039 case Intrinsic::sqrt:
2040 ISD = ISD::FSQRT;
2041 break;
2042 case Intrinsic::sin:
2043 ISD = ISD::FSIN;
2044 break;
2045 case Intrinsic::cos:
2046 ISD = ISD::FCOS;
2047 break;
2048 case Intrinsic::sincos:
2049 ISD = ISD::FSINCOS;
2050 break;
2051 case Intrinsic::tan:
2052 ISD = ISD::FTAN;
2053 break;
2054 case Intrinsic::asin:
2055 ISD = ISD::FASIN;
2056 break;
2057 case Intrinsic::acos:
2058 ISD = ISD::FACOS;
2059 break;
2060 case Intrinsic::atan:
2061 ISD = ISD::FATAN;
2062 break;
2063 case Intrinsic::atan2:
2064 ISD = ISD::FATAN2;
2065 break;
2066 case Intrinsic::sinh:
2067 ISD = ISD::FSINH;
2068 break;
2069 case Intrinsic::cosh:
2070 ISD = ISD::FCOSH;
2071 break;
2072 case Intrinsic::tanh:
2073 ISD = ISD::FTANH;
2074 break;
2075 case Intrinsic::exp:
2076 ISD = ISD::FEXP;
2077 break;
2078 case Intrinsic::exp2:
2079 ISD = ISD::FEXP2;
2080 break;
2081 case Intrinsic::exp10:
2082 ISD = ISD::FEXP10;
2083 break;
2084 case Intrinsic::log:
2085 ISD = ISD::FLOG;
2086 break;
2087 case Intrinsic::log10:
2088 ISD = ISD::FLOG10;
2089 break;
2090 case Intrinsic::log2:
2091 ISD = ISD::FLOG2;
2092 break;
2093 case Intrinsic::fabs:
2094 ISD = ISD::FABS;
2095 break;
2096 case Intrinsic::canonicalize:
2097 ISD = ISD::FCANONICALIZE;
2098 break;
2099 case Intrinsic::minnum:
2100 ISD = ISD::FMINNUM;
2101 break;
2102 case Intrinsic::maxnum:
2103 ISD = ISD::FMAXNUM;
2104 break;
2105 case Intrinsic::minimum:
2106 ISD = ISD::FMINIMUM;
2107 break;
2108 case Intrinsic::maximum:
2109 ISD = ISD::FMAXIMUM;
2110 break;
2111 case Intrinsic::minimumnum:
2112 ISD = ISD::FMINIMUMNUM;
2113 break;
2114 case Intrinsic::maximumnum:
2115 ISD = ISD::FMAXIMUMNUM;
2116 break;
2117 case Intrinsic::copysign:
2118 ISD = ISD::FCOPYSIGN;
2119 break;
2120 case Intrinsic::floor:
2121 ISD = ISD::FFLOOR;
2122 break;
2123 case Intrinsic::ceil:
2124 ISD = ISD::FCEIL;
2125 break;
2126 case Intrinsic::trunc:
2127 ISD = ISD::FTRUNC;
2128 break;
2129 case Intrinsic::nearbyint:
2130 ISD = ISD::FNEARBYINT;
2131 break;
2132 case Intrinsic::rint:
2133 ISD = ISD::FRINT;
2134 break;
2135 case Intrinsic::lrint:
2136 ISD = ISD::LRINT;
2137 break;
2138 case Intrinsic::llrint:
2139 ISD = ISD::LLRINT;
2140 break;
2141 case Intrinsic::round:
2142 ISD = ISD::FROUND;
2143 break;
2144 case Intrinsic::roundeven:
2145 ISD = ISD::FROUNDEVEN;
2146 break;
2147 case Intrinsic::pow:
2148 ISD = ISD::FPOW;
2149 break;
2150 case Intrinsic::fma:
2151 ISD = ISD::FMA;
2152 break;
2153 case Intrinsic::fmuladd:
2154 ISD = ISD::FMA;
2155 break;
2156 case Intrinsic::experimental_constrained_fmuladd:
2157 ISD = ISD::STRICT_FMA;
2158 break;
2159 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2160 case Intrinsic::lifetime_start:
2161 case Intrinsic::lifetime_end:
2162 case Intrinsic::sideeffect:
2163 case Intrinsic::pseudoprobe:
2164 case Intrinsic::arithmetic_fence:
2165 return 0;
2166 case Intrinsic::masked_store: {
2167 Type *Ty = Tys[0];
2168 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2169 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2170 CostKind);
2171 }
2172 case Intrinsic::masked_load: {
2173 Type *Ty = RetTy;
2174 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2175 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2176 CostKind);
2177 }
2178 case Intrinsic::vector_reduce_add:
2179 case Intrinsic::vector_reduce_mul:
2180 case Intrinsic::vector_reduce_and:
2181 case Intrinsic::vector_reduce_or:
2182 case Intrinsic::vector_reduce_xor:
2183 return thisT()->getArithmeticReductionCost(
2184 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2185 CostKind);
2186 case Intrinsic::vector_reduce_fadd:
2187 case Intrinsic::vector_reduce_fmul:
2188 return thisT()->getArithmeticReductionCost(
2189 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2190 case Intrinsic::vector_reduce_smax:
2191 case Intrinsic::vector_reduce_smin:
2192 case Intrinsic::vector_reduce_umax:
2193 case Intrinsic::vector_reduce_umin:
2194 case Intrinsic::vector_reduce_fmax:
2195 case Intrinsic::vector_reduce_fmin:
2196 case Intrinsic::vector_reduce_fmaximum:
2197 case Intrinsic::vector_reduce_fminimum:
2198 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2199 VecOpTy, ICA.getFlags(), CostKind);
2200 case Intrinsic::experimental_vector_match: {
2201 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2202 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2203 unsigned SearchSize = NeedleTy->getNumElements();
2204
2205 // If we're not expanding the intrinsic then we assume this is cheap to
2206 // implement.
2207 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2208 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2209 return getTypeLegalizationCost(RetTy).first;
2210
2211 // Approximate the cost based on the expansion code in
2212 // SelectionDAGBuilder.
2213 InstructionCost Cost = 0;
2214 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2215 CostKind, 1, nullptr, nullptr);
2216 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2217 CostKind, 0, nullptr, nullptr);
2218 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
2219 CostKind, 0, nullptr);
2220 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2221 CmpInst::ICMP_EQ, CostKind);
2222 Cost +=
2223 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2224 Cost *= SearchSize;
2225 Cost +=
2226 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2227 return Cost;
2228 }
2229 case Intrinsic::abs:
2230 ISD = ISD::ABS;
2231 break;
2232 case Intrinsic::smax:
2233 ISD = ISD::SMAX;
2234 break;
2235 case Intrinsic::smin:
2236 ISD = ISD::SMIN;
2237 break;
2238 case Intrinsic::umax:
2239 ISD = ISD::UMAX;
2240 break;
2241 case Intrinsic::umin:
2242 ISD = ISD::UMIN;
2243 break;
2244 case Intrinsic::sadd_sat:
2245 ISD = ISD::SADDSAT;
2246 break;
2247 case Intrinsic::ssub_sat:
2248 ISD = ISD::SSUBSAT;
2249 break;
2250 case Intrinsic::uadd_sat:
2251 ISD = ISD::UADDSAT;
2252 break;
2253 case Intrinsic::usub_sat:
2254 ISD = ISD::USUBSAT;
2255 break;
2256 case Intrinsic::smul_fix:
2257 ISD = ISD::SMULFIX;
2258 break;
2259 case Intrinsic::umul_fix:
2260 ISD = ISD::UMULFIX;
2261 break;
2262 case Intrinsic::sadd_with_overflow:
2263 ISD = ISD::SADDO;
2264 break;
2265 case Intrinsic::ssub_with_overflow:
2266 ISD = ISD::SSUBO;
2267 break;
2268 case Intrinsic::uadd_with_overflow:
2269 ISD = ISD::UADDO;
2270 break;
2271 case Intrinsic::usub_with_overflow:
2272 ISD = ISD::USUBO;
2273 break;
2274 case Intrinsic::smul_with_overflow:
2275 ISD = ISD::SMULO;
2276 break;
2277 case Intrinsic::umul_with_overflow:
2278 ISD = ISD::UMULO;
2279 break;
2280 case Intrinsic::fptosi_sat:
2281 ISD = ISD::FP_TO_SINT_SAT;
2282 break;
2283 case Intrinsic::fptoui_sat:
2284 ISD = ISD::FP_TO_UINT_SAT;
2285 break;
2286 case Intrinsic::ctpop:
2287 ISD = ISD::CTPOP;
2288 // In case of legalization use TCC_Expensive. This is cheaper than a
2289 // library call but still not a cheap instruction.
2290 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2291 break;
2292 case Intrinsic::ctlz:
2293 ISD = ISD::CTLZ;
2294 break;
2295 case Intrinsic::cttz:
2296 ISD = ISD::CTTZ;
2297 break;
2298 case Intrinsic::bswap:
2299 ISD = ISD::BSWAP;
2300 break;
2301 case Intrinsic::bitreverse:
2302 ISD = ISD::BITREVERSE;
2303 break;
2304 case Intrinsic::ucmp:
2305 ISD = ISD::UCMP;
2306 break;
2307 case Intrinsic::scmp:
2308 ISD = ISD::SCMP;
2309 break;
2310 }
2311
2312 auto *ST = dyn_cast<StructType>(RetTy);
2313 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2314 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2315
2316 const TargetLoweringBase *TLI = getTLI();
2317
2318 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2319 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2320 TLI->isFAbsFree(LT.second)) {
2321 return 0;
2322 }
2323
2324 // The operation is legal. Assume it costs 1.
2325 // If the type is split to multiple registers, assume that there is some
2326 // overhead to this.
2327 // TODO: Once we have extract/insert subvector cost we need to use them.
2328 if (LT.first > 1)
2329 return (LT.first * 2);
2330 else
2331 return (LT.first * 1);
2332 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2333 // If the operation is custom lowered then assume
2334 // that the code is twice as expensive.
2335 return (LT.first * 2);
2336 }
2337
2338 switch (IID) {
2339 case Intrinsic::fmuladd: {
2340 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2341 // point mul followed by an add.
2342
2343 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2344 CostKind) +
2345 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2346 CostKind);
2347 }
2348 case Intrinsic::experimental_constrained_fmuladd: {
2349 IntrinsicCostAttributes FMulAttrs(
2350 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2351 IntrinsicCostAttributes FAddAttrs(
2352 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2353 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2354 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2355 }
2356 case Intrinsic::smin:
2357 case Intrinsic::smax:
2358 case Intrinsic::umin:
2359 case Intrinsic::umax: {
2360 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2361 Type *CondTy = RetTy->getWithNewBitWidth(1);
2362 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2363 CmpInst::Predicate Pred =
2364 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2365 InstructionCost Cost = 0;
2366 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2367 Pred, CostKind);
2368 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2369 Pred, CostKind);
2370 return Cost;
2371 }
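// Illustrative sketch (not part of this header; helper name is made up): the
// icmp+select pair costed above is exactly how a scalar smax is formed when
// there is no native min/max instruction.
#include <cstdint>

static int32_t smax32(int32_t X, int32_t Y) {
  bool Cmp = X > Y;   // icmp sgt
  return Cmp ? X : Y; // select
}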
2372 case Intrinsic::sadd_with_overflow:
2373 case Intrinsic::ssub_with_overflow: {
2374 Type *SumTy = RetTy->getContainedType(0);
2375 Type *OverflowTy = RetTy->getContainedType(1);
2376 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2377 ? BinaryOperator::Add
2378 : BinaryOperator::Sub;
2379
2380 // Add:
2381 // Overflow -> (Result < LHS) ^ (RHS < 0)
2382 // Sub:
2383 // Overflow -> (Result < LHS) ^ (RHS > 0)
2384 InstructionCost Cost = 0;
2385 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2386 Cost +=
2387 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2388 CmpInst::ICMP_SGT, CostKind);
2389 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2390 CostKind);
2391 return Cost;
2392 }
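// Illustrative sketch (not part of this header; helper name is made up): the
// overflow formula in the comment above, written out for 32-bit signed
// addition.  The wrapping add is done in unsigned arithmetic so the sketch
// itself has no undefined behaviour.
#include <cstdint>

static bool saddOverflows(int32_t LHS, int32_t RHS, int32_t &Result) {
  Result = (int32_t)((uint32_t)LHS + (uint32_t)RHS); // add (wrapping)
  return (Result < LHS) ^ (RHS < 0);                 // icmp, icmp, xor
}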
2393 case Intrinsic::uadd_with_overflow:
2394 case Intrinsic::usub_with_overflow: {
2395 Type *SumTy = RetTy->getContainedType(0);
2396 Type *OverflowTy = RetTy->getContainedType(1);
2397 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2398 ? BinaryOperator::Add
2399 : BinaryOperator::Sub;
2400 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2401 ? CmpInst::ICMP_ULT
2402 : CmpInst::ICMP_UGT;
2403
2404 InstructionCost Cost = 0;
2405 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2406 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2407 OverflowTy, Pred, CostKind);
2408 return Cost;
2409 }
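// Illustrative sketch (not part of this header; helper names are made up):
// for the unsigned variants a single compare of the wrapped result against an
// input is enough, matching the one icmp costed above (ult for add, ugt for
// sub).
#include <cstdint>

static bool uaddOverflows(uint32_t LHS, uint32_t RHS, uint32_t &Result) {
  Result = LHS + RHS;  // add (wraps modulo 2^32)
  return Result < LHS; // icmp ult
}

static bool usubOverflows(uint32_t LHS, uint32_t RHS, uint32_t &Result) {
  Result = LHS - RHS;  // sub (wraps modulo 2^32)
  return Result > LHS; // icmp ugt
}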
2410 case Intrinsic::smul_with_overflow:
2411 case Intrinsic::umul_with_overflow: {
2412 Type *MulTy = RetTy->getContainedType(0);
2413 Type *OverflowTy = RetTy->getContainedType(1);
2414 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2415 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2416 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2417
2418 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2419 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2420
2421 InstructionCost Cost = 0;
2422 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2423 Cost +=
2424 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2425 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2426 CCH, CostKind);
2427 Cost += thisT()->getArithmeticInstrCost(
2428 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2429 {TTI::OK_UniformConstantValue, TTI::OP_None});
2430
2431 if (IsSigned)
2432 Cost += thisT()->getArithmeticInstrCost(
2433 Instruction::AShr, MulTy, CostKind,
2434 {TTI::OK_AnyValue, TTI::OP_None},
2435 {TTI::OK_UniformConstantValue, TTI::OP_None});
2436
2437 Cost += thisT()->getCmpSelInstrCost(
2438 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2439 return Cost;
2440 }
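// Illustrative sketch (not part of this header; helper name is made up): the
// widening expansion costed above, spelled out for 32-bit signed multiply --
// extend both operands, multiply in the double-width type, and compare the
// high half against the sign-extension of the low half.
#include <cstdint>

static bool smulOverflows(int32_t LHS, int32_t RHS, int32_t &Result) {
  int64_t Wide = (int64_t)LHS * (int64_t)RHS; // sext, sext, mul
  Result = (int32_t)Wide;                     // trunc (low half)
  int32_t Hi = (int32_t)(Wide >> 32);         // shift + trunc (high half)
  return Hi != (Result >> 31);                // ashr + icmp ne
}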
2441 case Intrinsic::sadd_sat:
2442 case Intrinsic::ssub_sat: {
2443 // Assume a default expansion.
2444 Type *CondTy = RetTy->getWithNewBitWidth(1);
2445
2446 Type *OpTy = StructType::create({RetTy, CondTy});
2447 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2448 ? Intrinsic::sadd_with_overflow
2449 : Intrinsic::ssub_with_overflow;
2450 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2451
2452 // SatMax -> Overflow && SumDiff < 0
2453 // SatMin -> Overflow && SumDiff >= 0
2454 InstructionCost Cost = 0;
2455 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2456 nullptr, ScalarizationCostPassed);
2457 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2458 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2459 Pred, CostKind);
2460 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2461 CondTy, Pred, CostKind);
2462 return Cost;
2463 }
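// Illustrative sketch (not part of this header; helper name is made up): the
// saturating expansion costed above -- an overflow-detecting add (here via the
// GCC/Clang __builtin_add_overflow builtin), then selects that pick INT32_MAX
// or INT32_MIN depending on the sign of the wrapped sum.
#include <cstdint>
#include <limits>

static int32_t saddSat32(int32_t X, int32_t Y) {
  int32_t Sum;
  bool Ov = __builtin_add_overflow(X, Y, &Sum); // sadd.with.overflow
  if (!Ov)
    return Sum;                                 // select on the overflow bit
  return Sum < 0 ? std::numeric_limits<int32_t>::max()  // SatMax: wrapped sum < 0
                 : std::numeric_limits<int32_t>::min(); // SatMin: wrapped sum >= 0
}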
2464 case Intrinsic::uadd_sat:
2465 case Intrinsic::usub_sat: {
2466 Type *CondTy = RetTy->getWithNewBitWidth(1);
2467
2468 Type *OpTy = StructType::create({RetTy, CondTy});
2469 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2470 ? Intrinsic::uadd_with_overflow
2471 : Intrinsic::usub_with_overflow;
2472
2473 InstructionCost Cost = 0;
2474 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2475 nullptr, ScalarizationCostPassed);
2476 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2477 Cost +=
2478 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2479 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2480 return Cost;
2481 }
2482 case Intrinsic::smul_fix:
2483 case Intrinsic::umul_fix: {
2484 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2485 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2486
2487 unsigned ExtOp =
2488 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2489 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2490
2491 InstructionCost Cost = 0;
2492 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2493 Cost +=
2494 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2495 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2496 CCH, CostKind);
2497 Cost += thisT()->getArithmeticInstrCost(
2498 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2499 {TTI::OK_UniformConstantValue, TTI::OP_None});
2500 Cost += thisT()->getArithmeticInstrCost(
2501 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2502 {TTI::OK_UniformConstantValue, TTI::OP_None});
2503 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2504 return Cost;
2505 }
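// Illustrative sketch (not part of this header; helper name is made up): the
// fixed-point multiply expansion costed above for 32-bit operands, ignoring
// the saturating variants -- widen, multiply, then take the double-width
// product shifted right by the scale (the cost model splits that shift into
// the low/high halves joined by an or).  Scale is assumed to be < 32.
#include <cstdint>

static int32_t smulFix32(int32_t A, int32_t B, unsigned Scale) {
  int64_t Wide = (int64_t)A * (int64_t)B; // sext, sext, mul
  return (int32_t)(Wide >> Scale);        // lshr/shl + trunc + or above
}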
2506 case Intrinsic::abs: {
2507 // abs(X) = select(icmp(X,0),X,sub(0,X))
2508 Type *CondTy = RetTy->getWithNewBitWidth(1);
2509 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2510 InstructionCost Cost = 0;
2511 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2512 Pred, CostKind);
2513 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2514 Pred, CostKind);
2515 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2516 Cost += thisT()->getArithmeticInstrCost(
2517 BinaryOperator::Sub, RetTy, CostKind,
2518 {TTI::OK_UniformConstantValue, TTI::OP_None});
2519 return Cost;
2520 }
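// Illustrative sketch (not part of this header; helper name is made up): the
// compare/negate/select expansion costed above for a scalar abs, with the
// negate done as a wrapping subtract from zero.
#include <cstdint>

static int32_t abs32(int32_t X) {
  bool IsPos = X > -1;                       // icmp sgt X, -1
  int32_t Neg = (int32_t)(0u - (uint32_t)X); // sub 0, X (wrapping)
  return IsPos ? X : Neg;                    // select
}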
2521 case Intrinsic::fptosi_sat:
2522 case Intrinsic::fptoui_sat: {
2523 if (Tys.empty())
2524 break;
2525 Type *FromTy = Tys[0];
2526 bool IsSigned = IID == Intrinsic::fptosi_sat;
2527
2528 InstructionCost Cost = 0;
2529 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2530 {FromTy, FromTy});
2531 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2532 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2533 {FromTy, FromTy});
2534 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2535 Cost += thisT()->getCastInstrCost(
2536 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2537 TTI::CastContextHint::None, CostKind);
2538 if (IsSigned) {
2539 Type *CondTy = RetTy->getWithNewBitWidth(1);
2540 Cost += thisT()->getCmpSelInstrCost(
2541 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2542 Cost += thisT()->getCmpSelInstrCost(
2543 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2544 }
2545 return Cost;
2546 }
2547 case Intrinsic::ucmp:
2548 case Intrinsic::scmp: {
2549 Type *CmpTy = Tys[0];
2550 Type *CondTy = RetTy->getWithNewBitWidth(1);
2551 InstructionCost Cost =
2552 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2554 CostKind) +
2555 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2557 CostKind);
2558
2559 EVT VT = TLI->getValueType(DL, CmpTy, true);
2560 if (TLI->shouldExpandCmpUsingSelects(VT)) {
2561 // x < y ? -1 : (x > y ? 1 : 0)
2562 Cost += 2 * thisT()->getCmpSelInstrCost(
2563 BinaryOperator::Select, RetTy, CondTy,
2564 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2565 } else {
2566 // zext(x > y) - zext(x < y)
2567 Cost +=
2568 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2569 TTI::CastContextHint::None, CostKind);
2570 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2571 CostKind);
2572 }
2573 return Cost;
2574 }
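// Illustrative sketch (not part of this header; helper names are made up):
// the two three-way-compare lowerings costed above, shown for unsigned 32-bit
// inputs -- one built from selects, one from zero-extended compares.
#include <cstdint>

static int8_t ucmpSelects(uint32_t X, uint32_t Y) {
  return X < Y ? -1 : (X > Y ? 1 : 0);  // two icmps + two selects
}

static int8_t ucmpZexts(uint32_t X, uint32_t Y) {
  return (int8_t)((X > Y) - (X < Y));   // two icmps, two zexts, sub
}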
2575 default:
2576 break;
2577 }
2578
2579 // Else, assume that we need to scalarize this intrinsic. For math builtins
2580 // this will emit a costly libcall, adding call overhead and spills. Make it
2581 // very expensive.
2582 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2583 // Scalable vectors cannot be scalarized, so return Invalid.
2584 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2585 return isa<ScalableVectorType>(Ty);
2586 }))
2587 return InstructionCost::getInvalid();
2588
2589 InstructionCost ScalarizationCost =
2590 SkipScalarizationCost
2591 ? ScalarizationCostPassed
2592 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2593 /*Extract*/ false, CostKind);
2594
2595 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2596 SmallVector<Type *, 4> ScalarTys;
2597 for (Type *Ty : Tys) {
2598 if (Ty->isVectorTy())
2599 Ty = Ty->getScalarType();
2600 ScalarTys.push_back(Ty);
2601 }
2602 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2603 InstructionCost ScalarCost =
2604 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2605 for (Type *Ty : Tys) {
2606 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2607 if (!ICA.skipScalarizationCost())
2608 ScalarizationCost += getScalarizationOverhead(
2609 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2610 ScalarCalls = std::max(ScalarCalls,
2611 cast<FixedVectorType>(VTy)->getNumElements());
2612 }
2613 }
2614 return ScalarCalls * ScalarCost + ScalarizationCost;
2615 }
2616
2617 // This is going to be turned into a library call, make it expensive.
2618 return SingleCallCost;
2619 }
2620
2621 /// Compute a cost of the given call instruction.
2622 ///
2623 /// Compute the cost of calling function F with return type RetTy and
2624 /// argument types Tys. F might be nullptr, in this case the cost of an
2625 /// arbitrary call with the specified signature will be returned.
2626 /// This is used, for instance, when we estimate call of a vector
2627 /// counterpart of the given function.
2628 /// \param F Called function, might be nullptr.
2629 /// \param RetTy Return value types.
2630 /// \param Tys Argument types.
2631 /// \returns The cost of Call instruction.
2632 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2633 ArrayRef<Type *> Tys,
2634 TTI::TargetCostKind CostKind) {
2635 return 10;
2636 }
2637
2638 unsigned getNumberOfParts(Type *Tp) {
2639 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2640 if (!LT.first.isValid())
2641 return 0;
2642 // Try to find actual number of parts for non-power-of-2 elements as
2643 // ceil(num-of-elements/num-of-subtype-elements).
2644 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2645 Tp && LT.second.isFixedLengthVector() &&
2646 !has_single_bit(FTp->getNumElements())) {
2647 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2648 EVT(LT.second).getTypeForEVT(Tp->getContext()));
2649 SubTp && SubTp->getElementType() == FTp->getElementType())
2650 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2651 }
2652 return *LT.first.getValue();
2653 }
2654
2655 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2656 const SCEV *) {
2657 return 0;
2658 }
2659
2660 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2661 /// We're assuming that reduction operation are performing the following way:
2662 ///
2663 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2664 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2665 /// \----------------v-------------/ \----------v------------/
2666 /// n/2 elements n/2 elements
2667 /// %red1 = op <n x t> %val, <n x t> val1
2668 /// After this operation we have a vector %red1 where only the first n/2
2669 /// elements are meaningful, the second n/2 elements are undefined and can be
2670 /// dropped. All other operations are actually working with the vector of
2671 /// length n/2, not n, though the real vector length is still n.
2672 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2673 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2674 /// \----------------v-------------/ \----------v------------/
2675 /// n/4 elements 3*n/4 elements
2676 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2677 /// length n/2, the resulting vector has length n/4 etc.
2678 ///
2679 /// The cost model should take into account that the actual length of the
2680 /// vector is reduced on each iteration.
2681 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2682 TTI::TargetCostKind CostKind) {
2683 // Targets must implement a default value for the scalable case, since
2684 // we don't know how many lanes the vector has.
2685 if (isa<ScalableVectorType>(Ty))
2686 return InstructionCost::getInvalid();
2687
2688 Type *ScalarTy = Ty->getElementType();
2689 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2690 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2691 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2692 NumVecElts >= 2) {
2693 // Or reduction for i1 is represented as:
2694 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2695 // %res = cmp ne iReduxWidth %val, 0
2696 // And reduction for i1 is represented as:
2697 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2698 // %res = cmp eq iReduxWidth %val, 11111
2699 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2700 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2701 TTI::CastContextHint::None, CostKind) +
2702 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2703 CmpInst::makeCmpResultType(ValTy),
2704 CmpInst::ICMP_EQ, CostKind);
2705 }
2706 unsigned NumReduxLevels = Log2_32(NumVecElts);
2707 InstructionCost ArithCost = 0;
2708 InstructionCost ShuffleCost = 0;
2709 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2710 unsigned LongVectorCount = 0;
2711 unsigned MVTLen =
2712 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2713 while (NumVecElts > MVTLen) {
2714 NumVecElts /= 2;
2715 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2716 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2717 CostKind, NumVecElts, SubTy);
2718 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2719 Ty = SubTy;
2720 ++LongVectorCount;
2721 }
2722
2723 NumReduxLevels -= LongVectorCount;
2724
2725 // The minimal length of the vector is limited by the real length of vector
2726 // operations performed on the current platform. That's why several final
2727 // reduction operations are performed on the vectors with the same
2728 // architecture-dependent length.
2729
2730 // By default reductions need one shuffle per reduction level.
2731 ShuffleCost +=
2732 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2733 {}, CostKind, 0, Ty);
2734 ArithCost +=
2735 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2736 return ShuffleCost + ArithCost +
2737 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2738 CostKind, 0, nullptr, nullptr);
2739 }
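// Illustrative sketch (not part of this header; helper name is made up): the
// shuffle-and-operate halving that the tree-reduction cost above models,
// written as a scalar loop over an 8-element array.  Each level costs one
// "shuffle" (bringing the upper half down) and one vector op on half as many
// meaningful lanes, followed by a final extract of lane 0.
static int treeReduceAdd8(int (&V)[8]) {
  for (unsigned Width = 8 / 2; Width >= 1; Width /= 2) // log2(8) = 3 levels
    for (unsigned I = 0; I != Width; ++I)
      V[I] += V[I + Width]; // op(lower half, shuffled upper half)
  return V[0];              // final extractelement
}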
2740
2741 /// Try to calculate the cost of performing strict (in-order) reductions,
2742 /// which involves doing a sequence of floating point additions in lane
2743 /// order, starting with an initial value. For example, consider a scalar
2744 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2745 ///
2746 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2747 ///
2748 /// %add1 = %InitVal + %v0
2749 /// %add2 = %add1 + %v1
2750 /// %add3 = %add2 + %v2
2751 /// %add4 = %add3 + %v3
2752 ///
2753 /// As a simple estimate we can say the cost of such a reduction is 4 times
2754 /// the cost of a scalar FP addition. We can only estimate the costs for
2755 /// fixed-width vectors here because for scalable vectors we do not know the
2756 /// runtime number of operations.
2757 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2758 TTI::TargetCostKind CostKind) {
2759 // Targets must implement a default value for the scalable case, since
2760 // we don't know how many lanes the vector has.
2761 if (isa<ScalableVectorType>(Ty))
2762 return InstructionCost::getInvalid();
2763
2764 auto *VTy = cast<FixedVectorType>(Ty);
2765 InstructionCost ExtractCost = getScalarizationOverhead(
2766 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2767 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2768 Opcode, VTy->getElementType(), CostKind);
2769 ArithCost *= VTy->getNumElements();
2770
2771 return ExtractCost + ArithCost;
2772 }
2773
2774 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2775 std::optional<FastMathFlags> FMF,
2776 TTI::TargetCostKind CostKind) {
2777 assert(Ty && "Unknown reduction vector type");
2778 if (TTI::requiresOrderedReduction(FMF))
2779 return getOrderedReductionCost(Opcode, Ty, CostKind);
2780 return getTreeReductionCost(Opcode, Ty, CostKind);
2781 }
2782
2783 /// Try to calculate op costs for min/max reduction operations.
2784 /// \param CondTy Conditional type for the Select instruction.
2785 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2786 FastMathFlags FMF,
2787 TTI::TargetCostKind CostKind) {
2788 // Targets must implement a default value for the scalable case, since
2789 // we don't know how many lanes the vector has.
2790 if (isa<ScalableVectorType>(Ty))
2791 return InstructionCost::getInvalid();
2792
2793 Type *ScalarTy = Ty->getElementType();
2794 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2795 unsigned NumReduxLevels = Log2_32(NumVecElts);
2796 InstructionCost MinMaxCost = 0;
2797 InstructionCost ShuffleCost = 0;
2798 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2799 unsigned LongVectorCount = 0;
2800 unsigned MVTLen =
2801 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2802 while (NumVecElts > MVTLen) {
2803 NumVecElts /= 2;
2804 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2805
2806 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2807 CostKind, NumVecElts, SubTy);
2808
2809 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2810 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2811 Ty = SubTy;
2812 ++LongVectorCount;
2813 }
2814
2815 NumReduxLevels -= LongVectorCount;
2816
2817 // The minimal length of the vector is limited by the real length of vector
2818 // operations performed on the current platform. That's why several final
2819 // reduction operations are performed on the vectors with the same
2820 // architecture-dependent length.
2821 ShuffleCost +=
2822 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2823 {}, CostKind, 0, Ty);
2824 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2825 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2826 // The last min/max should be in vector registers and we counted it above.
2827 // So just need a single extractelement.
2828 return ShuffleCost + MinMaxCost +
2829 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2830 CostKind, 0, nullptr, nullptr);
2831 }
2832
2833 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2834 Type *ResTy, VectorType *Ty,
2835 FastMathFlags FMF,
2836 TTI::TargetCostKind CostKind) {
2837 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
2838 FTy && IsUnsigned && Opcode == Instruction::Add &&
2839 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
2840 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2841 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2842 auto *IntTy =
2843 IntegerType::get(ResTy->getContext(), FTy->getNumElements());
2844 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
2845 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
2846 TTI::CastContextHint::None, CostKind) +
2847 thisT()->getIntrinsicInstrCost(ICA, CostKind);
2848 }
2849 // Without any native support, this is equivalent to the cost of
2850 // vecreduce.opcode(ext(Ty A)).
2851 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2852 InstructionCost RedCost =
2853 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2854 InstructionCost ExtCost = thisT()->getCastInstrCost(
2855 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2856 TTI::CastContextHint::None, CostKind);
2857
2858 return RedCost + ExtCost;
2859 }
2860
2861 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2862 VectorType *Ty,
2863 TTI::TargetCostKind CostKind) {
2864 // Without any native support, this is equivalent to the cost of
2865 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2866 // vecreduce.add(mul(A, B)).
2867 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2868 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2869 Instruction::Add, ExtTy, std::nullopt, CostKind);
2870 InstructionCost ExtCost = thisT()->getCastInstrCost(
2871 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2872 TTI::CastContextHint::None, CostKind);
2873
2874 InstructionCost MulCost =
2875 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2876
2877 return RedCost + MulCost + 2 * ExtCost;
2878 }
2879
2880 InstructionCost getVectorSplitCost() { return 1; }
2881
2882 /// @}
2883};
2884
2885/// Concrete BasicTTIImpl that can be used if no further customization
2886/// is needed.
2887class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2888 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2889
2890 friend class BasicTTIImplBase<BasicTTIImpl>;
2891
2892 const TargetSubtargetInfo *ST;
2893 const TargetLoweringBase *TLI;
2894
2895 const TargetSubtargetInfo *getST() const { return ST; }
2896 const TargetLoweringBase *getTLI() const { return TLI; }
2897
2898public:
2899 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2900};
2901
2902} // end namespace llvm
2903
2904#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
uint32_t Index
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
an instruction to allocate memory on the stack
Definition: Instructions.h:63
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:80
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:442
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:300
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:740
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:596
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:569
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:930
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:769
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:774
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:432
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:676
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:687
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:760
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:349
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:436
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:752
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:773
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:447
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:523
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:590
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:458
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:390
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:410
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:402
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:702
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:744
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:294
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:396
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:322
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:849
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
unsigned getEpilogueVectorizationMinVF()
Definition: BasicTTIImpl.h:683
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:366
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:452
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:555
bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const
Definition: BasicTTIImpl.h:813
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:720
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:418
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const
Definition: BasicTTIImpl.h:827
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:298
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:692
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:272
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:370
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:280
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:833
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:726
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:756
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:336
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:668
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:313
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:559
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:896
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:548
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:332
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:588
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:345
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:878
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:775
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:696
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:304
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:341
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:308
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:780
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:736
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:318
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:296
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const
Definition: BasicTTIImpl.h:817
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:594
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:932
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:709
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:353
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:326
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:263
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const
Definition: BasicTTIImpl.h:822
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:589
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:414
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
CmpInst::Predicate getLTPredicate() const
CmpInst::Predicate getGTPredicate() const
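An illustrative sketch of makeCmpResultType (the free function cmpResultFor is hypothetical): for a vector operand type it yields a vector of i1 with the same element count, and for a scalar it yields plain i1.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"

llvm::Type *cmpResultFor(llvm::LLVMContext &Ctx) {
  auto *V4F32 = llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 4);
  return llvm::CmpInst::makeCmpResultType(V4F32); // <4 x i1>
}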
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
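A sketch of the DataLayout queries listed above (the layout string and the helper name queryLayout are hypothetical):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

void queryLayout(llvm::LLVMContext &Ctx) {
  llvm::DataLayout DL("e-p:64:64");                          // example layout string
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::TypeSize StoreBits = DL.getTypeStoreSizeInBits(I32); // 32 bits for i32
  llvm::Align ABIAlign = DL.getABITypeAlign(I32);            // minimum ABI alignment
  unsigned IdxBits = DL.getIndexSizeInBits(/*AS=*/0);        // GEP index width
  (void)StoreBits; (void)ABIAlign; (void)IdxBits;
}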
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
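A brief sketch of ElementCount (the helper name isFixedVectorCount is hypothetical): a fixed count of four elements is a vector, not a scalar.

#include "llvm/Support/TypeSize.h"

bool isFixedVectorCount() {
  llvm::ElementCount EC = llvm::ElementCount::getFixed(4);
  return EC.isVector() && !EC.isScalar(); // true
}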
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
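A sketch of building a fixed-width vector type and reading back its lane count (the helper name numLanes is hypothetical):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

unsigned numLanes(llvm::LLVMContext &Ctx) {
  auto *VTy = llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 8);
  return VTy->getNumElements(); // 8
}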
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
The core instruction combiner logic.
Definition: InstCombiner.h:48
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associativity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
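A sketch of the static mask classifiers above (the helper name classifyMasks is hypothetical; the expected results follow the descriptions listed here):

#include "llvm/IR/Instructions.h"

bool classifyMasks() {
  int RevMask[] = {3, 2, 1, 0};
  int ExtMask[] = {4, 5, 6, 7};
  int Index = 0;
  bool IsReverse =
      llvm::ShuffleVectorInst::isReverseMask(RevMask, /*NumSrcElts=*/4);
  bool IsExtract = llvm::ShuffleVectorInst::isExtractSubvectorMask(
      ExtMask, /*NumSrcElts=*/8, Index); // Index is expected to be 4
  return IsReverse && IsExtract;
}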
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
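A sketch combining SmallPtrSet::insert and SmallVector::push_back to collect each pointer once while preserving visitation order (the helper name collectUnique is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

void collectUnique(llvm::ArrayRef<llvm::Value *> Vals,
                   llvm::SmallVectorImpl<llvm::Value *> &Out) {
  llvm::SmallPtrSet<llvm::Value *, 8> Seen;
  for (llvm::Value *V : Vals)
    if (Seen.insert(V).second) // .second is true only on first insertion
      Out.push_back(V);
}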
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
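A sketch of StackOffset (the helper name frameOffset is hypothetical), combining a fixed byte offset with a vscale-scaled one:

#include "llvm/Support/TypeSize.h"

llvm::StackOffset frameOffset() {
  // 16 fixed bytes plus 8 * vscale scalable bytes.
  return llvm::StackOffset::getFixed(16) + llvm::StackOffset::getScalable(8);
}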
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is a legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool shouldExpandCmpUsingSelects(EVT VT) const
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
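A sketch of the legality-query pattern these TargetLoweringBase hooks support, as used throughout the cost model: map the IR type to an EVT, then ask whether the ISD opcode is legal or custom-lowerable for it (the helper name isAddLegalFor is hypothetical).

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

bool isAddLegalFor(const llvm::TargetLoweringBase &TLI,
                   const llvm::DataLayout &DL, llvm::Type *Ty) {
  llvm::EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  return VT.isSimple() && TLI.isOperationLegalOrCustom(llvm::ISD::ADD, VT);
}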
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:383
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1681
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:568
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
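A sketch of the Type queries above (the helper name widenLanes is hypothetical): read the lane width of <4 x i16> and rebuild the type with 64-bit lanes.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

llvm::Type *widenLanes(llvm::LLVMContext &Ctx) {
  auto *V4I16 = llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), 4);
  unsigned OldBits = V4I16->getScalarSizeInBits();   // 16
  llvm::Type *V4I64 = V4I16->getWithNewBitWidth(64); // <4 x i64>, same lane count
  return OldBits < 64 ? V4I64 : V4I16;
}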
Value * getOperand(unsigned i) const
Definition: User.h:228
static bool isVPBinOp(Intrinsic::ID ID)
static bool isVPCast(Intrinsic::ID ID)
static bool isVPCmp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
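A sketch of mapping a vector-predicated intrinsic to the plain IR opcode it predicates, e.g. vp.add to Instruction::Add (the helper name functionalOpcodeOrZero is hypothetical):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include <optional>

unsigned functionalOpcodeOrZero(llvm::Intrinsic::ID ID) {
  if (!llvm::VPIntrinsic::isVPIntrinsic(ID))
    return 0; // not a VP intrinsic at all
  std::optional<unsigned> Opc = llvm::VPIntrinsic::getFunctionalOpcodeForVP(ID);
  return Opc ? *Opc : 0; // 0 when no plain functional opcode exists
}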
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:531
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
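A sketch of building a scalable vector type and halving its element count (the helper name halfWidth is hypothetical):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

llvm::VectorType *halfWidth(llvm::LLVMContext &Ctx) {
  auto *NxV4I32 = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx),
                                        llvm::ElementCount::getScalable(4));
  // <vscale x 4 x i32> -> <vscale x 2 x i32>
  return llvm::VectorType::getHalfElementsVectorType(NxV4I32);
}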
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
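A sketch of comparing sizes when scalable quantities are involved (the helper name knownSmaller is hypothetical): the relation is "known" only if it holds for every possible vscale.

#include "llvm/Support/TypeSize.h"

bool knownSmaller(llvm::TypeSize A, llvm::TypeSize B) {
  if (!A.isScalable() && !B.isScalable())
    return A.getKnownMinValue() < B.getKnownMinValue(); // exact for fixed sizes
  return llvm::TypeSize::isKnownLT(A, B); // conservative when vscale is involved
}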
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A.
Definition: APInt.cpp:2982
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:374
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:705
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1562
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
Definition: Intrinsics.cpp:617
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:960
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
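A sketch of the small numeric helpers above as they typically appear in cost computations (the helper names are hypothetical):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

unsigned numRegistersFor(unsigned NumElts, unsigned LanesPerReg) {
  return llvm::divideCeil(NumElts, LanesPerReg); // e.g. ceil(7 / 4) == 2
}

bool sanityChecks() {
  return llvm::isPowerOf2_32(32) && // power of two > 0
         llvm::Log2_32(32) == 5 &&  // floor log base 2
         llvm::has_single_bit(64u); // exactly one bit set
}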
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
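A sketch of round-tripping between an IR type and the EVT the legalizer reasons about (the helper name roundTrip is hypothetical):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

llvm::Type *roundTrip(llvm::LLVMContext &Ctx) {
  auto *V4I32 = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  llvm::EVT VT = llvm::EVT::getEVT(V4I32); // a simple EVT (v4i32)
  return VT.getTypeForEVT(Ctx);            // back to <4 x i32>
}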
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
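A sketch of filling in these peeling knobs, for example from a target's getPeelingPreferences hook (the specific values chosen here are hypothetical):

#include "llvm/Analysis/TargetTransformInfo.h"

void tunePeeling(llvm::TargetTransformInfo::PeelingPreferences &PP) {
  PP.AllowPeeling = true;           // permit peeling off loop iterations
  PP.AllowLoopNestsPeeling = false; // but not for whole loop nests
  PP.PeelProfiledIterations = true; // keep profile-guided peeling enabled
  PP.PeelCount = 0;                 // 0 leaves the choice to the cost model
}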
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
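Similarly, a sketch of adjusting the unrolling parameters above, for example from a target's getUnrollingPreferences hook (the specific values chosen here are hypothetical):

#include "llvm/Analysis/TargetTransformInfo.h"

void tuneUnrolling(llvm::TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;              // allow partial unrolling
  UP.Runtime = true;              // allow runtime unrolling
  UP.UpperBound = false;          // do not unroll using the trip-count upper bound
  UP.PartialThreshold = 150;      // cost budget for partial/runtime unrolling
  UP.OptSizeThreshold = 0;        // keep unrolling very conservative at -Os
  UP.PartialOptSizeThreshold = 0; // likewise for partial unrolling at -Os
}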