LLVM 20.0.0git
BasicTTIImpl.h
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/InstrTypes.h"
40#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
53#include <algorithm>
54#include <cassert>
55#include <cstdint>
56#include <limits>
57#include <optional>
58#include <utility>
59
60namespace llvm {
61
62class Function;
63class GlobalValue;
64class LLVMContext;
65class ScalarEvolution;
66class SCEV;
67class TargetMachine;
68
69extern cl::opt<unsigned> PartialUnrollingThreshold;
70
71/// Base class which can be used to help build a TTI implementation.
72///
73/// This class provides as much implementation of the TTI interface as is
74/// possible using the target-independent parts of the code generator.
75///
76/// In order to subclass it, your class must implement a getST() method to
77/// return the subtarget, and a getTLI() method to return the target lowering.
78/// We need these methods implemented in the derived class so that this class
79/// doesn't have to duplicate storage for them.
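80///
80/// A minimal sketch of the intended usage (hypothetical MyTTIImpl,
80/// MySubtarget and MyTargetLowering names standing in for a real target;
80/// the constructor forwarding the TargetMachine/DataLayout to the base is
80/// omitted):
80/// \code
80///   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
80///     const MySubtarget *ST;
80///     const MyTargetLowering *TLI;
80///
80///   public:
80///     const MySubtarget *getST() const { return ST; }
80///     const MyTargetLowering *getTLI() const { return TLI; }
80///   };
80/// \endcode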
80template <typename T>
81class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
82private:
85
86 /// Helper function to access this as a T.
87 T *thisT() { return static_cast<T *>(this); }
88
89 /// Estimate a cost of Broadcast as an extract and sequence of insert
90 /// operations.
91 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
94 // Broadcast cost is equal to the cost of extracting the zero'th element
95 // plus the cost of inserting it into every element of the result vector.
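    // For example, broadcasting a <4 x float> is modelled as one extract plus
    // four inserts; with a nominal cost of 1 per element operation this gives
    // an estimate of 5 (the actual per-element costs are target dependent).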
96 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
97 CostKind, 0, nullptr, nullptr);
98
99 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
100 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
101 CostKind, i, nullptr, nullptr);
102 }
103 return Cost;
104 }
105
106 /// Estimate a cost of shuffle as a sequence of extract and insert
107 /// operations.
108 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
111 // Shuffle cost is equal to the cost of extracting each element from its
112 // argument plus the cost of inserting them into the result vector.
113
114 // e.g. a <4 x float> shuffle with mask <0,5,2,7> needs to extract from
115 // index 0 of the first vector, index 1 of the second vector, index 2 of
116 // the first vector and finally index 3 of the second vector, and insert
117 // them at indices <0,1,2,3> of the result vector.
118 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
119 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
122 CostKind, i, nullptr, nullptr);
123 }
124 return Cost;
125 }
126
127 /// Estimate a cost of subvector extraction as a sequence of extract and
128 /// insert operations.
129 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
131 int Index,
132 FixedVectorType *SubVTy) {
133 assert(VTy && SubVTy &&
134 "Can only extract subvectors from vectors");
135 int NumSubElts = SubVTy->getNumElements();
136 assert((!isa<FixedVectorType>(VTy) ||
137 (Index + NumSubElts) <=
138 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
139 "SK_ExtractSubvector index out of range");
140
142 // Subvector extraction cost is equal to the cost of extracting each element from
143 // the source type plus the cost of inserting them into the result vector
144 // type.
145 for (int i = 0; i != NumSubElts; ++i) {
146 Cost +=
147 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
148 CostKind, i + Index, nullptr, nullptr);
149 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
150 CostKind, i, nullptr, nullptr);
151 }
152 return Cost;
153 }
154
155 /// Estimate a cost of subvector insertion as a sequence of extract and
156 /// insert operations.
157 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
159 int Index,
160 FixedVectorType *SubVTy) {
161 assert(VTy && SubVTy &&
162 "Can only insert subvectors into vectors");
163 int NumSubElts = SubVTy->getNumElements();
164 assert((!isa<FixedVectorType>(VTy) ||
165 (Index + NumSubElts) <=
166 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
167 "SK_InsertSubvector index out of range");
168
170 // Subvector insertion cost is equal to the cost of extracting each element from
171 // the source type plus the cost of inserting them into the result vector
172 // type.
173 for (int i = 0; i != NumSubElts; ++i) {
174 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
175 CostKind, i, nullptr, nullptr);
176 Cost +=
177 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
178 i + Index, nullptr, nullptr);
179 }
180 return Cost;
181 }
182
183 /// Local query method delegates up to T which *must* implement this!
184 const TargetSubtargetInfo *getST() const {
185 return static_cast<const T *>(this)->getST();
186 }
187
188 /// Local query method delegates up to T which *must* implement this!
189 const TargetLoweringBase *getTLI() const {
190 return static_cast<const T *>(this)->getTLI();
191 }
192
193 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
194 switch (M) {
196 return ISD::UNINDEXED;
197 case TTI::MIM_PreInc:
198 return ISD::PRE_INC;
199 case TTI::MIM_PreDec:
200 return ISD::PRE_DEC;
201 case TTI::MIM_PostInc:
202 return ISD::POST_INC;
203 case TTI::MIM_PostDec:
204 return ISD::POST_DEC;
205 }
206 llvm_unreachable("Unexpected MemIndexedMode");
207 }
208
209 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
210 Align Alignment,
211 bool VariableMask,
212 bool IsGatherScatter,
214 unsigned AddressSpace = 0) {
215 // We cannot scalarize scalable vectors, so return Invalid.
216 if (isa<ScalableVectorType>(DataTy))
218
219 auto *VT = cast<FixedVectorType>(DataTy);
220 unsigned VF = VT->getNumElements();
221
222 // Assume the target does not have support for gather/scatter operations
223 // and provide a rough estimate.
224 //
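    // As a rough decomposition (assuming full scalarization):
    //   Cost ~= [VF address extracts, gather/scatter only]
    //         + VF * scalar load/store cost
    //         + packing/unpacking of the value vector
    //         + [VF * (mask extract + branch + PHI), variable mask only]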
225 // First, compute the cost of the individual memory operations.
226 InstructionCost AddrExtractCost =
227 IsGatherScatter
230 PointerType::get(VT->getElementType(), 0), VF),
231 /*Insert=*/false, /*Extract=*/true, CostKind)
232 : 0;
233
234 // The cost of the scalar loads/stores.
235 InstructionCost MemoryOpCost =
236 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
238
239 // Next, compute the cost of packing the result in a vector.
240 InstructionCost PackingCost =
241 getScalarizationOverhead(VT, Opcode != Instruction::Store,
242 Opcode == Instruction::Store, CostKind);
243
244 InstructionCost ConditionalCost = 0;
245 if (VariableMask) {
246 // Compute the cost of conditionally executing the memory operations with
247 // variable masks. This includes extracting the individual conditions,
248 // plus the branches and PHIs needed to combine the results.
249 // NOTE: Estimating the cost of conditionally executing the memory
250 // operations accurately is quite difficult and the current solution
251 // provides a very rough estimate only.
252 ConditionalCost =
255 /*Insert=*/false, /*Extract=*/true, CostKind) +
256 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
257 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
258 }
259
260 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
261 }
262
263protected:
264 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
265 : BaseT(DL) {}
266 virtual ~BasicTTIImplBase() = default;
267
269
270public:
271 /// \name Scalar TTI Implementations
272 /// @{
274 unsigned AddressSpace, Align Alignment,
275 unsigned *Fast) const {
276 EVT E = EVT::getIntegerVT(Context, BitWidth);
277 return getTLI()->allowsMisalignedMemoryAccesses(
279 }
280
281 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
282
283 bool isSourceOfDivergence(const Value *V) { return false; }
284
285 bool isAlwaysUniform(const Value *V) { return false; }
286
287 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
288 return false;
289 }
290
291 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
292 return true;
293 }
294
296 // Return an invalid address space.
297 return -1;
298 }
299
301 Intrinsic::ID IID) const {
302 return false;
303 }
304
305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
307 }
308
309 unsigned getAssumedAddrSpace(const Value *V) const {
310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
311 }
312
313 bool isSingleThreaded() const {
314 return getTLI()->getTargetMachine().Options.ThreadModel ==
316 }
317
318 std::pair<const Value *, unsigned>
320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
321 }
322
324 Value *NewV) const {
325 return nullptr;
326 }
327
328 bool isLegalAddImmediate(int64_t imm) {
329 return getTLI()->isLegalAddImmediate(imm);
330 }
331
332 bool isLegalAddScalableImmediate(int64_t Imm) {
333 return getTLI()->isLegalAddScalableImmediate(Imm);
334 }
335
336 bool isLegalICmpImmediate(int64_t imm) {
337 return getTLI()->isLegalICmpImmediate(imm);
338 }
339
340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
342 Instruction *I = nullptr,
343 int64_t ScalableOffset = 0) {
345 AM.BaseGV = BaseGV;
346 AM.BaseOffs = BaseOffset;
347 AM.HasBaseReg = HasBaseReg;
348 AM.Scale = Scale;
349 AM.ScalableOffset = ScalableOffset;
350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
351 }
352
353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
355 }
356
357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
358 Type *ScalarValTy) const {
359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
361 EVT VT = getTLI()->getValueType(DL, SrcTy);
362 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
363 getTLI()->isOperationCustom(ISD::STORE, VT))
364 return true;
365
366 EVT ValVT =
367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
368 EVT LegalizedVT =
369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
371 };
372 while (VF > 2 && IsSupportedByTarget(VF))
373 VF /= 2;
374 return VF;
375 }
376
378 const DataLayout &DL) const {
379 EVT VT = getTLI()->getValueType(DL, Ty);
380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
381 }
382
384 const DataLayout &DL) const {
385 EVT VT = getTLI()->getValueType(DL, Ty);
386 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
387 }
388
391 }
392
395 }
396
399 }
400
403 }
404
406 StackOffset BaseOffset, bool HasBaseReg,
407 int64_t Scale, unsigned AddrSpace) {
409 AM.BaseGV = BaseGV;
410 AM.BaseOffs = BaseOffset.getFixed();
411 AM.HasBaseReg = HasBaseReg;
412 AM.Scale = Scale;
413 AM.ScalableOffset = BaseOffset.getScalable();
414 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
415 return 0;
416 return -1;
417 }
418
419 bool isTruncateFree(Type *Ty1, Type *Ty2) {
420 return getTLI()->isTruncateFree(Ty1, Ty2);
421 }
422
424 return getTLI()->isProfitableToHoist(I);
425 }
426
427 bool useAA() const { return getST()->useAA(); }
428
429 bool isTypeLegal(Type *Ty) {
430 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
431 return getTLI()->isTypeLegal(VT);
432 }
433
434 unsigned getRegUsageForType(Type *Ty) {
435 EVT ETy = getTLI()->getValueType(DL, Ty);
436 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
437 }
438
442 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
443 }
444
446 unsigned &JumpTableSize,
448 BlockFrequencyInfo *BFI) {
449 /// Try to find the estimated number of clusters. Note that the number of
450 /// clusters identified in this function could be different from the actual
451 /// numbers found in lowering. This function ignores switches that are
452 /// lowered with a mix of jump table / bit test / BTree. This function was
453 /// initially intended to be used when estimating the cost of a switch in
454 /// the inline cost heuristic, but it's a generic cost model to be used in
455 /// other places (e.g., in loop unrolling).
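456 /// For example (a sketch; exact thresholds are target dependent): a switch
456 /// with cases {0, 1, ..., 9} covers a dense range of 10, so when jump
456 /// tables are allowed it counts as a single cluster with JumpTableSize = 10,
456 /// whereas a switch with cases {0, 1000000} typically stays at 2 clusters.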
456 unsigned N = SI.getNumCases();
457 const TargetLoweringBase *TLI = getTLI();
458 const DataLayout &DL = this->getDataLayout();
459
460 JumpTableSize = 0;
461 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
462
463 // Early exit if neither a jump table nor a bit test is allowed.
464 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
465 return N;
466
467 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
468 APInt MinCaseVal = MaxCaseVal;
469 for (auto CI : SI.cases()) {
470 const APInt &CaseVal = CI.getCaseValue()->getValue();
471 if (CaseVal.sgt(MaxCaseVal))
472 MaxCaseVal = CaseVal;
473 if (CaseVal.slt(MinCaseVal))
474 MinCaseVal = CaseVal;
475 }
476
477 // Check if suitable for a bit test
478 if (N <= DL.getIndexSizeInBits(0u)) {
480 for (auto I : SI.cases())
481 Dests.insert(I.getCaseSuccessor());
482
483 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
484 DL))
485 return 1;
486 }
487
488 // Check if suitable for a jump table.
489 if (IsJTAllowed) {
490 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
491 return N;
493 (MaxCaseVal - MinCaseVal)
494 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
495 // Check whether a range of clusters is dense enough for a jump table
496 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
497 JumpTableSize = Range;
498 return 1;
499 }
500 }
501 return N;
502 }
503
505 const TargetLoweringBase *TLI = getTLI();
506 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
507 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
508 }
509
511 const TargetMachine &TM = getTLI()->getTargetMachine();
512 // In non-PIC mode, do not generate a relative lookup table.
513 if (!TM.isPositionIndependent())
514 return false;
515
516 /// Relative lookup table entries consist of 32-bit offsets.
517 /// Do not generate relative lookup tables for large code models
518 /// in 64-bit architectures where 32-bit offsets might not be enough.
519 if (TM.getCodeModel() == CodeModel::Medium ||
520 TM.getCodeModel() == CodeModel::Large)
521 return false;
522
523 const Triple &TargetTriple = TM.getTargetTriple();
524 if (!TargetTriple.isArch64Bit())
525 return false;
526
527 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
528 // there.
529 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
530 return false;
531
532 return true;
533 }
534
535 bool haveFastSqrt(Type *Ty) {
536 const TargetLoweringBase *TLI = getTLI();
537 EVT VT = TLI->getValueType(DL, Ty);
538 return TLI->isTypeLegal(VT) &&
540 }
541
543 return true;
544 }
545
547 // Check whether FADD is available, as a proxy for floating-point in
548 // general.
549 const TargetLoweringBase *TLI = getTLI();
550 EVT VT = TLI->getValueType(DL, Ty);
554 }
555
557 const Function &Fn) const {
558 switch (Inst.getOpcode()) {
559 default:
560 break;
561 case Instruction::SDiv:
562 case Instruction::SRem:
563 case Instruction::UDiv:
564 case Instruction::URem: {
565 if (!isa<ConstantInt>(Inst.getOperand(1)))
566 return false;
567 EVT VT = getTLI()->getValueType(DL, Inst.getType());
568 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
569 }
570 };
571
572 return false;
573 }
574
575 unsigned getInliningThresholdMultiplier() const { return 1; }
576 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
577 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
578 return 0;
579 }
580
581 int getInlinerVectorBonusPercent() const { return 150; }
582
586 // This unrolling functionality is target independent, but to provide some
587 // motivation for its intended use, for x86:
588
589 // According to the Intel 64 and IA-32 Architectures Optimization Reference
590 // Manual, Intel Core models and later have a loop stream detector (and
591 // associated uop queue) that can benefit from partial unrolling.
592 // The relevant requirements are:
593 // - The loop must have no more than 4 (8 for Nehalem and later) branches
594 // taken, and none of them may be calls.
595 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
596
597 // According to the Software Optimization Guide for AMD Family 15h
598 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
599 // and loop buffer which can benefit from partial unrolling.
600 // The relevant requirements are:
601 // - The loop must have fewer than 16 branches
602 // - The loop must have less than 40 uops in all executed loop branches
603
604 // The number of taken branches in a loop is hard to estimate here, and
605 // benchmarking has revealed that it is better not to be conservative when
606 // estimating the branch count. As a result, we'll ignore the branch limits
607 // until someone finds a case where it matters in practice.
608
609 unsigned MaxOps;
610 const TargetSubtargetInfo *ST = getST();
611 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
613 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
614 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
615 else
616 return;
617
618 // Scan the loop: don't unroll loops with calls.
619 for (BasicBlock *BB : L->blocks()) {
620 for (Instruction &I : *BB) {
621 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
622 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
623 if (!thisT()->isLoweredToCall(F))
624 continue;
625 }
626
627 if (ORE) {
628 ORE->emit([&]() {
629 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
630 L->getHeader())
631 << "advising against unrolling the loop because it "
632 "contains a "
633 << ore::NV("Call", &I);
634 });
635 }
636 return;
637 }
638 }
639 }
640
641 // Enable runtime and partial unrolling up to the specified size.
642 // Enable using trip count upper bound to unroll loops.
643 UP.Partial = UP.Runtime = UP.UpperBound = true;
644 UP.PartialThreshold = MaxOps;
645
646 // Avoid unrolling when optimizing for size.
647 UP.OptSizeThreshold = 0;
649
650 // Set number of instructions optimized when "back edge"
651 // becomes "fall through" to default value of 2.
652 UP.BEInsns = 2;
653 }
654
657 PP.PeelCount = 0;
658 PP.AllowPeeling = true;
659 PP.AllowLoopNestsPeeling = false;
660 PP.PeelProfiledIterations = true;
661 }
662
664 AssumptionCache &AC,
665 TargetLibraryInfo *LibInfo,
666 HardwareLoopInfo &HWLoopInfo) {
667 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
668 }
669
672 }
673
675 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
676 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
677 }
678
679 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
680 IntrinsicInst &II) {
682 }
683
684 std::optional<Value *>
686 APInt DemandedMask, KnownBits &Known,
687 bool &KnownBitsComputed) {
688 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
689 KnownBitsComputed);
690 }
691
693 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
694 APInt &UndefElts2, APInt &UndefElts3,
695 std::function<void(Instruction *, unsigned, APInt, APInt &)>
696 SimplifyAndSetOp) {
698 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
699 SimplifyAndSetOp);
700 }
701
702 virtual std::optional<unsigned>
704 return std::optional<unsigned>(
705 getST()->getCacheSize(static_cast<unsigned>(Level)));
706 }
707
708 virtual std::optional<unsigned>
710 std::optional<unsigned> TargetResult =
711 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
712
713 if (TargetResult)
714 return TargetResult;
715
716 return BaseT::getCacheAssociativity(Level);
717 }
718
719 virtual unsigned getCacheLineSize() const {
720 return getST()->getCacheLineSize();
721 }
722
723 virtual unsigned getPrefetchDistance() const {
724 return getST()->getPrefetchDistance();
725 }
726
727 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
728 unsigned NumStridedMemAccesses,
729 unsigned NumPrefetches,
730 bool HasCall) const {
731 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
732 NumPrefetches, HasCall);
733 }
734
735 virtual unsigned getMaxPrefetchIterationsAhead() const {
736 return getST()->getMaxPrefetchIterationsAhead();
737 }
738
739 virtual bool enableWritePrefetching() const {
740 return getST()->enableWritePrefetching();
741 }
742
743 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
744 return getST()->shouldPrefetchAddressSpace(AS);
745 }
746
747 /// @}
748
749 /// \name Vector TTI Implementations
750 /// @{
751
753 return TypeSize::getFixed(32);
754 }
755
756 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
757 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
758 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
759
760 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
761 /// are set if the demanded result elements need to be inserted and/or
762 /// extracted from vectors.
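763 /// For example, for a <4 x i32> vector with DemandedElts = 0b0101 and
763 /// Insert set, only lanes 0 and 2 are counted, i.e. two insertelement
763 /// costs (a rough model; targets may refine it).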
764 const APInt &DemandedElts,
765 bool Insert, bool Extract,
767 /// FIXME: a bitfield is not a reasonable abstraction for talking about
768 /// which elements are needed from a scalable vector
769 if (isa<ScalableVectorType>(InTy))
771 auto *Ty = cast<FixedVectorType>(InTy);
772
773 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
774 "Vector size mismatch");
775
777
778 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
779 if (!DemandedElts[i])
780 continue;
781 if (Insert)
782 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
783 CostKind, i, nullptr, nullptr);
784 if (Extract)
785 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
786 CostKind, i, nullptr, nullptr);
787 }
788
789 return Cost;
790 }
791
792 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
794 bool Extract,
796 if (isa<ScalableVectorType>(InTy))
798 auto *Ty = cast<FixedVectorType>(InTy);
799
800 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
801 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
802 CostKind);
803 }
804
805 /// Estimate the overhead of scalarizing an instruction's unique
806 /// non-constant operands. The (potentially vector) types to use for each
807 /// argument are passed via Tys.
812 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
813
815 SmallPtrSet<const Value*, 4> UniqueOperands;
816 for (int I = 0, E = Args.size(); I != E; I++) {
817 // Disregard things like metadata arguments.
818 const Value *A = Args[I];
819 Type *Ty = Tys[I];
820 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
821 !Ty->isPtrOrPtrVectorTy())
822 continue;
823
824 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
825 if (auto *VecTy = dyn_cast<VectorType>(Ty))
826 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
827 /*Extract*/ true, CostKind);
828 }
829 }
830
831 return Cost;
832 }
833
834 /// Estimate the overhead of scalarizing the inputs and outputs of an
835 /// instruction, with return type RetTy and arguments Args of type Tys. If
836 /// Args are unknown (empty), then the cost associated with one argument is
837 /// added as a heuristic.
843 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
844 if (!Args.empty())
846 else
847 // When no information on arguments is provided, we add the cost
848 // associated with one argument as a heuristic.
849 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
850 /*Extract*/ true, CostKind);
851
852 return Cost;
853 }
854
855 /// Estimate the cost of type-legalization and the legalized type.
856 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
857 LLVMContext &C = Ty->getContext();
858 EVT MTy = getTLI()->getValueType(DL, Ty);
859
861 // We keep legalizing the type until we find a legal kind. We assume that
862 // the only operation that costs anything is the split. After splitting
863 // we need to handle two types.
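    // For example, on a target whose widest legal vector is 128 bits,
    // <8 x i64> is split into two <4 x i64> halves, each of which is split
    // again, so this returns a cost of 4 with MVT::v2i64 as the legalized
    // type (a sketch; the exact result depends on the target's register types).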
864 while (true) {
866
868 // Ensure we return a sensible simple VT here, since many callers of
869 // this function require it.
870 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
871 return std::make_pair(InstructionCost::getInvalid(), VT);
872 }
873
874 if (LK.first == TargetLoweringBase::TypeLegal)
875 return std::make_pair(Cost, MTy.getSimpleVT());
876
877 if (LK.first == TargetLoweringBase::TypeSplitVector ||
879 Cost *= 2;
880
881 // Do not loop with f128 type.
882 if (MTy == LK.second)
883 return std::make_pair(Cost, MTy.getSimpleVT());
884
885 // Keep legalizing the type.
886 MTy = LK.second;
887 }
888 }
889
890 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
891
893 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
896 ArrayRef<const Value *> Args = std::nullopt,
897 const Instruction *CxtI = nullptr) {
898 // Check if any of the operands are vector operands.
899 const TargetLoweringBase *TLI = getTLI();
900 int ISD = TLI->InstructionOpcodeToISD(Opcode);
901 assert(ISD && "Invalid opcode");
902
903 // TODO: Handle more cost kinds.
905 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
906 Opd1Info, Opd2Info,
907 Args, CxtI);
908
909 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
910
911 bool IsFloat = Ty->isFPOrFPVectorTy();
912 // Assume that floating point arithmetic operations cost twice as much as
913 // integer operations.
914 InstructionCost OpCost = (IsFloat ? 2 : 1);
915
916 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
917 // The operation is legal. Assume it costs 1.
918 // TODO: Once we have extract/insert subvector cost we need to use them.
919 return LT.first * OpCost;
920 }
921
922 if (!TLI->isOperationExpand(ISD, LT.second)) {
923 // If the operation is custom lowered, then assume that the code is twice
924 // as expensive.
925 return LT.first * 2 * OpCost;
926 }
927
928 // An 'Expand' of URem and SRem is special because it may default
929 // to expanding the operation into a sequence of sub-operations
930 // i.e. X % Y -> X-(X/Y)*Y.
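    // So on a target where the divide (or divrem) is legal but the remainder
    // is not, the estimate below is simply DivCost + MulCost + SubCost.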
931 if (ISD == ISD::UREM || ISD == ISD::SREM) {
932 bool IsSigned = ISD == ISD::SREM;
933 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
934 LT.second) ||
935 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
936 LT.second)) {
937 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
938 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
939 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
940 InstructionCost MulCost =
941 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
942 InstructionCost SubCost =
943 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
944 return DivCost + MulCost + SubCost;
945 }
946 }
947
948 // We cannot scalarize scalable vectors, so return Invalid.
949 if (isa<ScalableVectorType>(Ty))
951
952 // Else, assume that we need to scalarize this op.
953 // TODO: If one of the types gets legalized by splitting, handle this
954 // similarly to what getCastInstrCost() does.
955 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
956 InstructionCost Cost = thisT()->getArithmeticInstrCost(
957 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
958 Args, CxtI);
959 // Return the cost of the multiple scalar invocations plus the cost of
960 // inserting and extracting the values.
961 SmallVector<Type *> Tys(Args.size(), Ty);
962 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
963 VTy->getNumElements() * Cost;
964 }
965
966 // We don't know anything about this scalar instruction.
967 return OpCost;
968 }
969
971 ArrayRef<int> Mask,
972 VectorType *Ty, int &Index,
973 VectorType *&SubTy) const {
974 if (Mask.empty())
975 return Kind;
976 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
977 switch (Kind) {
979 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
980 return TTI::SK_Reverse;
981 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
982 return TTI::SK_Broadcast;
983 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
984 (Index + Mask.size()) <= (size_t)NumSrcElts) {
985 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
987 }
988 break;
990 int NumSubElts;
991 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
992 Mask, NumSrcElts, NumSubElts, Index)) {
993 if (Index + NumSubElts > NumSrcElts)
994 return Kind;
995 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
997 }
998 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
999 return TTI::SK_Select;
1000 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1001 return TTI::SK_Transpose;
1002 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1003 return TTI::SK_Splice;
1004 break;
1005 }
1006 case TTI::SK_Select:
1007 case TTI::SK_Reverse:
1008 case TTI::SK_Broadcast:
1009 case TTI::SK_Transpose:
1012 case TTI::SK_Splice:
1013 break;
1014 }
1015 return Kind;
1016 }
1017
1019 ArrayRef<int> Mask,
1021 VectorType *SubTp,
1022 ArrayRef<const Value *> Args = std::nullopt,
1023 const Instruction *CxtI = nullptr) {
1024 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1025 case TTI::SK_Broadcast:
1026 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1027 return getBroadcastShuffleOverhead(FVT, CostKind);
1029 case TTI::SK_Select:
1030 case TTI::SK_Splice:
1031 case TTI::SK_Reverse:
1032 case TTI::SK_Transpose:
1035 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1036 return getPermuteShuffleOverhead(FVT, CostKind);
1039 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1040 cast<FixedVectorType>(SubTp));
1042 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1043 cast<FixedVectorType>(SubTp));
1044 }
1045 llvm_unreachable("Unknown TTI::ShuffleKind");
1046 }
1047
1048 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1051 const Instruction *I = nullptr) {
1052 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1053 return 0;
1054
1055 const TargetLoweringBase *TLI = getTLI();
1056 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1057 assert(ISD && "Invalid opcode");
1058 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1059 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1060
1061 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1062 TypeSize DstSize = DstLT.second.getSizeInBits();
1063 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1064 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1065
1066 switch (Opcode) {
1067 default:
1068 break;
1069 case Instruction::Trunc:
1070 // Check for NOOP conversions.
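      // (E.g. i64 -> i32 is typically free on 64-bit targets, where the
      // truncation simply uses the low half of the register.)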
1071 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1072 return 0;
1073 [[fallthrough]];
1074 case Instruction::BitCast:
1075 // Bitcasts between types that are legalized to the same type are free,
1076 // and we assume an int to/from ptr of the same size is also free.
1077 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1078 SrcSize == DstSize)
1079 return 0;
1080 break;
1081 case Instruction::FPExt:
1082 if (I && getTLI()->isExtFree(I))
1083 return 0;
1084 break;
1085 case Instruction::ZExt:
1086 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1087 return 0;
1088 [[fallthrough]];
1089 case Instruction::SExt:
1090 if (I && getTLI()->isExtFree(I))
1091 return 0;
1092
1093 // If this is a zext/sext of a load, return 0 if the corresponding
1094 // extending load exists on target and the result type is legal.
1095 if (CCH == TTI::CastContextHint::Normal) {
1096 EVT ExtVT = EVT::getEVT(Dst);
1097 EVT LoadVT = EVT::getEVT(Src);
1098 unsigned LType =
1099 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1100 if (DstLT.first == SrcLT.first &&
1101 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1102 return 0;
1103 }
1104 break;
1105 case Instruction::AddrSpaceCast:
1106 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1107 Dst->getPointerAddressSpace()))
1108 return 0;
1109 break;
1110 }
1111
1112 auto *SrcVTy = dyn_cast<VectorType>(Src);
1113 auto *DstVTy = dyn_cast<VectorType>(Dst);
1114
1115 // If the cast is marked as legal (or promote) then assume low cost.
1116 if (SrcLT.first == DstLT.first &&
1117 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1118 return SrcLT.first;
1119
1120 // Handle scalar conversions.
1121 if (!SrcVTy && !DstVTy) {
1122 // Just check the op cost. If the operation is legal then assume it costs
1123 // 1.
1124 if (!TLI->isOperationExpand(ISD, DstLT.second))
1125 return 1;
1126
1127 // Assume that illegal scalar instructions are expensive.
1128 return 4;
1129 }
1130
1131 // Check vector-to-vector casts.
1132 if (DstVTy && SrcVTy) {
1133 // If the cast is between same-sized registers, then the check is simple.
1134 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1135
1136 // Assume that Zext is done using AND.
1137 if (Opcode == Instruction::ZExt)
1138 return SrcLT.first;
1139
1140 // Assume that sext is done using SHL and SRA.
1141 if (Opcode == Instruction::SExt)
1142 return SrcLT.first * 2;
1143
1144 // Just check the op cost. If the operation is legal then assume it
1145 // costs 1 and multiply by the type-legalization overhead.
1147 if (!TLI->isOperationExpand(ISD, DstLT.second))
1148 return SrcLT.first * 1;
1149 }
1150
1151 // If we are legalizing by splitting, query the concrete TTI for the cost
1152 // of casting the original vector twice. We also need to factor in the
1153 // cost of the split itself. Count that as 1, to be consistent with
1154 // getTypeLegalizationCost().
1155 bool SplitSrc =
1156 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1158 bool SplitDst =
1159 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1161 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1162 DstVTy->getElementCount().isVector()) {
1163 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1164 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1165 T *TTI = static_cast<T *>(this);
1166 // If both types need to be split then the split is free.
1167 InstructionCost SplitCost =
1168 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1169 return SplitCost +
1170 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1171 CostKind, I));
1172 }
1173
1174 // Scalarization cost is Invalid, can't assume any num elements.
1175 if (isa<ScalableVectorType>(DstVTy))
1177
1178 // In other cases where the source or destination are illegal, assume
1179 // the operation will get scalarized.
1180 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1181 InstructionCost Cost = thisT()->getCastInstrCost(
1182 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1183
1184 // Return the cost of the multiple scalar invocations plus the cost of
1185 // inserting and extracting the values.
1186 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1187 CostKind) +
1188 Num * Cost;
1189 }
1190
1191 // We already handled vector-to-vector and scalar-to-scalar conversions.
1192 // This is where we handle bitcasts between vectors and scalars. We need
1193 // to assume that the conversion is scalarized in one way or another.
1195 if (Opcode == Instruction::BitCast) {
1196 // Illegal bitcasts are done by storing and loading from a stack slot.
1197 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1198 /*Extract*/ true, CostKind)
1199 : 0) +
1200 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1201 /*Extract*/ false, CostKind)
1202 : 0);
1203 }
1204
1205 llvm_unreachable("Unhandled cast");
1206 }
1207
1209 VectorType *VecTy, unsigned Index) {
1211 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1212 CostKind, Index, nullptr, nullptr) +
1213 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1215 }
1216
1218 const Instruction *I = nullptr) {
1219 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1220 }
1221
1222 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1223 CmpInst::Predicate VecPred,
1225 const Instruction *I = nullptr) {
1226 const TargetLoweringBase *TLI = getTLI();
1227 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1228 assert(ISD && "Invalid opcode");
1229
1230 // TODO: Handle other cost kinds.
1232 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1233 I);
1234
1235 // Selects on vectors are actually vector selects.
1236 if (ISD == ISD::SELECT) {
1237 assert(CondTy && "CondTy must exist");
1238 if (CondTy->isVectorTy())
1239 ISD = ISD::VSELECT;
1240 }
1241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1242
1243 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1244 !TLI->isOperationExpand(ISD, LT.second)) {
1245 // The operation is legal. Assume it costs 1. Multiply
1246 // by the type-legalization overhead.
1247 return LT.first * 1;
1248 }
1249
1250 // Otherwise, assume that the operation is scalarized.
1251 // TODO: If one of the types gets legalized by splitting, handle this
1252 // similarly to what getCastInstrCost() does.
1253 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1254 if (isa<ScalableVectorType>(ValTy))
1256
1257 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1258 if (CondTy)
1259 CondTy = CondTy->getScalarType();
1260 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1261 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1262
1263 // Return the cost of the multiple scalar invocations plus the cost of
1264 // inserting and extracting the values.
1265 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1266 /*Extract*/ false, CostKind) +
1267 Num * Cost;
1268 }
1269
1270 // Unknown scalar opcode.
1271 return 1;
1272 }
1273
1276 unsigned Index, Value *Op0, Value *Op1) {
1277 return getRegUsageForType(Val->getScalarType());
1278 }
1279
1282 unsigned Index) {
1283 Value *Op0 = nullptr;
1284 Value *Op1 = nullptr;
1285 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1286 Op0 = IE->getOperand(0);
1287 Op1 = IE->getOperand(1);
1288 }
1289 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1290 Op1);
1291 }
1292
1293 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1294 int VF,
1295 const APInt &DemandedDstElts,
1297 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1298 "Unexpected size of DemandedDstElts.");
1299
1301
1302 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1303 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1304
1305 // The Mask shuffling cost is to extract all the elements of the Mask
1306 // and to insert each of them Factor times into the wide vector:
1307 //
1308 // E.g. an interleaved group with factor 3:
1309 // %mask = icmp ult <8 x i32> %vec1, %vec2
1310 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1311 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1312 // The cost is estimated as extract all mask elements from the <8xi1> mask
1313 // vector and insert them factor times into the <24xi1> shuffled mask
1314 // vector.
1315 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1316 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1317 /*Insert*/ false,
1318 /*Extract*/ true, CostKind);
1319 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1320 /*Insert*/ true,
1321 /*Extract*/ false, CostKind);
1322
1323 return Cost;
1324 }
1325
1327 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1330 const Instruction *I = nullptr) {
1331 assert(!Src->isVoidTy() && "Invalid type");
1332 // Assume types, such as structs, are expensive.
1333 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1334 return 4;
1335 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1336
1337 // Assuming that all loads of legal types cost 1.
1338 InstructionCost Cost = LT.first;
1340 return Cost;
1341
1342 const DataLayout &DL = this->getDataLayout();
1343 if (Src->isVectorTy() &&
1344 // In practice it's not currently possible to have a change in lane
1345 // length for extending loads or truncating stores so both types should
1346 // have the same scalable property.
1348 LT.second.getSizeInBits())) {
1349 // This is a vector load that legalizes to a larger type than the vector
1350 // itself. Unless the corresponding extending load or truncating store is
1351 // legal, then this will scalarize.
1353 EVT MemVT = getTLI()->getValueType(DL, Src);
1354 if (Opcode == Instruction::Store)
1355 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1356 else
1357 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1358
1359 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1360 // This is a vector load/store for some illegal type that is scalarized.
1361 // We must account for the cost of building or decomposing the vector.
1363 cast<VectorType>(Src), Opcode != Instruction::Store,
1364 Opcode == Instruction::Store, CostKind);
1365 }
1366 }
1367
1368 return Cost;
1369 }
1370
1372 Align Alignment, unsigned AddressSpace,
1374 // TODO: Pass on AddressSpace when we have test coverage.
1375 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1376 CostKind);
1377 }
1378
1380 const Value *Ptr, bool VariableMask,
1381 Align Alignment,
1383 const Instruction *I = nullptr) {
1384 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1385 true, CostKind);
1386 }
1387
1389 const Value *Ptr, bool VariableMask,
1390 Align Alignment,
1392 const Instruction *I) {
1393 // For a target without strided memory operations (or for an illegal
1394 // operation type on one which does), assume we lower to a gather/scatter
1395 // operation. (Which may in turn be scalarized.)
1396 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1397 Alignment, CostKind, I);
1398 }
1399
1401 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1402 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1403 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1404
1405 // We cannot scalarize scalable vectors, so return Invalid.
1406 if (isa<ScalableVectorType>(VecTy))
1408
1409 auto *VT = cast<FixedVectorType>(VecTy);
1410
1411 unsigned NumElts = VT->getNumElements();
1412 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1413
1414 unsigned NumSubElts = NumElts / Factor;
1415 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1416
1417 // First, the cost of the load/store operation.
1419 if (UseMaskForCond || UseMaskForGaps)
1420 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1422 else
1423 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1424 CostKind);
1425
1426 // Legalize the vector type, and get the legalized and unlegalized type
1427 // sizes.
1428 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1429 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1430 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1431
1432 // Scale the cost of the memory operation by the fraction of legalized
1433 // instructions that will actually be used. We shouldn't account for the
1434 // cost of dead instructions since they will be removed.
1435 //
1436 // E.g., An interleaved load of factor 8:
1437 // %vec = load <16 x i64>, <16 x i64>* %ptr
1438 // %v0 = shufflevector %vec, undef, <0, 8>
1439 //
1440 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1441 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1442 // type). The other loads are unused.
1443 //
1444 // TODO: Note that legalization can turn masked loads/stores into unmasked
1445 // (legalized) loads/stores. This can be reflected in the cost.
1446 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1447 // The number of loads of a legal type it will take to represent a load
1448 // of the unlegalized vector type.
1449 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1450
1451 // The number of elements of the unlegalized type that correspond to a
1452 // single legal instruction.
1453 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1454
1455 // Determine which legal instructions will be used.
1456 BitVector UsedInsts(NumLegalInsts, false);
1457 for (unsigned Index : Indices)
1458 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1459 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1460
1461 // Scale the cost of the load by the fraction of legal instructions that
1462 // will be used.
1463 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1464 }
1465
1466 // Then add the cost of the interleave operation.
1467 assert(Indices.size() <= Factor &&
1468 "Interleaved memory op has too many members");
1469
1470 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1471 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1472
1473 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1474 for (unsigned Index : Indices) {
1475 assert(Index < Factor && "Invalid index for interleaved memory op");
1476 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1477 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1478 }
1479
1480 if (Opcode == Instruction::Load) {
1481 // The interleave cost is similar to extracting the sub vectors' elements
1482 // from the wide vector and inserting them into the sub vectors.
1483 //
1484 // E.g. An interleaved load of factor 2 (with one member of index 0):
1485 // %vec = load <8 x i32>, <8 x i32>* %ptr
1486 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1487 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1488 // <8 x i32> vector and insert them into a <4 x i32> vector.
1489 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1490 SubVT, DemandedAllSubElts,
1491 /*Insert*/ true, /*Extract*/ false, CostKind);
1492 Cost += Indices.size() * InsSubCost;
1493 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1494 /*Insert*/ false,
1495 /*Extract*/ true, CostKind);
1496 } else {
1497 // The interleave cost is extracting the elements from the sub vectors
1498 // and inserting them into the wide vector.
1499 //
1500 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1501 // (using VF=4):
1502 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1503 // %gaps.mask = <true, true, false, true, true, false,
1504 // true, true, false, true, true, false>
1505 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1506 // i32 Align, <12 x i1> %gaps.mask
1507 // The cost is estimated as extract all elements (of actual members,
1508 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1509 // i32> vector.
1510 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1511 SubVT, DemandedAllSubElts,
1512 /*Insert*/ false, /*Extract*/ true, CostKind);
1513 Cost += ExtSubCost * Indices.size();
1514 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1515 /*Insert*/ true,
1516 /*Extract*/ false, CostKind);
1517 }
1518
1519 if (!UseMaskForCond)
1520 return Cost;
1521
1522 Type *I8Type = Type::getInt8Ty(VT->getContext());
1523
1524 Cost += thisT()->getReplicationShuffleCost(
1525 I8Type, Factor, NumSubElts,
1526 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1527 CostKind);
1528
1529 // The Gaps mask is invariant and created outside the loop, therefore the
1530 // cost of creating it is not accounted for here. However if we have both
1531 // a MaskForGaps and some other mask that guards the execution of the
1532 // memory access, we need to account for the cost of And-ing the two masks
1533 // inside the loop.
1534 if (UseMaskForGaps) {
1535 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1536 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1537 CostKind);
1538 }
1539
1540 return Cost;
1541 }
1542
1543 /// Get intrinsic cost based on arguments.
1546 // Check for generically free intrinsics.
1548 return 0;
1549
1550 // Assume that target intrinsics are cheap.
1551 Intrinsic::ID IID = ICA.getID();
1554
1555 if (ICA.isTypeBasedOnly())
1557
1558 Type *RetTy = ICA.getReturnType();
1559
1560 ElementCount RetVF =
1561 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1563 const IntrinsicInst *I = ICA.getInst();
1564 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1565 FastMathFlags FMF = ICA.getFlags();
1566 switch (IID) {
1567 default:
1568 break;
1569
1570 case Intrinsic::powi:
1571 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1572 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1573 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1574 ShouldOptForSize)) {
1575 // The cost is modeled on the expansion performed by ExpandPowI in
1576 // SelectionDAGBuilder.
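          // For example, powi(x, 13): |13| = 0b1101, so ActiveBits = 4 and
          // PopCount = 3, giving (4 + 3 - 2) = 5 fmuls via square-and-multiply;
          // a negative exponent adds one fdiv on top.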
1577 APInt Exponent = RHSC->getValue().abs();
1578 unsigned ActiveBits = Exponent.getActiveBits();
1579 unsigned PopCount = Exponent.popcount();
1580 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1581 thisT()->getArithmeticInstrCost(
1582 Instruction::FMul, RetTy, CostKind);
1583 if (RHSC->isNegative())
1584 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1585 CostKind);
1586 return Cost;
1587 }
1588 }
1589 break;
1590 case Intrinsic::cttz:
1591 // FIXME: If necessary, this should go in target-specific overrides.
1592 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1594 break;
1595
1596 case Intrinsic::ctlz:
1597 // FIXME: If necessary, this should go in target-specific overrides.
1598 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1600 break;
1601
1602 case Intrinsic::memcpy:
1603 return thisT()->getMemcpyCost(ICA.getInst());
1604
1605 case Intrinsic::masked_scatter: {
1606 const Value *Mask = Args[3];
1607 bool VarMask = !isa<Constant>(Mask);
1608 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1609 return thisT()->getGatherScatterOpCost(Instruction::Store,
1610 ICA.getArgTypes()[0], Args[1],
1611 VarMask, Alignment, CostKind, I);
1612 }
1613 case Intrinsic::masked_gather: {
1614 const Value *Mask = Args[2];
1615 bool VarMask = !isa<Constant>(Mask);
1616 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1617 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1618 VarMask, Alignment, CostKind, I);
1619 }
1620 case Intrinsic::experimental_vp_strided_store: {
1621 const Value *Data = Args[0];
1622 const Value *Ptr = Args[1];
1623 const Value *Mask = Args[3];
1624 const Value *EVL = Args[4];
1625 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1626 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1627 Align Alignment =
1628 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1629 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1630 Data->getType(), Ptr, VarMask,
1631 Alignment, CostKind, I);
1632 }
1633 case Intrinsic::experimental_vp_strided_load: {
1634 const Value *Ptr = Args[0];
1635 const Value *Mask = Args[2];
1636 const Value *EVL = Args[3];
1637 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1638 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1639 Align Alignment =
1640 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1641 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1642 VarMask, Alignment, CostKind, I);
1643 }
1644 case Intrinsic::experimental_stepvector: {
1645 if (isa<ScalableVectorType>(RetTy))
1647 // The cost of materialising a constant integer vector.
1649 }
1650 case Intrinsic::vector_extract: {
1651 // FIXME: Handle case where a scalable vector is extracted from a scalable
1652 // vector
1653 if (isa<ScalableVectorType>(RetTy))
1655 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1656 return thisT()->getShuffleCost(
1657 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1658 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1659 }
1660 case Intrinsic::vector_insert: {
1661 // FIXME: Handle case where a scalable vector is inserted into a scalable
1662 // vector
1663 if (isa<ScalableVectorType>(Args[1]->getType()))
1665 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1666 return thisT()->getShuffleCost(
1667 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1668 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1669 }
1670 case Intrinsic::vector_reverse: {
1671 return thisT()->getShuffleCost(
1672 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1673 CostKind, 0, cast<VectorType>(RetTy));
1674 }
1675 case Intrinsic::vector_splice: {
1676 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1677 return thisT()->getShuffleCost(
1678 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1679 CostKind, Index, cast<VectorType>(RetTy));
1680 }
1681 case Intrinsic::vector_reduce_add:
1682 case Intrinsic::vector_reduce_mul:
1683 case Intrinsic::vector_reduce_and:
1684 case Intrinsic::vector_reduce_or:
1685 case Intrinsic::vector_reduce_xor:
1686 case Intrinsic::vector_reduce_smax:
1687 case Intrinsic::vector_reduce_smin:
1688 case Intrinsic::vector_reduce_fmax:
1689 case Intrinsic::vector_reduce_fmin:
1690 case Intrinsic::vector_reduce_fmaximum:
1691 case Intrinsic::vector_reduce_fminimum:
1692 case Intrinsic::vector_reduce_umax:
1693 case Intrinsic::vector_reduce_umin: {
1694 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1696 }
1697 case Intrinsic::vector_reduce_fadd:
1698 case Intrinsic::vector_reduce_fmul: {
1700 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1702 }
1703 case Intrinsic::fshl:
1704 case Intrinsic::fshr: {
1705 const Value *X = Args[0];
1706 const Value *Y = Args[1];
1707 const Value *Z = Args[2];
1710 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1711 const TTI::OperandValueInfo OpInfoBW =
1713 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1714 : TTI::OP_None};
1715
1716 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1717 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
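      // The estimate below therefore sums Or + Sub + Shl + LShr, adds a URem
      // when the shift amount is not a constant, and for non-rotates (X != Y)
      // adds an ICmp + Select to guard the shift-by-zero case.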
1719 Cost +=
1720 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1721 Cost +=
1722 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1723 Cost += thisT()->getArithmeticInstrCost(
1724 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1725 {OpInfoZ.Kind, TTI::OP_None});
1726 Cost += thisT()->getArithmeticInstrCost(
1727 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1728 {OpInfoZ.Kind, TTI::OP_None});
1729 // Non-constant shift amounts require a modulo.
1730 if (!OpInfoZ.isConstant())
1731 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1732 CostKind, OpInfoZ, OpInfoBW);
1733 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1734 if (X != Y) {
1735 Type *CondTy = RetTy->getWithNewBitWidth(1);
1736 Cost +=
1737 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1739 Cost +=
1740 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1742 }
1743 return Cost;
1744 }
1745 case Intrinsic::get_active_lane_mask: {
1746 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1747 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1748
1749 // If we're not expanding the intrinsic then we assume this is cheap
1750 // to implement.
1751 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1752 return getTypeLegalizationCost(RetTy).first;
1753 }
1754
1755 // Create the expanded types that will be used to calculate the uadd_sat
1756 // operation.
1757 Type *ExpRetTy = VectorType::get(
1758 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1759 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1760 InstructionCost Cost =
1761 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1762 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1763 CmpInst::ICMP_ULT, CostKind);
1764 return Cost;
1765 }
1766 case Intrinsic::experimental_cttz_elts: {
1767 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1768
1769 // If we're not expanding the intrinsic then we assume this is cheap
1770 // to implement.
1771 if (!getTLI()->shouldExpandCttzElements(ArgType))
1772 return getTypeLegalizationCost(RetTy).first;
1773
1774 // TODO: The costs below reflect the expansion code in
1775 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1776 // favour of compile time.
1777
1778 // Find the smallest "sensible" element type to use for the expansion.
1779 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1780 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1781 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1782 VScaleRange = getVScaleRange(I->getCaller(), 64);
1783
1784 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1785 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1786 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1787
1788 // Create the new vector type & get the vector length
1789 Type *NewVecTy = VectorType::get(
1790 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1791
1792 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1793 NewVecTy, {}, FMF);
1794 InstructionCost Cost =
1795 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1796
1797 Cost +=
1798 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1799 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1800 Args[0]->getType(),
1801 TTI::CastContextHint::None, CostKind);
1802 Cost +=
1803 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1804
1805 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1806 NewEltTy, NewVecTy, FMF, I, 1);
1807 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1808 Cost +=
1809 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1810
1811 return Cost;
1812 }
1813 }
1814
1815 // VP Intrinsics should have the same cost as their non-vp counterpart.
1816 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1817 // counterpart when the vector length argument is smaller than the maximum
1818 // vector length.
1819 // TODO: Support other kinds of VPIntrinsics
1820 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1821 std::optional<unsigned> FOp =
1822 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1823 if (FOp) {
1824 if (ICA.getID() == Intrinsic::vp_load) {
1825 Align Alignment;
1826 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1827 Alignment = VPI->getPointerAlignment().valueOrOne();
1828 unsigned AS = 0;
1829 if (ICA.getArgs().size() > 1)
1830 if (auto *PtrTy =
1831 dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1832 AS = PtrTy->getAddressSpace();
1833 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1834 AS, CostKind);
1835 }
1836 if (ICA.getID() == Intrinsic::vp_store) {
1837 Align Alignment;
1838 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1839 Alignment = VPI->getPointerAlignment().valueOrOne();
1840 unsigned AS = 0;
1841 if (ICA.getArgs().size() >= 2)
1842 if (auto *PtrTy =
1843 dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1844 AS = PtrTy->getAddressSpace();
1845 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1846 AS, CostKind);
1847 }
1848 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID()))
1849 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1850 CostKind);
1851 }
1852 }
1853
1854 std::optional<Intrinsic::ID> FID =
1855 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1856 if (FID) {
1857 // Non-vp version will have same Args/Tys except mask and vector length.
1858 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1859 "Expected VPIntrinsic to have Mask and Vector Length args and "
1860 "types");
1862
1863 // VPReduction intrinsics have a start value argument that their non-vp
1864 // counterparts do not have, except for the fadd and fmul non-vp
1865 // counterpart.
1866 if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
1867 *FID != Intrinsic::vector_reduce_fadd &&
1868 *FID != Intrinsic::vector_reduce_fmul)
1869 NewTys = NewTys.drop_front();
1870
1871 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1872 ICA.getFlags());
1873 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1874 }
1875 }
1876
1877 // Assume that we need to scalarize this intrinsic.
1878 // Compute the scalarization overhead based on Args for a vector
1879 // intrinsic.
1880 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1881 if (RetVF.isVector() && !RetVF.isScalable()) {
1882 ScalarizationCost = 0;
1883 if (!RetTy->isVoidTy())
1884 ScalarizationCost += getScalarizationOverhead(
1885 cast<VectorType>(RetTy),
1886 /*Insert*/ true, /*Extract*/ false, CostKind);
1887 ScalarizationCost +=
1888 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1889 }
1890
1891 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1892 ScalarizationCost);
1893 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1894 }
1895
1896 /// Get intrinsic cost based on argument types.
1897 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1898 /// cost of scalarizing the arguments and the return value will be computed
1899 /// based on types.
1900 InstructionCost
1901 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1902 TTI::TargetCostKind CostKind) {
1903 Intrinsic::ID IID = ICA.getID();
1904 Type *RetTy = ICA.getReturnType();
1905 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1906 FastMathFlags FMF = ICA.getFlags();
1907 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1908 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1909
1910 VectorType *VecOpTy = nullptr;
1911 if (!Tys.empty()) {
1912 // The vector reduction operand is operand 0 except for fadd/fmul.
1913 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1914 unsigned VecTyIndex = 0;
1915 if (IID == Intrinsic::vector_reduce_fadd ||
1916 IID == Intrinsic::vector_reduce_fmul)
1917 VecTyIndex = 1;
1918 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1919 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1920 }
1921
1922 // Library call cost - other than size, make it expensive.
1923 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1924 unsigned ISD = 0;
1925 switch (IID) {
1926 default: {
1927 // Scalable vectors cannot be scalarized, so return Invalid.
1928 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1929 return isa<ScalableVectorType>(Ty);
1930 }))
1931 return InstructionCost::getInvalid();
1932
1933 // Assume that we need to scalarize this intrinsic.
1934 InstructionCost ScalarizationCost =
1935 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1936 unsigned ScalarCalls = 1;
1937 Type *ScalarRetTy = RetTy;
1938 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1939 if (!SkipScalarizationCost)
1940 ScalarizationCost = getScalarizationOverhead(
1941 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1942 ScalarCalls = std::max(ScalarCalls,
1943 cast<FixedVectorType>(RetVTy)->getNumElements());
1944 ScalarRetTy = RetTy->getScalarType();
1945 }
1946 SmallVector<Type *, 4> ScalarTys;
1947 for (Type *Ty : Tys) {
1948 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1949 if (!SkipScalarizationCost)
1950 ScalarizationCost += getScalarizationOverhead(
1951 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1952 ScalarCalls = std::max(ScalarCalls,
1953 cast<FixedVectorType>(VTy)->getNumElements());
1954 Ty = Ty->getScalarType();
1955 }
1956 ScalarTys.push_back(Ty);
1957 }
1958 if (ScalarCalls == 1)
1959 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1960
1961 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1962 InstructionCost ScalarCost =
1963 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1964
1965 return ScalarCalls * ScalarCost + ScalarizationCost;
1966 }
1967 // Look for intrinsics that can be lowered directly or turned into a scalar
1968 // intrinsic call.
1969 case Intrinsic::sqrt:
1970 ISD = ISD::FSQRT;
1971 break;
1972 case Intrinsic::sin:
1973 ISD = ISD::FSIN;
1974 break;
1975 case Intrinsic::cos:
1976 ISD = ISD::FCOS;
1977 break;
1978 case Intrinsic::tan:
1979 ISD = ISD::FTAN;
1980 break;
1981 case Intrinsic::asin:
1982 ISD = ISD::FASIN;
1983 break;
1984 case Intrinsic::acos:
1985 ISD = ISD::FACOS;
1986 break;
1987 case Intrinsic::atan:
1988 ISD = ISD::FATAN;
1989 break;
1990 case Intrinsic::sinh:
1991 ISD = ISD::FSINH;
1992 break;
1993 case Intrinsic::cosh:
1994 ISD = ISD::FCOSH;
1995 break;
1996 case Intrinsic::tanh:
1997 ISD = ISD::FTANH;
1998 break;
1999 case Intrinsic::exp:
2000 ISD = ISD::FEXP;
2001 break;
2002 case Intrinsic::exp2:
2003 ISD = ISD::FEXP2;
2004 break;
2005 case Intrinsic::exp10:
2006 ISD = ISD::FEXP10;
2007 break;
2008 case Intrinsic::log:
2009 ISD = ISD::FLOG;
2010 break;
2011 case Intrinsic::log10:
2012 ISD = ISD::FLOG10;
2013 break;
2014 case Intrinsic::log2:
2015 ISD = ISD::FLOG2;
2016 break;
2017 case Intrinsic::fabs:
2018 ISD = ISD::FABS;
2019 break;
2020 case Intrinsic::canonicalize:
2021 ISD = ISD::FCANONICALIZE;
2022 break;
2023 case Intrinsic::minnum:
2024 ISD = ISD::FMINNUM;
2025 break;
2026 case Intrinsic::maxnum:
2027 ISD = ISD::FMAXNUM;
2028 break;
2029 case Intrinsic::minimum:
2030 ISD = ISD::FMINIMUM;
2031 break;
2032 case Intrinsic::maximum:
2033 ISD = ISD::FMAXIMUM;
2034 break;
2035 case Intrinsic::minimumnum:
2036 ISD = ISD::FMINIMUMNUM;
2037 break;
2038 case Intrinsic::maximumnum:
2039 ISD = ISD::FMAXIMUMNUM;
2040 break;
2041 case Intrinsic::copysign:
2042 ISD = ISD::FCOPYSIGN;
2043 break;
2044 case Intrinsic::floor:
2045 ISD = ISD::FFLOOR;
2046 break;
2047 case Intrinsic::ceil:
2048 ISD = ISD::FCEIL;
2049 break;
2050 case Intrinsic::trunc:
2051 ISD = ISD::FTRUNC;
2052 break;
2053 case Intrinsic::nearbyint:
2054 ISD = ISD::FNEARBYINT;
2055 break;
2056 case Intrinsic::rint:
2057 ISD = ISD::FRINT;
2058 break;
2059 case Intrinsic::lrint:
2060 ISD = ISD::LRINT;
2061 break;
2062 case Intrinsic::llrint:
2063 ISD = ISD::LLRINT;
2064 break;
2065 case Intrinsic::round:
2066 ISD = ISD::FROUND;
2067 break;
2068 case Intrinsic::roundeven:
2069 ISD = ISD::FROUNDEVEN;
2070 break;
2071 case Intrinsic::pow:
2072 ISD = ISD::FPOW;
2073 break;
2074 case Intrinsic::fma:
2075 ISD = ISD::FMA;
2076 break;
2077 case Intrinsic::fmuladd:
2078 ISD = ISD::FMA;
2079 break;
2080 case Intrinsic::experimental_constrained_fmuladd:
2081 ISD = ISD::STRICT_FMA;
2082 break;
2083 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2084 case Intrinsic::lifetime_start:
2085 case Intrinsic::lifetime_end:
2086 case Intrinsic::sideeffect:
2087 case Intrinsic::pseudoprobe:
2088 case Intrinsic::arithmetic_fence:
2089 return 0;
2090 case Intrinsic::masked_store: {
2091 Type *Ty = Tys[0];
2092 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2093 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2094 CostKind);
2095 }
2096 case Intrinsic::masked_load: {
2097 Type *Ty = RetTy;
2098 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2099 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2100 CostKind);
2101 }
2102 case Intrinsic::vector_reduce_add:
2103 case Intrinsic::vector_reduce_mul:
2104 case Intrinsic::vector_reduce_and:
2105 case Intrinsic::vector_reduce_or:
2106 case Intrinsic::vector_reduce_xor:
2107 return thisT()->getArithmeticReductionCost(
2108 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2109 CostKind);
2110 case Intrinsic::vector_reduce_fadd:
2111 case Intrinsic::vector_reduce_fmul:
2112 return thisT()->getArithmeticReductionCost(
2113 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2114 case Intrinsic::vector_reduce_smax:
2115 case Intrinsic::vector_reduce_smin:
2116 case Intrinsic::vector_reduce_umax:
2117 case Intrinsic::vector_reduce_umin:
2118 case Intrinsic::vector_reduce_fmax:
2119 case Intrinsic::vector_reduce_fmin:
2120 case Intrinsic::vector_reduce_fmaximum:
2121 case Intrinsic::vector_reduce_fminimum:
2122 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2123 VecOpTy, ICA.getFlags(), CostKind);
2124 case Intrinsic::abs:
2125 ISD = ISD::ABS;
2126 break;
2127 case Intrinsic::smax:
2128 ISD = ISD::SMAX;
2129 break;
2130 case Intrinsic::smin:
2131 ISD = ISD::SMIN;
2132 break;
2133 case Intrinsic::umax:
2134 ISD = ISD::UMAX;
2135 break;
2136 case Intrinsic::umin:
2137 ISD = ISD::UMIN;
2138 break;
2139 case Intrinsic::sadd_sat:
2140 ISD = ISD::SADDSAT;
2141 break;
2142 case Intrinsic::ssub_sat:
2143 ISD = ISD::SSUBSAT;
2144 break;
2145 case Intrinsic::uadd_sat:
2146 ISD = ISD::UADDSAT;
2147 break;
2148 case Intrinsic::usub_sat:
2149 ISD = ISD::USUBSAT;
2150 break;
2151 case Intrinsic::smul_fix:
2152 ISD = ISD::SMULFIX;
2153 break;
2154 case Intrinsic::umul_fix:
2155 ISD = ISD::UMULFIX;
2156 break;
2157 case Intrinsic::sadd_with_overflow:
2158 ISD = ISD::SADDO;
2159 break;
2160 case Intrinsic::ssub_with_overflow:
2161 ISD = ISD::SSUBO;
2162 break;
2163 case Intrinsic::uadd_with_overflow:
2164 ISD = ISD::UADDO;
2165 break;
2166 case Intrinsic::usub_with_overflow:
2167 ISD = ISD::USUBO;
2168 break;
2169 case Intrinsic::smul_with_overflow:
2170 ISD = ISD::SMULO;
2171 break;
2172 case Intrinsic::umul_with_overflow:
2173 ISD = ISD::UMULO;
2174 break;
2175 case Intrinsic::fptosi_sat:
2176 ISD = ISD::FP_TO_SINT_SAT;
2177 break;
2178 case Intrinsic::fptoui_sat:
2179 ISD = ISD::FP_TO_UINT_SAT;
2180 break;
2181 case Intrinsic::ctpop:
2182 ISD = ISD::CTPOP;
2183 // In case of legalization use TCC_Expensive. This is cheaper than a
2184 // library call but still not a cheap instruction.
2185 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2186 break;
2187 case Intrinsic::ctlz:
2188 ISD = ISD::CTLZ;
2189 break;
2190 case Intrinsic::cttz:
2191 ISD = ISD::CTTZ;
2192 break;
2193 case Intrinsic::bswap:
2194 ISD = ISD::BSWAP;
2195 break;
2196 case Intrinsic::bitreverse:
2197 ISD = ISD::BITREVERSE;
2198 break;
2199 }
2200
2201 auto *ST = dyn_cast<StructType>(RetTy);
2202 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2203 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2204
2205 const TargetLoweringBase *TLI = getTLI();
2206
2207 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2208 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2209 TLI->isFAbsFree(LT.second)) {
2210 return 0;
2211 }
2212
2213 // The operation is legal. Assume it costs 1.
2214 // If the type is split to multiple registers, assume that there is some
2215 // overhead to this.
2216 // TODO: Once we have extract/insert subvector cost we need to use them.
2217 if (LT.first > 1)
2218 return (LT.first * 2);
2219 else
2220 return (LT.first * 1);
2221 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2222 // If the operation is custom lowered then assume
2223 // that the code is twice as expensive.
2224 return (LT.first * 2);
2225 }
2226
2227 switch (IID) {
2228 case Intrinsic::fmuladd: {
2229 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2230 // point mul followed by an add.
2231
2232 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2233 CostKind) +
2234 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2235 CostKind);
2236 }
2237 case Intrinsic::experimental_constrained_fmuladd: {
2238 IntrinsicCostAttributes FMulAttrs(
2239 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2240 IntrinsicCostAttributes FAddAttrs(
2241 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2242 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2243 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2244 }
2245 case Intrinsic::smin:
2246 case Intrinsic::smax:
2247 case Intrinsic::umin:
2248 case Intrinsic::umax: {
2249 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2250 Type *CondTy = RetTy->getWithNewBitWidth(1);
2251 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2252 CmpInst::Predicate Pred =
2253 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2254 InstructionCost Cost = 0;
2255 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2256 Pred, CostKind);
2257 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2258 Pred, CostKind);
2259 return Cost;
2260 }
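As a rough sketch of the expansion assumed in this case (plain C++, illustrative names only, not LLVM code), a scalar smax decomposes into exactly the compare and select that are costed; umin, umax and smin differ only in the predicate of the compare:

#include <cstdint>

// smax(X, Y) == select(icmp_sgt(X, Y), X, Y)
static int32_t smax_expanded(int32_t X, int32_t Y) {
  bool Cmp = X > Y;   // icmp sgt
  return Cmp ? X : Y; // select
}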
2261 case Intrinsic::sadd_with_overflow:
2262 case Intrinsic::ssub_with_overflow: {
2263 Type *SumTy = RetTy->getContainedType(0);
2264 Type *OverflowTy = RetTy->getContainedType(1);
2265 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2266 ? BinaryOperator::Add
2267 : BinaryOperator::Sub;
2268
2269 // Add:
2270 // Overflow -> (Result < LHS) ^ (RHS < 0)
2271 // Sub:
2272 // Overflow -> (Result < LHS) ^ (RHS > 0)
2273 InstructionCost Cost = 0;
2274 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2275 Cost +=
2276 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2277 CmpInst::ICMP_SGT, CostKind);
2278 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2279 CostKind);
2280 return Cost;
2281 }
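The overflow formula in the comments above can be checked with a small scalar sketch (plain C++, hypothetical helper, not part of this header): the add is done in wrapping arithmetic and the flag costs two compares plus an xor, matching the operations priced here.

#include <cstdint>

// Overflow for sadd.with.overflow: (Result < LHS) ^ (RHS < 0).
static bool sadd_overflows(int32_t LHS, int32_t RHS, int32_t &Result) {
  Result = (int32_t)((uint32_t)LHS + (uint32_t)RHS); // wrapping add
  return (Result < LHS) ^ (RHS < 0);                 // icmp, icmp, xor
}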
2282 case Intrinsic::uadd_with_overflow:
2283 case Intrinsic::usub_with_overflow: {
2284 Type *SumTy = RetTy->getContainedType(0);
2285 Type *OverflowTy = RetTy->getContainedType(1);
2286 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2287 ? BinaryOperator::Add
2288 : BinaryOperator::Sub;
2289 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2290 ? CmpInst::ICMP_ULT
2291 : CmpInst::ICMP_UGT;
2292
2293 InstructionCost Cost = 0;
2294 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2295 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2296 OverflowTy, Pred, CostKind);
2297 return Cost;
2298 }
2299 case Intrinsic::smul_with_overflow:
2300 case Intrinsic::umul_with_overflow: {
2301 Type *MulTy = RetTy->getContainedType(0);
2302 Type *OverflowTy = RetTy->getContainedType(1);
2303 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2304 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2305 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2306
2307 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2308 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2309
2310 InstructionCost Cost = 0;
2311 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2312 Cost +=
2313 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2314 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2315 CCH, CostKind);
2316 Cost += thisT()->getArithmeticInstrCost(
2317 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2318 {TTI::OK_UniformConstantValue, TTI::OP_None});
2319
2320 if (IsSigned)
2321 Cost += thisT()->getArithmeticInstrCost(
2322 Instruction::AShr, MulTy, CostKind,
2323 {TTI::OK_AnyValue, TTI::OP_None},
2324 {TTI::OK_UniformConstantValue, TTI::OP_None});
2325
2326 Cost += thisT()->getCmpSelInstrCost(
2327 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2328 return Cost;
2329 }
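As a sketch of the widened-multiply expansion costed above (plain C++, illustrative only), umul.with.overflow on i32 becomes one 64-bit multiply, a truncate of the low half, a shift to recover the high half, and a compare against zero; the signed form additionally compares against the sign bits of the low half, which is the extra AShr priced above.

#include <cstdint>

static bool umul_overflows(uint32_t LHS, uint32_t RHS, uint32_t &Result) {
  uint64_t Wide = (uint64_t)LHS * (uint64_t)RHS; // zext + zext + mul
  Result = (uint32_t)Wide;                       // trunc
  uint32_t Hi = (uint32_t)(Wide >> 32);          // lshr + trunc
  return Hi != 0;                                // icmp ne
}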
2330 case Intrinsic::sadd_sat:
2331 case Intrinsic::ssub_sat: {
2332 // Assume a default expansion.
2333 Type *CondTy = RetTy->getWithNewBitWidth(1);
2334
2335 Type *OpTy = StructType::create({RetTy, CondTy});
2336 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2337 ? Intrinsic::sadd_with_overflow
2338 : Intrinsic::ssub_with_overflow;
2339 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2340
2341 // SatMax -> Overflow && SumDiff < 0
2342 // SatMin -> Overflow && SumDiff >= 0
2343 InstructionCost Cost = 0;
2344 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2345 nullptr, ScalarizationCostPassed);
2346 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2347 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2348 Pred, CostKind);
2349 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2350 CondTy, Pred, CostKind);
2351 return Cost;
2352 }
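A scalar sketch of the default expansion costed in this case (plain C++, invented names): an overflow-detecting add, then selects that clamp to the saturation bound whose sign is opposite to the wrapped sum, which is exactly the SatMax/SatMin rule in the comments above.

#include <cstdint>
#include <limits>

static int32_t sadd_sat32(int32_t X, int32_t Y) {
  int32_t Sum = (int32_t)((uint32_t)X + (uint32_t)Y);          // sadd.with.overflow value
  bool Overflow = (Sum < X) ^ (Y < 0);                         // sadd.with.overflow flag
  int32_t Sat = Sum < 0 ? std::numeric_limits<int32_t>::max()
                        : std::numeric_limits<int32_t>::min(); // icmp + select
  return Overflow ? Sat : Sum;                                 // select
}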
2353 case Intrinsic::uadd_sat:
2354 case Intrinsic::usub_sat: {
2355 Type *CondTy = RetTy->getWithNewBitWidth(1);
2356
2357 Type *OpTy = StructType::create({RetTy, CondTy});
2358 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2359 ? Intrinsic::uadd_with_overflow
2360 : Intrinsic::usub_with_overflow;
2361
2363 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2364 nullptr, ScalarizationCostPassed);
2365 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2366 Cost +=
2367 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2368 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2369 return Cost;
2370 }
2371 case Intrinsic::smul_fix:
2372 case Intrinsic::umul_fix: {
2373 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2374 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2375
2376 unsigned ExtOp =
2377 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2378 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2379
2380 InstructionCost Cost = 0;
2381 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2382 Cost +=
2383 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2384 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2385 CCH, CostKind);
2386 Cost += thisT()->getArithmeticInstrCost(
2387 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2388 {TTI::OK_UniformConstantValue, TTI::OP_None});
2389 Cost += thisT()->getArithmeticInstrCost(
2390 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2391 {TTI::OK_UniformConstantValue, TTI::OP_None});
2392 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2393 return Cost;
2394 }
2395 case Intrinsic::abs: {
2396 // abs(X) = select(icmp(X,0),X,sub(0,X))
2397 Type *CondTy = RetTy->getWithNewBitWidth(1);
2398 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2399 InstructionCost Cost = 0;
2400 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2401 Pred, CostKind);
2402 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2403 Pred, CostKind);
2404 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2405 Cost += thisT()->getArithmeticInstrCost(
2406 BinaryOperator::Sub, RetTy, CostKind,
2407 {TTI::OK_UniformConstantValue, TTI::OP_None});
2408 return Cost;
2409 }
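A minimal scalar sketch of this expansion (plain C++, illustrative): the negate is a subtract from zero in wrapping arithmetic and the result is chosen with the compare-against-zero select, mirroring the three operations costed above.

#include <cstdint>

// abs(X) = select(icmp_sgt(X, 0), X, sub(0, X))
static int32_t abs_expanded(int32_t X) {
  int32_t Neg = (int32_t)(0u - (uint32_t)X); // sub 0, X (wrapping, so INT32_MIN maps to itself)
  return X > 0 ? X : Neg;                    // icmp sgt + select
}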
2410 case Intrinsic::fptosi_sat:
2411 case Intrinsic::fptoui_sat: {
2412 if (Tys.empty())
2413 break;
2414 Type *FromTy = Tys[0];
2415 bool IsSigned = IID == Intrinsic::fptosi_sat;
2416
2417 InstructionCost Cost = 0;
2418 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2419 {FromTy, FromTy});
2420 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2421 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2422 {FromTy, FromTy});
2423 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2424 Cost += thisT()->getCastInstrCost(
2425 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2426 TTI::CastContextHint::None, CostKind);
2427 if (IsSigned) {
2428 Type *CondTy = RetTy->getWithNewBitWidth(1);
2429 Cost += thisT()->getCmpSelInstrCost(
2430 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2431 Cost += thisT()->getCmpSelInstrCost(
2432 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2433 }
2434 return Cost;
2435 }
2436 default:
2437 break;
2438 }
2439
2440 // Else, assume that we need to scalarize this intrinsic. For math builtins
2441 // this will emit a costly libcall, adding call overhead and spills. Make it
2442 // very expensive.
2443 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2444 // Scalable vectors cannot be scalarized, so return Invalid.
2445 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2446 return isa<ScalableVectorType>(Ty);
2447 }))
2448 return InstructionCost::getInvalid();
2449
2450 InstructionCost ScalarizationCost =
2451 SkipScalarizationCost
2452 ? ScalarizationCostPassed
2453 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2454 /*Extract*/ false, CostKind);
2455
2456 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2457 SmallVector<Type *, 4> ScalarTys;
2458 for (Type *Ty : Tys) {
2459 if (Ty->isVectorTy())
2460 Ty = Ty->getScalarType();
2461 ScalarTys.push_back(Ty);
2462 }
2463 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2464 InstructionCost ScalarCost =
2465 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2466 for (Type *Ty : Tys) {
2467 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2468 if (!ICA.skipScalarizationCost())
2469 ScalarizationCost += getScalarizationOverhead(
2470 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2471 ScalarCalls = std::max(ScalarCalls,
2472 cast<FixedVectorType>(VTy)->getNumElements());
2473 }
2474 }
2475 return ScalarCalls * ScalarCost + ScalarizationCost;
2476 }
2477
2478 // This is going to be turned into a library call, make it expensive.
2479 return SingleCallCost;
2480 }
2481
2482 /// Compute a cost of the given call instruction.
2483 ///
2484 /// Compute the cost of calling function F with return type RetTy and
2485 /// argument types Tys. F might be nullptr, in this case the cost of an
2486 /// arbitrary call with the specified signature will be returned.
2487 /// This is used, for instance, when we estimate call of a vector
2488 /// counterpart of the given function.
2489 /// \param F Called function, might be nullptr.
2490 /// \param RetTy Return value types.
2491 /// \param Tys Argument types.
2492 /// \returns The cost of Call instruction.
2493 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2494 ArrayRef<Type *> Tys,
2495 TTI::TargetCostKind CostKind) {
2496 return 10;
2497 }
2498
2499 unsigned getNumberOfParts(Type *Tp) {
2500 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2501 return LT.first.isValid() ? *LT.first.getValue() : 0;
2502 }
2503
2504 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2505 const SCEV *) {
2506 return 0;
2507 }
2508
2509 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2510 /// We're assuming that reduction operations are performed in the following way:
2511 ///
2512 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2513 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2514 /// \----------------v-------------/ \----------v------------/
2515 /// n/2 elements n/2 elements
2516 /// %red1 = op <n x t> %val, <n x t> val1
2517 /// After this operation we have a vector %red1 where only the first n/2
2518 /// elements are meaningful, the second n/2 elements are undefined and can be
2519 /// dropped. All other operations are actually working with the vector of
2520 /// length n/2, not n, though the real vector length is still n.
2521 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2522 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2523 /// \----------------v-------------/ \----------v------------/
2524 /// n/4 elements 3*n/4 elements
2525 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2526 /// length n/2, the resulting vector has length n/4 etc.
2527 ///
2528 /// The cost model should take into account that the actual length of the
2529 /// vector is reduced on each iteration.
2530 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2531 TTI::TargetCostKind CostKind) {
2532 // Targets must implement a default value for the scalable case, since
2533 // we don't know how many lanes the vector has.
2534 if (isa<ScalableVectorType>(Ty))
2535 return InstructionCost::getInvalid();
2536
2537 Type *ScalarTy = Ty->getElementType();
2538 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2539 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2540 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2541 NumVecElts >= 2) {
2542 // Or reduction for i1 is represented as:
2543 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2544 // %res = cmp ne iReduxWidth %val, 0
2545 // And reduction for i1 is represented as:
2546 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2547 // %res = cmp eq iReduxWidth %val, 11111
2548 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2549 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2550 TTI::CastContextHint::None, CostKind) +
2551 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2552 CmpInst::makeCmpResultType(ValTy),
2553 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2554 }
2555 unsigned NumReduxLevels = Log2_32(NumVecElts);
2556 InstructionCost ArithCost = 0;
2557 InstructionCost ShuffleCost = 0;
2558 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2559 unsigned LongVectorCount = 0;
2560 unsigned MVTLen =
2561 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2562 while (NumVecElts > MVTLen) {
2563 NumVecElts /= 2;
2564 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2565 ShuffleCost +=
2566 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2567 CostKind, NumVecElts, SubTy);
2568 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2569 Ty = SubTy;
2570 ++LongVectorCount;
2571 }
2572
2573 NumReduxLevels -= LongVectorCount;
2574
2575 // The minimal length of the vector is limited by the real length of vector
2576 // operations performed on the current platform. That's why several final
2577 // reduction operations are performed on the vectors with the same
2578 // architecture-dependent length.
2579
2580 // By default reductions need one shuffle per reduction level.
2581 ShuffleCost +=
2582 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2583 std::nullopt, CostKind, 0, Ty);
2584 ArithCost +=
2585 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2586 return ShuffleCost + ArithCost +
2587 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2588 CostKind, 0, nullptr, nullptr);
2589 }
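As an illustration of the halving scheme this function prices (plain C++ over a fixed 8-lane array, not LLVM code), each level folds the upper half onto the lower half, so an 8-lane add reduction takes log2(8) = 3 shuffle-plus-add levels followed by a single extract:

#include <array>
#include <cstddef>

static int tree_reduce_add(std::array<int, 8> V) {
  for (std::size_t Width = 4; Width >= 1; Width /= 2) // one level per halving
    for (std::size_t I = 0; I < Width; ++I)
      V[I] += V[I + Width];                           // "shuffle" + add
  return V[0];                                        // extractelement
}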
2590
2591 /// Try to calculate the cost of performing strict (in-order) reductions,
2592 /// which involves doing a sequence of floating point additions in lane
2593 /// order, starting with an initial value. For example, consider a scalar
2594 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2595 ///
2596 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2597 ///
2598 /// %add1 = %InitVal + %v0
2599 /// %add2 = %add1 + %v1
2600 /// %add3 = %add2 + %v2
2601 /// %add4 = %add3 + %v3
2602 ///
2603 /// As a simple estimate we can say the cost of such a reduction is 4 times
2604 /// the cost of a scalar FP addition. We can only estimate the costs for
2605 /// fixed-width vectors here because for scalable vectors we do not know the
2606 /// runtime number of operations.
2607 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2608 TTI::TargetCostKind CostKind) {
2609 // Targets must implement a default value for the scalable case, since
2610 // we don't know how many lanes the vector has.
2611 if (isa<ScalableVectorType>(Ty))
2612 return InstructionCost::getInvalid();
2613
2614 auto *VTy = cast<FixedVectorType>(Ty);
2615 InstructionCost ExtractCost = getScalarizationOverhead(
2616 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2617 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2618 Opcode, VTy->getElementType(), CostKind);
2619 ArithCost *= VTy->getNumElements();
2620
2621 return ExtractCost + ArithCost;
2622 }
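A scalar sketch of the in-order reduction described above (plain C++, illustrative): every lane is extracted and folded into the accumulator one at a time, which is why the cost is the per-lane extract overhead plus one scalar fadd per element.

#include <array>

static float ordered_reduce_fadd(float InitVal, const std::array<float, 4> &V) {
  float Acc = InitVal;
  for (float Elt : V) // one extractelement + one fadd per lane
    Acc += Elt;
  return Acc;
}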
2623
2624 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2625 std::optional<FastMathFlags> FMF,
2626 TTI::TargetCostKind CostKind) {
2627 assert(Ty && "Unknown reduction vector type");
2628 if (TTI::requiresOrderedReduction(FMF))
2629 return getOrderedReductionCost(Opcode, Ty, CostKind);
2630 return getTreeReductionCost(Opcode, Ty, CostKind);
2631 }
2632
2633 /// Try to calculate op costs for min/max reduction operations.
2634 /// \param CondTy Conditional type for the Select instruction.
2635 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2636 FastMathFlags FMF,
2637 TTI::TargetCostKind CostKind) {
2638 // Targets must implement a default value for the scalable case, since
2639 // we don't know how many lanes the vector has.
2640 if (isa<ScalableVectorType>(Ty))
2641 return InstructionCost::getInvalid();
2642
2643 Type *ScalarTy = Ty->getElementType();
2644 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2645 unsigned NumReduxLevels = Log2_32(NumVecElts);
2646 InstructionCost MinMaxCost = 0;
2647 InstructionCost ShuffleCost = 0;
2648 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2649 unsigned LongVectorCount = 0;
2650 unsigned MVTLen =
2651 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2652 while (NumVecElts > MVTLen) {
2653 NumVecElts /= 2;
2654 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2655
2656 ShuffleCost +=
2657 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2658 CostKind, NumVecElts, SubTy);
2659
2660 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2661 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2662 Ty = SubTy;
2663 ++LongVectorCount;
2664 }
2665
2666 NumReduxLevels -= LongVectorCount;
2667
2668 // The minimal length of the vector is limited by the real length of vector
2669 // operations performed on the current platform. That's why several final
2670 // reduction operations are performed on the vectors with the same
2671 // architecture-dependent length.
2672 ShuffleCost +=
2673 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2674 std::nullopt, CostKind, 0, Ty);
2675 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2676 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2677 // The last min/max should be in vector registers and we counted it above.
2678 // So just need a single extractelement.
2679 return ShuffleCost + MinMaxCost +
2680 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2681 CostKind, 0, nullptr, nullptr);
2682 }
2683
2684 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2685 Type *ResTy, VectorType *Ty,
2686 FastMathFlags FMF,
2687 TTI::TargetCostKind CostKind) {
2688 // Without any native support, this is equivalent to the cost of
2689 // vecreduce.opcode(ext(Ty A)).
2690 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2691 InstructionCost RedCost =
2692 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2693 InstructionCost ExtCost = thisT()->getCastInstrCost(
2694 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2695 TTI::CastContextHint::None, CostKind);
2696
2697 return RedCost + ExtCost;
2698 }
2699
2700 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2701 VectorType *Ty,
2702 TTI::TargetCostKind CostKind) {
2703 // Without any native support, this is equivalent to the cost of
2704 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2705 // vecreduce.add(mul(A, B)).
2706 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2707 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2708 Instruction::Add, ExtTy, std::nullopt, CostKind);
2709 InstructionCost ExtCost = thisT()->getCastInstrCost(
2710 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2711 TTI::CastContextHint::None, CostKind);
2712
2713 InstructionCost MulCost =
2714 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2715
2716 return RedCost + MulCost + 2 * ExtCost;
2717 }
2718
2719 InstructionCost getVectorSplitCost() { return 1; }
2720
2721 /// @}
2722};
2723
2724/// Concrete BasicTTIImpl that can be used if no further customization
2725/// is needed.
2726class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2727 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2728
2729 friend class BasicTTIImplBase<BasicTTIImpl>;
2730
2731 const TargetSubtargetInfo *ST;
2732 const TargetLoweringBase *TLI;
2733
2734 const TargetSubtargetInfo *getST() const { return ST; }
2735 const TargetLoweringBase *getTLI() const { return TLI; }
2736
2737public:
2738 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2739};
2740
2741} // end namespace llvm
2742
2743#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1179
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1446
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1108
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
an instruction to allocate memory on the stack
Definition: Instructions.h:61
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:81
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:429
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:287
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:723
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:583
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:556
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:890
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:752
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:757
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:419
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:663
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:670
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:743
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:336
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:423
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:735
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:756
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:970
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:434
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:510
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:577
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:445
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:377
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:397
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:389
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:685
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:727
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:281
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:383
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:309
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:809
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:763
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:353
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:439
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:542
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:703
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:405
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:285
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:675
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:273
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:357
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:793
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:709
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:739
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:323
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:655
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:300
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:892
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:546
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:856
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:535
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:319
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:575
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:332
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:838
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:758
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:679
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:291
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:328
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:295
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:719
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:305
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:283
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:581
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:692
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:313
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:264
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:576
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:401
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:441
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:838
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:377
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:955
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:357
The core instruction combiner logic.
Definition: InstCombiner.h:47
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
size_type size() const
Definition: SmallPtrSet.h:95
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:501
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
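The legality queries above drive most of the cost formulas in this header. A condensed sketch of the usual translate-then-query pattern; lowersToNativeOp is an illustrative name, not a function of this file:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Does an IR opcode on type Ty map to an operation the target handles without
// expansion (i.e. legal, custom-lowered, or promoted)?
static bool lowersToNativeOp(const TargetLoweringBase &TLI,
                             const DataLayout &DL, unsigned Opcode, Type *Ty) {
  int ISD = TLI.InstructionOpcodeToISD(Opcode); // IR opcode -> ISD node.
  EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  return ISD && TLI.isOperationLegalOrCustomOrPromote(ISD, VT);
}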
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
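A sketch of how these enums are consumed by a TargetTransformInfo client: the same cast can be costed under different TargetCostKind models, and the CastContextHint describes whether it folds into a load/store. zextCostUnderTwoModels is an illustrative name only:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"
#include <utility>
using namespace llvm;

// Cost the same standalone zero-extension for throughput and for code size.
static std::pair<InstructionCost, InstructionCost>
zextCostUnderTwoModels(const TargetTransformInfo &TTI, Type *DstTy,
                       Type *SrcTy) {
  InstructionCost RThru = TTI.getCastInstrCost(
      Instruction::ZExt, DstTy, SrcTy,
      TargetTransformInfo::CastContextHint::None,
      TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost Size = TTI.getCastInstrCost(
      Instruction::ZExt, DstTy, SrcTy,
      TargetTransformInfo::CastContextHint::None,
      TargetTransformInfo::TCK_CodeSize);
  return {RThru, Size};
}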
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1651
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:212
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
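The Type queries above are typically used to derive related types rather than to inspect a single one. A small sketch (getDoubledLaneType is illustrative) that widens the lanes of an integer or integer-vector type:

#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

// Double the lane width of an integer (or vector-of-integer) type while
// keeping the element count unchanged.
static Type *getDoubledLaneType(Type *Ty) {
  assert(Ty->isIntOrIntVectorTy() && "expected an integer-based type");
  unsigned LaneBits = Ty->getScalarSizeInBits();
  return Ty->getWithNewBitWidth(LaneBits * 2);
}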
Value * getOperand(unsigned i) const
Definition: User.h:169
static bool isVPBinOp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:664
Type * getElementType() const
Definition: DerivedTypes.h:436
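A sketch of how the VectorType helpers compose when cost code halves and then rebuilds a vector type; both helper names are illustrative:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Same element type, half as many (possibly scalable) elements.
static VectorType *getLowerHalfType(VectorType *VTy) {
  return VectorType::getHalfElementsVectorType(VTy);
}

// VectorType::get is the general constructor: element type + element count.
static VectorType *rebuildFromHalf(VectorType *HalfTy) {
  return VectorType::get(HalfTy->getElementType(),
                         HalfTy->getElementCount() * 2);
}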
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:374
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1120
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1124
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:906
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1047
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:905
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1052
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1552
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:959
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
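These math helpers show up throughout the unrolling and type-legalization heuristics. A small worked sketch (numPartsNeeded and its parameters are illustrative):

#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// How many legal-width parts does a Bits-wide value occupy if the widest
// legal scalar is LegalBits wide?
static unsigned numPartsNeeded(uint32_t Bits, uint32_t LegalBits) {
  if (Bits <= LegalBits && isPowerOf2_32(Bits))
    return 1;                         // Already a single power-of-two part.
  return divideCeil(Bits, LegalBits); // e.g. divideCeil(96, 64) == 2.
}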
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:341
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:275
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
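A sketch tying the EVT helpers together: convert an IR type to an EVT and, only after checking isSimple(), extract the underlying MVT. getSimpleVTFor is an illustrative name:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"
#include <optional>
using namespace llvm;

// IR type -> EVT -> (optionally) MVT.
static std::optional<MVT> getSimpleVTFor(Type *Ty) {
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/true);
  if (!VT.isSimple())
    return std::nullopt; // Extended EVT: no single MVT represents it.
  return VT.getSimpleVT();
}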
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
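These flags are what a getUnrollingPreferences-style hook fills in. A hedged sketch of the shape of such tuning; the concrete values below are illustrative, not the defaults used by this header:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Illustrative tuning of unrolling/peeling preferences.
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP,
                          TargetTransformInfo::PeelingPreferences &PP,
                          bool OptForSize) {
  UP.Partial = true;           // Unroll even when the trip count survives.
  UP.Runtime = true;           // Allow runtime trip-count unrolling.
  UP.UpperBound = !OptForSize; // Use the trip-count upper bound when allowed.
  UP.PartialThreshold = OptForSize ? UP.PartialOptSizeThreshold : 300;
  PP.AllowPeeling = true;      // Permit peeling a few iterations.
  PP.PeelCount = 0;            // Do not force a specific peel factor.
}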