LLVM 19.0.0git
BasicTTIImpl.h
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/InstrTypes.h"
40#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
53#include <algorithm>
54#include <cassert>
55#include <cstdint>
56#include <limits>
57#include <optional>
58#include <utility>
59
60namespace llvm {
61
62class Function;
63class GlobalValue;
64class LLVMContext;
65class ScalarEvolution;
66class SCEV;
67class TargetMachine;
68
69extern cl::opt<unsigned> PartialUnrollingThreshold;
70
71/// Base class which can be used to help build a TTI implementation.
72///
73/// This class provides as much implementation of the TTI interface as is
74/// possible using the target independent parts of the code generator.
75///
76/// In order to subclass it, your class must implement a getST() method to
77/// return the subtarget, and a getTLI() method to return the target lowering.
78/// We need these methods implemented in the derived class so that this class
79/// doesn't have to duplicate storage for them.
80template <typename T>
81class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
82private:
83 using BaseT = TargetTransformInfoImplCRTPBase<T>;
84 using TTI = TargetTransformInfo;
85
86 /// Helper function to access this as a T.
87 T *thisT() { return static_cast<T *>(this); }
88
89 /// Estimate a cost of Broadcast as an extract and sequence of insert
90 /// operations.
91 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
94 // Broadcast cost is equal to the cost of extracting the zero'th element
95 // plus the cost of inserting it into every element of the result vector.
96 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
97 CostKind, 0, nullptr, nullptr);
98
99 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
100 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
101 CostKind, i, nullptr, nullptr);
102 }
103 return Cost;
104 }
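// Worked example, assuming each extractelement/insertelement has unit cost:
// broadcasting a <4 x float> is modeled above as one extract of lane 0 plus
// four inserts, i.e. an estimated cost of 5.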
105
106 /// Estimate a cost of shuffle as a sequence of extract and insert
107 /// operations.
108 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
111 // Shuffle cost is equal to the cost of extracting elements from the
112 // arguments plus the cost of inserting them into the result vector.
113
114 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
115 // index 0 of the first vector, index 1 of the second vector, index 2 of
116 // the first vector and finally index 3 of the second vector, and insert
117 // them at indices <0,1,2,3> of the result vector.
118 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
119 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
122 CostKind, i, nullptr, nullptr);
123 }
124 return Cost;
125 }
126
127 /// Estimate a cost of subvector extraction as a sequence of extract and
128 /// insert operations.
129 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
131 int Index,
132 FixedVectorType *SubVTy) {
133 assert(VTy && SubVTy &&
134 "Can only extract subvectors from vectors");
135 int NumSubElts = SubVTy->getNumElements();
136 assert((!isa<FixedVectorType>(VTy) ||
137 (Index + NumSubElts) <=
138 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
139 "SK_ExtractSubvector index out of range");
140
142 // Subvector extraction cost is equal to the cost of extracting elements
143 // from the source vector plus the cost of inserting them into the result
144 // vector type.
145 for (int i = 0; i != NumSubElts; ++i) {
146 Cost +=
147 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
148 CostKind, i + Index, nullptr, nullptr);
149 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
150 CostKind, i, nullptr, nullptr);
151 }
152 return Cost;
153 }
154
155 /// Estimate a cost of subvector insertion as a sequence of extract and
156 /// insert operations.
157 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
159 int Index,
160 FixedVectorType *SubVTy) {
161 assert(VTy && SubVTy &&
162 "Can only insert subvectors into vectors");
163 int NumSubElts = SubVTy->getNumElements();
164 assert((!isa<FixedVectorType>(VTy) ||
165 (Index + NumSubElts) <=
166 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
167 "SK_InsertSubvector index out of range");
168
170 // Subvector insertion cost is equal to the cost of extracting elements
171 // from the subvector plus the cost of inserting them into the result
172 // vector type.
173 for (int i = 0; i != NumSubElts; ++i) {
174 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
175 CostKind, i, nullptr, nullptr);
176 Cost +=
177 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
178 i + Index, nullptr, nullptr);
179 }
180 return Cost;
181 }
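// Worked example, assuming unit costs for the individual vector ops:
// inserting a <2 x i32> subvector at Index 2 of an <8 x i32> vector is
// modeled above as two extracts from the subvector plus two inserts at
// lanes 2 and 3 of the wide vector, i.e. an estimated cost of 4; subvector
// extraction is modeled symmetrically.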
182
183 /// Local query method delegates up to T which *must* implement this!
184 const TargetSubtargetInfo *getST() const {
185 return static_cast<const T *>(this)->getST();
186 }
187
188 /// Local query method delegates up to T which *must* implement this!
189 const TargetLoweringBase *getTLI() const {
190 return static_cast<const T *>(this)->getTLI();
191 }
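// Usage sketch for the two hooks above: a target's TTI implementation
// derives from BasicTTIImplBase via CRTP and provides getST()/getTLI().
// All "MyTarget*" names below are placeholders rather than LLVM types;
// real targets (their <Target>TTIImpl classes) follow the same pattern.
//
//   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
//     using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
//     friend BaseT;
//
//     const MyTargetSubtarget *ST;        // placeholder subtarget class
//     const MyTargetTargetLowering *TLI;  // placeholder lowering class
//
//     const MyTargetSubtarget *getST() const { return ST; }
//     const MyTargetTargetLowering *getTLI() const { return TLI; }
//   };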
192
193 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
194 switch (M) {
196 return ISD::UNINDEXED;
197 case TTI::MIM_PreInc:
198 return ISD::PRE_INC;
199 case TTI::MIM_PreDec:
200 return ISD::PRE_DEC;
201 case TTI::MIM_PostInc:
202 return ISD::POST_INC;
203 case TTI::MIM_PostDec:
204 return ISD::POST_DEC;
205 }
206 llvm_unreachable("Unexpected MemIndexedMode");
207 }
208
209 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
210 Align Alignment,
211 bool VariableMask,
212 bool IsGatherScatter,
214 unsigned AddressSpace = 0) {
215 // We cannot scalarize scalable vectors, so return Invalid.
216 if (isa<ScalableVectorType>(DataTy))
218
219 auto *VT = cast<FixedVectorType>(DataTy);
220 unsigned VF = VT->getNumElements();
221
222 // Assume the target does not have support for gather/scatter operations
223 // and provide a rough estimate.
224 //
225 // First, compute the cost of the individual memory operations.
226 InstructionCost AddrExtractCost =
227 IsGatherScatter
230 PointerType::get(VT->getElementType(), 0), VF),
231 /*Insert=*/false, /*Extract=*/true, CostKind)
232 : 0;
233
234 // The cost of the scalar loads/stores.
235 InstructionCost MemoryOpCost =
236 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
238
239 // Next, compute the cost of packing the result in a vector.
240 InstructionCost PackingCost =
241 getScalarizationOverhead(VT, Opcode != Instruction::Store,
242 Opcode == Instruction::Store, CostKind);
243
244 InstructionCost ConditionalCost = 0;
245 if (VariableMask) {
246 // Compute the cost of conditionally executing the memory operations with
247 // variable masks. This includes extracting the individual conditions, the
248 // branches, and the PHIs used to combine the results.
249 // NOTE: Estimating the cost of conditionally executing the memory
250 // operations accurately is quite difficult and the current solution
251 // provides a very rough estimate only.
252 ConditionalCost =
255 /*Insert=*/false, /*Extract=*/true, CostKind) +
256 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
257 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
258 }
259
260 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
261 }
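// Rough worked example of the decomposition above, assuming unit costs for
// the scalar loads, element extracts/inserts, branches and PHIs: a masked
// load of <4 x i32> with a variable mask and no gather support is estimated
// as 0 (no address extracts) + 4 scalar loads + 4 inserts to repack the
// result + (4 mask extracts + 4 * (branch + PHI)) = 20.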
262
263protected:
264 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
265 : BaseT(DL) {}
266 virtual ~BasicTTIImplBase() = default;
267
269
270public:
271 /// \name Scalar TTI Implementations
272 /// @{
274 unsigned AddressSpace, Align Alignment,
275 unsigned *Fast) const {
276 EVT E = EVT::getIntegerVT(Context, BitWidth);
277 return getTLI()->allowsMisalignedMemoryAccesses(
279 }
280
281 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
282
283 bool isSourceOfDivergence(const Value *V) { return false; }
284
285 bool isAlwaysUniform(const Value *V) { return false; }
286
287 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
288 return false;
289 }
290
291 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
292 return true;
293 }
294
296 // Return an invalid address space.
297 return -1;
298 }
299
301 Intrinsic::ID IID) const {
302 return false;
303 }
304
305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
307 }
308
309 unsigned getAssumedAddrSpace(const Value *V) const {
310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
311 }
312
313 bool isSingleThreaded() const {
314 return getTLI()->getTargetMachine().Options.ThreadModel ==
316 }
317
318 std::pair<const Value *, unsigned>
320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
321 }
322
324 Value *NewV) const {
325 return nullptr;
326 }
327
328 bool isLegalAddImmediate(int64_t imm) {
329 return getTLI()->isLegalAddImmediate(imm);
330 }
331
332 bool isLegalAddScalableImmediate(int64_t Imm) {
333 return getTLI()->isLegalAddScalableImmediate(Imm);
334 }
335
336 bool isLegalICmpImmediate(int64_t imm) {
337 return getTLI()->isLegalICmpImmediate(imm);
338 }
339
340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
342 Instruction *I = nullptr,
343 int64_t ScalableOffset = 0) {
345 AM.BaseGV = BaseGV;
346 AM.BaseOffs = BaseOffset;
347 AM.HasBaseReg = HasBaseReg;
348 AM.Scale = Scale;
349 AM.ScalableOffset = ScalableOffset;
350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
351 }
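// For illustration: TargetLoweringBase::AddrMode describes an address of the
// form BaseGV + BaseOffs + BaseReg + Scale*ScaleReg (+ ScalableOffset *
// vscale), so a query for an access shaped like "reg + 4*reg + 16" would be
// encoded as HasBaseReg = true, Scale = 4, BaseOffs = 16 and a null BaseGV.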
352
353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
355 }
356
357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
358 Type *ScalarValTy) const {
359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
361 EVT VT = getTLI()->getValueType(DL, SrcTy);
362 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
363 getTLI()->isOperationCustom(ISD::STORE, VT))
364 return true;
365
366 EVT ValVT =
367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
368 EVT LegalizedVT =
369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
371 };
372 while (VF > 2 && IsSupportedByTarget(VF))
373 VF /= 2;
374 return VF;
375 }
376
378 const DataLayout &DL) const {
379 EVT VT = getTLI()->getValueType(DL, Ty);
380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
381 }
382
384 const DataLayout &DL) const {
385 EVT VT = getTLI()->getValueType(DL, Ty);
386 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
387 }
388
391 }
392
395 }
396
400 }
401
404 }
405
408 }
409
411 StackOffset BaseOffset, bool HasBaseReg,
412 int64_t Scale, unsigned AddrSpace) {
414 AM.BaseGV = BaseGV;
415 AM.BaseOffs = BaseOffset.getFixed();
416 AM.HasBaseReg = HasBaseReg;
417 AM.Scale = Scale;
418 AM.ScalableOffset = BaseOffset.getScalable();
419 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
420 return 0;
421 return -1;
422 }
423
424 bool isTruncateFree(Type *Ty1, Type *Ty2) {
425 return getTLI()->isTruncateFree(Ty1, Ty2);
426 }
427
429 return getTLI()->isProfitableToHoist(I);
430 }
431
432 bool useAA() const { return getST()->useAA(); }
433
434 bool isTypeLegal(Type *Ty) {
435 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
436 return getTLI()->isTypeLegal(VT);
437 }
438
439 unsigned getRegUsageForType(Type *Ty) {
440 EVT ETy = getTLI()->getValueType(DL, Ty);
441 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
442 }
443
447 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
448 }
449
451 unsigned &JumpTableSize,
453 BlockFrequencyInfo *BFI) {
454 /// Try to find the estimated number of clusters. Note that the number of
455 /// clusters identified in this function could be different from the actual
456 /// numbers found in lowering. This function ignores switches that are
457 /// lowered with a mix of jump table / bit test / BTree. This function was
458 /// initially intended to be used when estimating the cost of a switch in
459 /// the inline cost heuristic, but it is a generic cost model to be used in
460 /// other places (e.g., in loop unrolling).
461 unsigned N = SI.getNumCases();
462 const TargetLoweringBase *TLI = getTLI();
463 const DataLayout &DL = this->getDataLayout();
464
465 JumpTableSize = 0;
466 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
467
468 // Early exit if both a jump table and bit test are not allowed.
469 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
470 return N;
471
472 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
473 APInt MinCaseVal = MaxCaseVal;
474 for (auto CI : SI.cases()) {
475 const APInt &CaseVal = CI.getCaseValue()->getValue();
476 if (CaseVal.sgt(MaxCaseVal))
477 MaxCaseVal = CaseVal;
478 if (CaseVal.slt(MinCaseVal))
479 MinCaseVal = CaseVal;
480 }
481
482 // Check if suitable for a bit test
483 if (N <= DL.getIndexSizeInBits(0u)) {
485 for (auto I : SI.cases())
486 Dests.insert(I.getCaseSuccessor());
487
488 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
489 DL))
490 return 1;
491 }
492
493 // Check if suitable for a jump table.
494 if (IsJTAllowed) {
495 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
496 return N;
498 (MaxCaseVal - MinCaseVal)
499 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
500 // Check whether a range of clusters is dense enough for a jump table
501 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
502 JumpTableSize = Range;
503 return 1;
504 }
505 }
506 return N;
507 }
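// Worked example: for a switch with 100 contiguous case values (0..99) on a
// target that allows jump tables and considers that range dense enough, the
// estimate above is a single cluster with JumpTableSize = 100; if neither a
// bit test nor a jump table applies, it falls back to N = 100 clusters.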
508
510 const TargetLoweringBase *TLI = getTLI();
511 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
512 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
513 }
514
516 const TargetMachine &TM = getTLI()->getTargetMachine();
517 // If non-PIC mode, do not generate a relative lookup table.
518 if (!TM.isPositionIndependent())
519 return false;
520
521 /// Relative lookup table entries consist of 32-bit offsets.
522 /// Do not generate relative lookup tables for large code models
523 /// in 64-bit architectures where 32-bit offsets might not be enough.
524 if (TM.getCodeModel() == CodeModel::Medium ||
525 TM.getCodeModel() == CodeModel::Large)
526 return false;
527
528 Triple TargetTriple = TM.getTargetTriple();
529 if (!TargetTriple.isArch64Bit())
530 return false;
531
532 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
533 // there.
534 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
535 return false;
536
537 return true;
538 }
539
540 bool haveFastSqrt(Type *Ty) {
541 const TargetLoweringBase *TLI = getTLI();
542 EVT VT = TLI->getValueType(DL, Ty);
543 return TLI->isTypeLegal(VT) &&
545 }
546
548 return true;
549 }
550
552 // Check whether FADD is available, as a proxy for floating-point in
553 // general.
554 const TargetLoweringBase *TLI = getTLI();
555 EVT VT = TLI->getValueType(DL, Ty);
559 }
560
562 const Function &Fn) const {
563 switch (Inst.getOpcode()) {
564 default:
565 break;
566 case Instruction::SDiv:
567 case Instruction::SRem:
568 case Instruction::UDiv:
569 case Instruction::URem: {
570 if (!isa<ConstantInt>(Inst.getOperand(1)))
571 return false;
572 EVT VT = getTLI()->getValueType(DL, Inst.getType());
573 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
574 }
575 };
576
577 return false;
578 }
579
580 unsigned getInliningThresholdMultiplier() const { return 1; }
581 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
582 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
583 return 0;
584 }
585
586 int getInlinerVectorBonusPercent() const { return 150; }
587
591 // This unrolling functionality is target independent, but to provide some
592 // motivation for its intended use, for x86:
593
594 // According to the Intel 64 and IA-32 Architectures Optimization Reference
595 // Manual, Intel Core models and later have a loop stream detector (and
596 // associated uop queue) that can benefit from partial unrolling.
597 // The relevant requirements are:
598 // - The loop must have no more than 4 (8 for Nehalem and later) branches
599 // taken, and none of them may be calls.
600 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
601
602 // According to the Software Optimization Guide for AMD Family 15h
603 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
604 // and loop buffer which can benefit from partial unrolling.
605 // The relevant requirements are:
606 // - The loop must have fewer than 16 branches
607 // - The loop must have less than 40 uops in all executed loop branches
608
609 // The number of taken branches in a loop is hard to estimate here, and
610 // benchmarking has revealed that it is better not to be conservative when
611 // estimating the branch count. As a result, we'll ignore the branch limits
612 // until someone finds a case where it matters in practice.
613
614 unsigned MaxOps;
615 const TargetSubtargetInfo *ST = getST();
616 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
617 MaxOps = PartialUnrollingThreshold;
618 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
619 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
620 else
621 return;
622
623 // Scan the loop: don't unroll loops with calls.
624 for (BasicBlock *BB : L->blocks()) {
625 for (Instruction &I : *BB) {
626 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
627 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
628 if (!thisT()->isLoweredToCall(F))
629 continue;
630 }
631
632 if (ORE) {
633 ORE->emit([&]() {
634 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
635 L->getHeader())
636 << "advising against unrolling the loop because it "
637 "contains a "
638 << ore::NV("Call", &I);
639 });
640 }
641 return;
642 }
643 }
644 }
645
646 // Enable runtime and partial unrolling up to the specified size.
647 // Enable using trip count upper bound to unroll loops.
648 UP.Partial = UP.Runtime = UP.UpperBound = true;
649 UP.PartialThreshold = MaxOps;
650
651 // Avoid unrolling when optimizing for size.
652 UP.OptSizeThreshold = 0;
654
655 // Set number of instructions optimized when "back edge"
656 // becomes "fall through" to default value of 2.
657 UP.BEInsns = 2;
658 }
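// For example, on a subtarget whose scheduling model reports
// LoopMicroOpBufferSize = 28 (an assumed value), a loop without calls gets
// partial, runtime and upper-bound unrolling enabled with
// UP.PartialThreshold = 28.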
659
662 PP.PeelCount = 0;
663 PP.AllowPeeling = true;
664 PP.AllowLoopNestsPeeling = false;
665 PP.PeelProfiledIterations = true;
666 }
667
669 AssumptionCache &AC,
670 TargetLibraryInfo *LibInfo,
671 HardwareLoopInfo &HWLoopInfo) {
672 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
673 }
674
677 }
678
680 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
681 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
682 }
683
684 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
685 IntrinsicInst &II) {
687 }
688
689 std::optional<Value *>
691 APInt DemandedMask, KnownBits &Known,
692 bool &KnownBitsComputed) {
693 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
694 KnownBitsComputed);
695 }
696
698 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
699 APInt &UndefElts2, APInt &UndefElts3,
700 std::function<void(Instruction *, unsigned, APInt, APInt &)>
701 SimplifyAndSetOp) {
703 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
704 SimplifyAndSetOp);
705 }
706
707 virtual std::optional<unsigned>
709 return std::optional<unsigned>(
710 getST()->getCacheSize(static_cast<unsigned>(Level)));
711 }
712
713 virtual std::optional<unsigned>
715 std::optional<unsigned> TargetResult =
716 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
717
718 if (TargetResult)
719 return TargetResult;
720
721 return BaseT::getCacheAssociativity(Level);
722 }
723
724 virtual unsigned getCacheLineSize() const {
725 return getST()->getCacheLineSize();
726 }
727
728 virtual unsigned getPrefetchDistance() const {
729 return getST()->getPrefetchDistance();
730 }
731
732 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
733 unsigned NumStridedMemAccesses,
734 unsigned NumPrefetches,
735 bool HasCall) const {
736 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
737 NumPrefetches, HasCall);
738 }
739
740 virtual unsigned getMaxPrefetchIterationsAhead() const {
741 return getST()->getMaxPrefetchIterationsAhead();
742 }
743
744 virtual bool enableWritePrefetching() const {
745 return getST()->enableWritePrefetching();
746 }
747
748 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
749 return getST()->shouldPrefetchAddressSpace(AS);
750 }
751
752 /// @}
753
754 /// \name Vector TTI Implementations
755 /// @{
756
758 return TypeSize::getFixed(32);
759 }
760
761 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
762 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
763 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
764
765 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
766 /// are set if the demanded result elements need to be inserted and/or
767 /// extracted from vectors.
769 const APInt &DemandedElts,
770 bool Insert, bool Extract,
772 /// FIXME: a bitfield is not a reasonable abstraction for talking about
773 /// which elements are needed from a scalable vector
774 if (isa<ScalableVectorType>(InTy))
775 return InstructionCost::getInvalid();
776 auto *Ty = cast<FixedVectorType>(InTy);
777
778 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
779 "Vector size mismatch");
780
782
783 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
784 if (!DemandedElts[i])
785 continue;
786 if (Insert)
787 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
788 CostKind, i, nullptr, nullptr);
789 if (Extract)
790 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
791 CostKind, i, nullptr, nullptr);
792 }
793
794 return Cost;
795 }
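// Worked example, assuming unit cost per insert/extract: for a <4 x i32>
// vector with DemandedElts = 0b0101 (lanes 0 and 2), Insert = true and
// Extract = false, the overhead above is two insertelement operations,
// i.e. a cost of 2.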
796
797 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
799 bool Extract,
801 if (isa<ScalableVectorType>(InTy))
802 return InstructionCost::getInvalid();
803 auto *Ty = cast<FixedVectorType>(InTy);
804
805 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
806 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
807 CostKind);
808 }
809
810 /// Estimate the overhead of scalarizing an instruction's unique
811 /// non-constant operands. The (potentially vector) types to use for each
812 /// argument are passed via Tys.
817 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
818
820 SmallPtrSet<const Value*, 4> UniqueOperands;
821 for (int I = 0, E = Args.size(); I != E; I++) {
822 // Disregard things like metadata arguments.
823 const Value *A = Args[I];
824 Type *Ty = Tys[I];
825 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
826 !Ty->isPtrOrPtrVectorTy())
827 continue;
828
829 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
830 if (auto *VecTy = dyn_cast<VectorType>(Ty))
831 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
832 /*Extract*/ true, CostKind);
833 }
834 }
835
836 return Cost;
837 }
838
839 /// Estimate the overhead of scalarizing the inputs and outputs of an
840 /// instruction, with return type RetTy and arguments Args of type Tys. If
841 /// Args are unknown (empty), then the cost associated with one argument is
842 /// added as a heuristic.
848 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
849 if (!Args.empty())
851 else
852 // When no information on arguments is provided, we add the cost
853 // associated with one argument as a heuristic.
854 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
855 /*Extract*/ true, CostKind);
856
857 return Cost;
858 }
859
860 /// Estimate the cost of type-legalization and the legalized type.
861 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
862 LLVMContext &C = Ty->getContext();
863 EVT MTy = getTLI()->getValueType(DL, Ty);
864
866 // We keep legalizing the type until we find a legal kind. We assume that
867 // the only operation that costs anything is the split. After splitting
868 // we need to handle two types.
869 while (true) {
870 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
871
872 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
873 // Ensure we return a sensible simple VT here, since many callers of
874 // this function require it.
875 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
876 return std::make_pair(InstructionCost::getInvalid(), VT);
877 }
878
879 if (LK.first == TargetLoweringBase::TypeLegal)
880 return std::make_pair(Cost, MTy.getSimpleVT());
881
882 if (LK.first == TargetLoweringBase::TypeSplitVector ||
883 LK.first == TargetLoweringBase::TypeExpandInteger)
884 Cost *= 2;
885
886 // Do not loop with f128 type.
887 if (MTy == LK.second)
888 return std::make_pair(Cost, MTy.getSimpleVT());
889
890 // Keep legalizing the type.
891 MTy = LK.second;
892 }
893 }
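// Worked example, assuming a target whose widest legal vector type is
// v2i64: legalizing <8 x i64> splits the type twice (v8i64 -> v4i64 ->
// v2i64), and the returned cost doubles on each split, giving {4, v2i64}.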
894
895 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
896
898 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
901 ArrayRef<const Value *> Args = std::nullopt,
902 const Instruction *CxtI = nullptr) {
903 // Check if any of the operands are vector operands.
904 const TargetLoweringBase *TLI = getTLI();
905 int ISD = TLI->InstructionOpcodeToISD(Opcode);
906 assert(ISD && "Invalid opcode");
907
908 // TODO: Handle more cost kinds.
910 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
911 Opd1Info, Opd2Info,
912 Args, CxtI);
913
914 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
915
916 bool IsFloat = Ty->isFPOrFPVectorTy();
917 // Assume that floating point arithmetic operations cost twice as much as
918 // integer operations.
919 InstructionCost OpCost = (IsFloat ? 2 : 1);
920
921 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
922 // The operation is legal. Assume it costs 1.
923 // TODO: Once we have extract/insert subvector cost we need to use them.
924 return LT.first * OpCost;
925 }
926
927 if (!TLI->isOperationExpand(ISD, LT.second)) {
928 // If the operation is custom lowered, then assume that the code is twice
929 // as expensive.
930 return LT.first * 2 * OpCost;
931 }
932
933 // An 'Expand' of URem and SRem is special because it may default
934 // to expanding the operation into a sequence of sub-operations
935 // i.e. X % Y -> X-(X/Y)*Y.
936 if (ISD == ISD::UREM || ISD == ISD::SREM) {
937 bool IsSigned = ISD == ISD::SREM;
938 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
939 LT.second) ||
940 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
941 LT.second)) {
942 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
943 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
944 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
945 InstructionCost MulCost =
946 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
947 InstructionCost SubCost =
948 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
949 return DivCost + MulCost + SubCost;
950 }
951 }
952
953 // We cannot scalarize scalable vectors, so return Invalid.
954 if (isa<ScalableVectorType>(Ty))
955 return InstructionCost::getInvalid();
956
957 // Else, assume that we need to scalarize this op.
958 // TODO: If one of the types get legalized by splitting, handle this
959 // similarly to what getCastInstrCost() does.
960 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
961 InstructionCost Cost = thisT()->getArithmeticInstrCost(
962 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
963 Args, CxtI);
964 // Return the cost of multiple scalar invocations plus the cost of
965 // inserting and extracting the values.
966 SmallVector<Type *> Tys(Args.size(), Ty);
967 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
968 VTy->getNumElements() * Cost;
969 }
970
971 // We don't know anything about this scalar instruction.
972 return OpCost;
973 }
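// Worked example for the URem/SRem expansion above: if i32 srem must be
// expanded but sdiv is legal, the cost is modeled from X - (X / Y) * Y,
// i.e. the sum of one sdiv, one mul and one sub cost (3 with unit costs).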
974
976 ArrayRef<int> Mask,
977 VectorType *Ty, int &Index,
978 VectorType *&SubTy) const {
979 if (Mask.empty())
980 return Kind;
981 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
982 switch (Kind) {
984 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
985 return TTI::SK_Reverse;
986 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
987 return TTI::SK_Broadcast;
988 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
989 (Index + Mask.size()) <= (size_t)NumSrcElts) {
990 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
992 }
993 break;
995 int NumSubElts;
996 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
997 Mask, NumSrcElts, NumSubElts, Index)) {
998 if (Index + NumSubElts > NumSrcElts)
999 return Kind;
1000 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
1002 }
1003 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1004 return TTI::SK_Select;
1005 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1006 return TTI::SK_Transpose;
1007 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1008 return TTI::SK_Splice;
1009 break;
1010 }
1011 case TTI::SK_Select:
1012 case TTI::SK_Reverse:
1013 case TTI::SK_Broadcast:
1014 case TTI::SK_Transpose:
1017 case TTI::SK_Splice:
1018 break;
1019 }
1020 return Kind;
1021 }
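// For example, an SK_PermuteSingleSrc shuffle of <4 x i32> with mask
// <3,2,1,0> is refined to SK_Reverse above, and mask <0,0,0,0> to
// SK_Broadcast, so the dedicated estimates for those kinds are used below
// instead of the generic permute cost.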
1022
1024 ArrayRef<int> Mask,
1026 VectorType *SubTp,
1027 ArrayRef<const Value *> Args = std::nullopt,
1028 const Instruction *CxtI = nullptr) {
1029 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1030 case TTI::SK_Broadcast:
1031 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1032 return getBroadcastShuffleOverhead(FVT, CostKind);
1034 case TTI::SK_Select:
1035 case TTI::SK_Splice:
1036 case TTI::SK_Reverse:
1037 case TTI::SK_Transpose:
1040 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1041 return getPermuteShuffleOverhead(FVT, CostKind);
1044 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1045 cast<FixedVectorType>(SubTp));
1047 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1048 cast<FixedVectorType>(SubTp));
1049 }
1050 llvm_unreachable("Unknown TTI::ShuffleKind");
1051 }
1052
1053 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1056 const Instruction *I = nullptr) {
1057 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1058 return 0;
1059
1060 const TargetLoweringBase *TLI = getTLI();
1061 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1062 assert(ISD && "Invalid opcode");
1063 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1064 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1065
1066 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1067 TypeSize DstSize = DstLT.second.getSizeInBits();
1068 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1069 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1070
1071 switch (Opcode) {
1072 default:
1073 break;
1074 case Instruction::Trunc:
1075 // Check for NOOP conversions.
1076 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1077 return 0;
1078 [[fallthrough]];
1079 case Instruction::BitCast:
1080 // Bitcast between types that are legalized to the same type are free and
1081 // assume int to/from ptr of the same size is also free.
1082 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1083 SrcSize == DstSize)
1084 return 0;
1085 break;
1086 case Instruction::FPExt:
1087 if (I && getTLI()->isExtFree(I))
1088 return 0;
1089 break;
1090 case Instruction::ZExt:
1091 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1092 return 0;
1093 [[fallthrough]];
1094 case Instruction::SExt:
1095 if (I && getTLI()->isExtFree(I))
1096 return 0;
1097
1098 // If this is a zext/sext of a load, return 0 if the corresponding
1099 // extending load exists on target and the result type is legal.
1100 if (CCH == TTI::CastContextHint::Normal) {
1101 EVT ExtVT = EVT::getEVT(Dst);
1102 EVT LoadVT = EVT::getEVT(Src);
1103 unsigned LType =
1104 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1105 if (DstLT.first == SrcLT.first &&
1106 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1107 return 0;
1108 }
1109 break;
1110 case Instruction::AddrSpaceCast:
1111 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1112 Dst->getPointerAddressSpace()))
1113 return 0;
1114 break;
1115 }
1116
1117 auto *SrcVTy = dyn_cast<VectorType>(Src);
1118 auto *DstVTy = dyn_cast<VectorType>(Dst);
1119
1120 // If the cast is marked as legal (or promote) then assume low cost.
1121 if (SrcLT.first == DstLT.first &&
1122 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1123 return SrcLT.first;
1124
1125 // Handle scalar conversions.
1126 if (!SrcVTy && !DstVTy) {
1127 // Just check the op cost. If the operation is legal then assume it costs
1128 // 1.
1129 if (!TLI->isOperationExpand(ISD, DstLT.second))
1130 return 1;
1131
1132 // Assume that illegal scalar instruction are expensive.
1133 return 4;
1134 }
1135
1136 // Check vector-to-vector casts.
1137 if (DstVTy && SrcVTy) {
1138 // If the cast is between same-sized registers, then the check is simple.
1139 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1140
1141 // Assume that Zext is done using AND.
1142 if (Opcode == Instruction::ZExt)
1143 return SrcLT.first;
1144
1145 // Assume that sext is done using SHL and SRA.
1146 if (Opcode == Instruction::SExt)
1147 return SrcLT.first * 2;
1148
1149 // Just check the op cost. If the operation is legal then assume it
1150 // costs
1151 // 1 and multiply by the type-legalization overhead.
1152 if (!TLI->isOperationExpand(ISD, DstLT.second))
1153 return SrcLT.first * 1;
1154 }
1155
1156 // If we are legalizing by splitting, query the concrete TTI for the cost
1157 // of casting the original vector twice. We also need to factor in the
1158 // cost of the split itself. Count that as 1, to be consistent with
1159 // getTypeLegalizationCost().
1160 bool SplitSrc =
1161 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1163 bool SplitDst =
1164 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1166 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1167 DstVTy->getElementCount().isVector()) {
1168 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1169 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1170 T *TTI = static_cast<T *>(this);
1171 // If both types need to be split then the split is free.
1172 InstructionCost SplitCost =
1173 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1174 return SplitCost +
1175 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1176 CostKind, I));
1177 }
1178
1179 // Scalarization cost is Invalid, can't assume any num elements.
1180 if (isa<ScalableVectorType>(DstVTy))
1181 return InstructionCost::getInvalid();
1182
1183 // In other cases where the source or destination are illegal, assume
1184 // the operation will get scalarized.
1185 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1186 InstructionCost Cost = thisT()->getCastInstrCost(
1187 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1188
1189 // Return the cost of multiple scalar invocations plus the cost of
1190 // inserting and extracting the values.
1191 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1192 CostKind) +
1193 Num * Cost;
1194 }
1195
1196 // We already handled vector-to-vector and scalar-to-scalar conversions.
1197 // This
1198 // is where we handle bitcast between vectors and scalars. We need to assume
1199 // that the conversion is scalarized in one way or another.
1200 if (Opcode == Instruction::BitCast) {
1201 // Illegal bitcasts are done by storing and loading from a stack slot.
1202 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1203 /*Extract*/ true, CostKind)
1204 : 0) +
1205 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1206 /*Extract*/ false, CostKind)
1207 : 0);
1208 }
1209
1210 llvm_unreachable("Unhandled cast");
1211 }
1212
1214 VectorType *VecTy, unsigned Index) {
1216 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1217 CostKind, Index, nullptr, nullptr) +
1218 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1220 }
1221
1223 const Instruction *I = nullptr) {
1224 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1225 }
1226
1227 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1228 CmpInst::Predicate VecPred,
1230 const Instruction *I = nullptr) {
1231 const TargetLoweringBase *TLI = getTLI();
1232 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1233 assert(ISD && "Invalid opcode");
1234
1235 // TODO: Handle other cost kinds.
1237 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1238 I);
1239
1240 // Selects on vectors are actually vector selects.
1241 if (ISD == ISD::SELECT) {
1242 assert(CondTy && "CondTy must exist");
1243 if (CondTy->isVectorTy())
1244 ISD = ISD::VSELECT;
1245 }
1246 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1247
1248 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1249 !TLI->isOperationExpand(ISD, LT.second)) {
1250 // The operation is legal. Assume it costs 1. Multiply
1251 // by the type-legalization overhead.
1252 return LT.first * 1;
1253 }
1254
1255 // Otherwise, assume that the cast is scalarized.
1256 // TODO: If one of the types get legalized by splitting, handle this
1257 // similarly to what getCastInstrCost() does.
1258 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1259 if (isa<ScalableVectorType>(ValTy))
1260 return InstructionCost::getInvalid();
1261
1262 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1263 if (CondTy)
1264 CondTy = CondTy->getScalarType();
1265 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1266 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1267
1268 // Return the cost of multiple scalar invocations plus the cost of
1269 // inserting and extracting the values.
1270 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1271 /*Extract*/ false, CostKind) +
1272 Num * Cost;
1273 }
1274
1275 // Unknown scalar opcode.
1276 return 1;
1277 }
1278
1281 unsigned Index, Value *Op0, Value *Op1) {
1282 return getRegUsageForType(Val->getScalarType());
1283 }
1284
1287 unsigned Index) {
1288 Value *Op0 = nullptr;
1289 Value *Op1 = nullptr;
1290 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1291 Op0 = IE->getOperand(0);
1292 Op1 = IE->getOperand(1);
1293 }
1294 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1295 Op1);
1296 }
1297
1298 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1299 int VF,
1300 const APInt &DemandedDstElts,
1302 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1303 "Unexpected size of DemandedDstElts.");
1304
1306
1307 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1308 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1309
1310 // The Mask shuffling cost is extract all the elements of the Mask
1311 // and insert each of them Factor times into the wide vector:
1312 //
1313 // E.g. an interleaved group with factor 3:
1314 // %mask = icmp ult <8 x i32> %vec1, %vec2
1315 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1316 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1317 // The cost is estimated as extract all mask elements from the <8xi1> mask
1318 // vector and insert them factor times into the <24xi1> shuffled mask
1319 // vector.
1320 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1321 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1322 /*Insert*/ false,
1323 /*Extract*/ true, CostKind);
1324 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1325 /*Insert*/ true,
1326 /*Extract*/ false, CostKind);
1327
1328 return Cost;
1329 }
1330
1332 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1335 const Instruction *I = nullptr) {
1336 assert(!Src->isVoidTy() && "Invalid type");
1337 // Assume types, such as structs, are expensive.
1338 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1339 return 4;
1340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1341
1342 // Assuming that all loads of legal types cost 1.
1343 InstructionCost Cost = LT.first;
1345 return Cost;
1346
1347 const DataLayout &DL = this->getDataLayout();
1348 if (Src->isVectorTy() &&
1349 // In practice it's not currently possible to have a change in lane
1350 // length for extending loads or truncating stores so both types should
1351 // have the same scalable property.
1353 LT.second.getSizeInBits())) {
1354 // This is a vector load that legalizes to a larger type than the vector
1355 // itself. Unless the corresponding extending load or truncating store is
1356 // legal, then this will scalarize.
1358 EVT MemVT = getTLI()->getValueType(DL, Src);
1359 if (Opcode == Instruction::Store)
1360 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1361 else
1362 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1363
1364 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1365 // This is a vector load/store for some illegal type that is scalarized.
1366 // We must account for the cost of building or decomposing the vector.
1368 cast<VectorType>(Src), Opcode != Instruction::Store,
1369 Opcode == Instruction::Store, CostKind);
1370 }
1371 }
1372
1373 return Cost;
1374 }
1375
1377 Align Alignment, unsigned AddressSpace,
1379 // TODO: Pass on AddressSpace when we have test coverage.
1380 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1381 CostKind);
1382 }
1383
1385 const Value *Ptr, bool VariableMask,
1386 Align Alignment,
1388 const Instruction *I = nullptr) {
1389 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1390 true, CostKind);
1391 }
1392
1394 const Value *Ptr, bool VariableMask,
1395 Align Alignment,
1397 const Instruction *I) {
1398 // For a target without strided memory operations (or for an illegal
1399 // operation type on one which does), assume we lower to a gather/scatter
1400 // operation. (Which may in turn be scalarized.)
1401 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1402 Alignment, CostKind, I);
1403 }
1404
1406 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1407 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1408 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1409
1410 // We cannot scalarize scalable vectors, so return Invalid.
1411 if (isa<ScalableVectorType>(VecTy))
1412 return InstructionCost::getInvalid();
1413
1414 auto *VT = cast<FixedVectorType>(VecTy);
1415
1416 unsigned NumElts = VT->getNumElements();
1417 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1418
1419 unsigned NumSubElts = NumElts / Factor;
1420 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1421
1422 // Firstly, the cost of load/store operation.
1424 if (UseMaskForCond || UseMaskForGaps)
1425 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1427 else
1428 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1429 CostKind);
1430
1431 // Legalize the vector type, and get the legalized and unlegalized type
1432 // sizes.
1433 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1434 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1435 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1436
1437 // Scale the cost of the memory operation by the fraction of legalized
1438 // instructions that will actually be used. We shouldn't account for the
1439 // cost of dead instructions since they will be removed.
1440 //
1441 // E.g., An interleaved load of factor 8:
1442 // %vec = load <16 x i64>, <16 x i64>* %ptr
1443 // %v0 = shufflevector %vec, undef, <0, 8>
1444 //
1445 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1446 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1447 // type). The other loads are unused.
1448 //
1449 // TODO: Note that legalization can turn masked loads/stores into unmasked
1450 // (legalized) loads/stores. This can be reflected in the cost.
1451 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1452 // The number of loads of a legal type it will take to represent a load
1453 // of the unlegalized vector type.
1454 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1455
1456 // The number of elements of the unlegalized type that correspond to a
1457 // single legal instruction.
1458 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1459
1460 // Determine which legal instructions will be used.
1461 BitVector UsedInsts(NumLegalInsts, false);
1462 for (unsigned Index : Indices)
1463 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1464 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1465
1466 // Scale the cost of the load by the fraction of legal instructions that
1467 // will be used.
1468 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1469 }
1470
1471 // Then plus the cost of interleave operation.
1472 assert(Indices.size() <= Factor &&
1473 "Interleaved memory op has too many members");
1474
1475 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1476 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1477
1478 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1479 for (unsigned Index : Indices) {
1480 assert(Index < Factor && "Invalid index for interleaved memory op");
1481 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1482 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1483 }
1484
1485 if (Opcode == Instruction::Load) {
1486 // The interleave cost is similar to extract sub vectors' elements
1487 // from the wide vector, and insert them into sub vectors.
1488 //
1489 // E.g. An interleaved load of factor 2 (with one member of index 0):
1490 // %vec = load <8 x i32>, <8 x i32>* %ptr
1491 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1492 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1493 // <8 x i32> vector and insert them into a <4 x i32> vector.
1494 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1495 SubVT, DemandedAllSubElts,
1496 /*Insert*/ true, /*Extract*/ false, CostKind);
1497 Cost += Indices.size() * InsSubCost;
1498 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1499 /*Insert*/ false,
1500 /*Extract*/ true, CostKind);
1501 } else {
1502 // The interleave cost is extract elements from sub vectors, and
1503 // insert them into the wide vector.
1504 //
1505 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1506 // (using VF=4):
1507 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1508 // %gaps.mask = <true, true, false, true, true, false,
1509 // true, true, false, true, true, false>
1510 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1511 // i32 Align, <12 x i1> %gaps.mask
1512 // The cost is estimated as extract all elements (of actual members,
1513 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1514 // i32> vector.
1515 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1516 SubVT, DemandedAllSubElts,
1517 /*Insert*/ false, /*Extract*/ true, CostKind);
1518 Cost += ExtSubCost * Indices.size();
1519 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1520 /*Insert*/ true,
1521 /*Extract*/ false, CostKind);
1522 }
1523
1524 if (!UseMaskForCond)
1525 return Cost;
1526
1527 Type *I8Type = Type::getInt8Ty(VT->getContext());
1528
1529 Cost += thisT()->getReplicationShuffleCost(
1530 I8Type, Factor, NumSubElts,
1531 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1532 CostKind);
1533
1534 // The Gaps mask is invariant and created outside the loop, therefore the
1535 // cost of creating it is not accounted for here. However if we have both
1536 // a MaskForGaps and some other mask that guards the execution of the
1537 // memory access, we need to account for the cost of And-ing the two masks
1538 // inside the loop.
1539 if (UseMaskForGaps) {
1540 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1541 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1542 CostKind);
1543 }
1544
1545 return Cost;
1546 }
1547
1548 /// Get intrinsic cost based on arguments.
1551 // Check for generically free intrinsics.
1553 return 0;
1554
1555 // Assume that target intrinsics are cheap.
1556 Intrinsic::ID IID = ICA.getID();
1559
1560 if (ICA.isTypeBasedOnly())
1562
1563 Type *RetTy = ICA.getReturnType();
1564
1565 ElementCount RetVF =
1566 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1568 const IntrinsicInst *I = ICA.getInst();
1569 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1570 FastMathFlags FMF = ICA.getFlags();
1571 switch (IID) {
1572 default:
1573 break;
1574
1575 case Intrinsic::powi:
1576 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1577 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1578 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1579 ShouldOptForSize)) {
1580 // The cost is modeled on the expansion performed by ExpandPowI in
1581 // SelectionDAGBuilder.
1582 APInt Exponent = RHSC->getValue().abs();
1583 unsigned ActiveBits = Exponent.getActiveBits();
1584 unsigned PopCount = Exponent.popcount();
1585 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1586 thisT()->getArithmeticInstrCost(
1587 Instruction::FMul, RetTy, CostKind);
1588 if (RHSC->isNegative())
1589 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1590 CostKind);
1591 return Cost;
1592 }
1593 }
1594 break;
1595 case Intrinsic::cttz:
1596 // FIXME: If necessary, this should go in target-specific overrides.
1597 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1599 break;
1600
1601 case Intrinsic::ctlz:
1602 // FIXME: If necessary, this should go in target-specific overrides.
1603 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1605 break;
1606
1607 case Intrinsic::memcpy:
1608 return thisT()->getMemcpyCost(ICA.getInst());
1609
1610 case Intrinsic::masked_scatter: {
1611 const Value *Mask = Args[3];
1612 bool VarMask = !isa<Constant>(Mask);
1613 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1614 return thisT()->getGatherScatterOpCost(Instruction::Store,
1615 ICA.getArgTypes()[0], Args[1],
1616 VarMask, Alignment, CostKind, I);
1617 }
1618 case Intrinsic::masked_gather: {
1619 const Value *Mask = Args[2];
1620 bool VarMask = !isa<Constant>(Mask);
1621 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1622 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1623 VarMask, Alignment, CostKind, I);
1624 }
1625 case Intrinsic::experimental_vp_strided_store: {
1626 const Value *Data = Args[0];
1627 const Value *Ptr = Args[1];
1628 const Value *Mask = Args[3];
1629 const Value *EVL = Args[4];
1630 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1631 Align Alignment = I->getParamAlign(1).valueOrOne();
1632 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1633 Data->getType(), Ptr, VarMask,
1634 Alignment, CostKind, I);
1635 }
1636 case Intrinsic::experimental_vp_strided_load: {
1637 const Value *Ptr = Args[0];
1638 const Value *Mask = Args[2];
1639 const Value *EVL = Args[3];
1640 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1641 Align Alignment = I->getParamAlign(0).valueOrOne();
1642 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1643 VarMask, Alignment, CostKind, I);
1644 }
1645 case Intrinsic::experimental_stepvector: {
1646 if (isa<ScalableVectorType>(RetTy))
1648 // The cost of materialising a constant integer vector.
1650 }
1651 case Intrinsic::vector_extract: {
1652 // FIXME: Handle case where a scalable vector is extracted from a scalable
1653 // vector
1654 if (isa<ScalableVectorType>(RetTy))
1656 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1657 return thisT()->getShuffleCost(
1658 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1659 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1660 }
1661 case Intrinsic::vector_insert: {
1662 // FIXME: Handle case where a scalable vector is inserted into a scalable
1663 // vector
1664 if (isa<ScalableVectorType>(Args[1]->getType()))
1666 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1667 return thisT()->getShuffleCost(
1668 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1669 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1670 }
1671 case Intrinsic::vector_reverse: {
1672 return thisT()->getShuffleCost(
1673 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1674 CostKind, 0, cast<VectorType>(RetTy));
1675 }
1676 case Intrinsic::vector_splice: {
1677 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1678 return thisT()->getShuffleCost(
1679 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1680 CostKind, Index, cast<VectorType>(RetTy));
1681 }
1682 case Intrinsic::vector_reduce_add:
1683 case Intrinsic::vector_reduce_mul:
1684 case Intrinsic::vector_reduce_and:
1685 case Intrinsic::vector_reduce_or:
1686 case Intrinsic::vector_reduce_xor:
1687 case Intrinsic::vector_reduce_smax:
1688 case Intrinsic::vector_reduce_smin:
1689 case Intrinsic::vector_reduce_fmax:
1690 case Intrinsic::vector_reduce_fmin:
1691 case Intrinsic::vector_reduce_fmaximum:
1692 case Intrinsic::vector_reduce_fminimum:
1693 case Intrinsic::vector_reduce_umax:
1694 case Intrinsic::vector_reduce_umin: {
1695 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1697 }
1698 case Intrinsic::vector_reduce_fadd:
1699 case Intrinsic::vector_reduce_fmul: {
1701 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1703 }
1704 case Intrinsic::fshl:
1705 case Intrinsic::fshr: {
1706 const Value *X = Args[0];
1707 const Value *Y = Args[1];
1708 const Value *Z = Args[2];
1711 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1712 const TTI::OperandValueInfo OpInfoBW =
1714 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1715 : TTI::OP_None};
1716
1717 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1718 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1720 Cost +=
1721 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1722 Cost +=
1723 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1724 Cost += thisT()->getArithmeticInstrCost(
1725 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1726 {OpInfoZ.Kind, TTI::OP_None});
1727 Cost += thisT()->getArithmeticInstrCost(
1728 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1729 {OpInfoZ.Kind, TTI::OP_None});
1730 // Non-constant shift amounts require a modulo.
1731 if (!OpInfoZ.isConstant())
1732 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1733 CostKind, OpInfoZ, OpInfoBW);
1734 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1735 if (X != Y) {
1736 Type *CondTy = RetTy->getWithNewBitWidth(1);
1737 Cost +=
1738 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1740 Cost +=
1741 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1743 }
1744 return Cost;
1745 }
1746 case Intrinsic::get_active_lane_mask: {
1747 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1748 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1749
1750 // If we're not expanding the intrinsic then we assume this is cheap
1751 // to implement.
1752 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1753 return getTypeLegalizationCost(RetTy).first;
1754 }
1755
1756 // Create the expanded types that will be used to calculate the uadd_sat
1757 // operation.
1758 Type *ExpRetTy = VectorType::get(
1759 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1760 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1761 InstructionCost Cost =
1762 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1763 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1764 CmpInst::ICMP_ULT, CostKind);
1765 return Cost;
1766 }
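// When the mask must be expanded, the terms above model the usual
// "index vector + compare against the trip count" lowering: one vector
// uadd.sat over the widened element type plus one vector icmp that yields
// the i1 lane mask.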
1767 case Intrinsic::experimental_cttz_elts: {
1768 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1769
1770 // If we're not expanding the intrinsic then we assume this is cheap
1771 // to implement.
1772 if (!getTLI()->shouldExpandCttzElements(ArgType))
1773 return getTypeLegalizationCost(RetTy).first;
1774
1775 // TODO: The costs below reflect the expansion code in
1776 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1777 // favour of compile time.
1778
1779 // Find the smallest "sensible" element type to use for the expansion.
1780 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1781 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1782 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1783 VScaleRange = getVScaleRange(I->getCaller(), 64);
1784
1785 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1786 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1787 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1788
1789 // Create the new vector type & get the vector length
1790 Type *NewVecTy = VectorType::get(
1791 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1792
1793 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1794 NewVecTy, {}, FMF);
1795 InstructionCost Cost =
1796 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1797
1798 Cost +=
1799 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1800 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1801 Args[0]->getType(),
1802 TTI::CastContextHint::None, CostKind);
1803 Cost +=
1804 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1805
1806 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1807 NewEltTy, NewVecTy, FMF, I, 1);
1808 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1809 Cost +=
1810 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1811
1812 return Cost;
1813 }
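// The sum above mirrors the SelectionDAGBuilder expansion referenced in the
// TODO: a stepvector, a vector sub, a sign-extension of the i1 source mask,
// a vector and, a umax reduction over the masked indices, and a final
// scalar sub.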
1814 }
1815
1816 // VP Intrinsics should have the same cost as their non-vp counterpart.
1817 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1818 // counterpart when the vector length argument is smaller than the maximum
1819 // vector length.
1820 // TODO: Support other kinds of VPIntrinsics
1821 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1822 std::optional<unsigned> FOp =
1823 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1824 if (FOp) {
1825 if (ICA.getID() == Intrinsic::vp_load) {
1826 Align Alignment;
1827 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1828 Alignment = VPI->getPointerAlignment().valueOrOne();
1829 unsigned AS = 0;
1830 if (ICA.getArgs().size() > 1)
1831 if (auto *PtrTy =
1832 dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1833 AS = PtrTy->getAddressSpace();
1834 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1835 AS, CostKind);
1836 }
1837 if (ICA.getID() == Intrinsic::vp_store) {
1838 Align Alignment;
1839 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1840 Alignment = VPI->getPointerAlignment().valueOrOne();
1841 unsigned AS = 0;
1842 if (ICA.getArgs().size() >= 2)
1843 if (auto *PtrTy =
1844 dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1845 AS = PtrTy->getAddressSpace();
1846 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1847 AS, CostKind);
1848 }
1849 if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1850 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1851 CostKind);
1852 }
1853 }
1854
1855 std::optional<Intrinsic::ID> FID =
1856 VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1857 if (FID) {
1858 // Non-vp version will have same Args/Tys except mask and vector length.
1859 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1860 "Expected VPIntrinsic to have Mask and Vector Length args and "
1861 "types");
1863
1864 // VPReduction intrinsics have a start value argument that their non-vp
1865 // counterparts do not have, except for the fadd and fmul non-vp
1866 // counterpart.
1867 if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
1868 *FID != Intrinsic::vector_reduce_fadd &&
1869 *FID != Intrinsic::vector_reduce_fmul)
1870 NewTys = NewTys.drop_front();
1871
1872 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1873 ICA.getFlags());
1874 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1875 }
1876 }
1877
1878 // Assume that we need to scalarize this intrinsic.
1879 // Compute the scalarization overhead based on Args for a vector
1880 // intrinsic.
1881 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1882 if (RetVF.isVector() && !RetVF.isScalable()) {
1883 ScalarizationCost = 0;
1884 if (!RetTy->isVoidTy())
1885 ScalarizationCost += getScalarizationOverhead(
1886 cast<VectorType>(RetTy),
1887 /*Insert*/ true, /*Extract*/ false, CostKind);
1888 ScalarizationCost +=
1889 getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1890 }
1891
1892 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1893 ScalarizationCost);
1894 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1895 }
1896
1897 /// Get intrinsic cost based on argument types.
1898 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1899 /// cost of scalarizing the arguments and the return value will be computed
1900 /// based on types.
1901 InstructionCost
1902 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1903 TTI::TargetCostKind CostKind) {
1904 Intrinsic::ID IID = ICA.getID();
1905 Type *RetTy = ICA.getReturnType();
1906 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1907 FastMathFlags FMF = ICA.getFlags();
1908 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1909 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1910
1911 VectorType *VecOpTy = nullptr;
1912 if (!Tys.empty()) {
1913 // The vector reduction operand is operand 0 except for fadd/fmul.
1914 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1915 unsigned VecTyIndex = 0;
1916 if (IID == Intrinsic::vector_reduce_fadd ||
1917 IID == Intrinsic::vector_reduce_fmul)
1918 VecTyIndex = 1;
1919 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1920 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1921 }
1922
1923 // Library call cost - other than size, make it expensive.
1924 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1925 unsigned ISD = 0;
1926 switch (IID) {
1927 default: {
1928 // Scalable vectors cannot be scalarized, so return Invalid.
1929 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1930 return isa<ScalableVectorType>(Ty);
1931 }))
1932 return InstructionCost::getInvalid();
1933
1934 // Assume that we need to scalarize this intrinsic.
1935 InstructionCost ScalarizationCost =
1936 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1937 unsigned ScalarCalls = 1;
1938 Type *ScalarRetTy = RetTy;
1939 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1940 if (!SkipScalarizationCost)
1941 ScalarizationCost = getScalarizationOverhead(
1942 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1943 ScalarCalls = std::max(ScalarCalls,
1944 cast<FixedVectorType>(RetVTy)->getNumElements());
1945 ScalarRetTy = RetTy->getScalarType();
1946 }
1947 SmallVector<Type *, 4> ScalarTys;
1948 for (Type *Ty : Tys) {
1949 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1950 if (!SkipScalarizationCost)
1951 ScalarizationCost += getScalarizationOverhead(
1952 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1953 ScalarCalls = std::max(ScalarCalls,
1954 cast<FixedVectorType>(VTy)->getNumElements());
1955 Ty = Ty->getScalarType();
1956 }
1957 ScalarTys.push_back(Ty);
1958 }
1959 if (ScalarCalls == 1)
1960 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1961
1962 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1963 InstructionCost ScalarCost =
1964 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1965
1966 return ScalarCalls * ScalarCost + ScalarizationCost;
1967 }
1968 // Look for intrinsics that can be lowered directly or turned into a scalar
1969 // intrinsic call.
1970 case Intrinsic::sqrt:
1971 ISD = ISD::FSQRT;
1972 break;
1973 case Intrinsic::sin:
1974 ISD = ISD::FSIN;
1975 break;
1976 case Intrinsic::cos:
1977 ISD = ISD::FCOS;
1978 break;
1979 case Intrinsic::tan:
1980 ISD = ISD::FTAN;
1981 break;
1982 case Intrinsic::exp:
1983 ISD = ISD::FEXP;
1984 break;
1985 case Intrinsic::exp2:
1986 ISD = ISD::FEXP2;
1987 break;
1988 case Intrinsic::exp10:
1989 ISD = ISD::FEXP10;
1990 break;
1991 case Intrinsic::log:
1992 ISD = ISD::FLOG;
1993 break;
1994 case Intrinsic::log10:
1995 ISD = ISD::FLOG10;
1996 break;
1997 case Intrinsic::log2:
1998 ISD = ISD::FLOG2;
1999 break;
2000 case Intrinsic::fabs:
2001 ISD = ISD::FABS;
2002 break;
2003 case Intrinsic::canonicalize:
2004 ISD = ISD::FCANONICALIZE;
2005 break;
2006 case Intrinsic::minnum:
2007 ISD = ISD::FMINNUM;
2008 break;
2009 case Intrinsic::maxnum:
2010 ISD = ISD::FMAXNUM;
2011 break;
2012 case Intrinsic::minimum:
2013 ISD = ISD::FMINIMUM;
2014 break;
2015 case Intrinsic::maximum:
2016 ISD = ISD::FMAXIMUM;
2017 break;
2018 case Intrinsic::copysign:
2019 ISD = ISD::FCOPYSIGN;
2020 break;
2021 case Intrinsic::floor:
2022 ISD = ISD::FFLOOR;
2023 break;
2024 case Intrinsic::ceil:
2025 ISD = ISD::FCEIL;
2026 break;
2027 case Intrinsic::trunc:
2028 ISD = ISD::FTRUNC;
2029 break;
2030 case Intrinsic::nearbyint:
2031 ISD = ISD::FNEARBYINT;
2032 break;
2033 case Intrinsic::rint:
2034 ISD = ISD::FRINT;
2035 break;
2036 case Intrinsic::lrint:
2037 ISD = ISD::LRINT;
2038 break;
2039 case Intrinsic::llrint:
2040 ISD = ISD::LLRINT;
2041 break;
2042 case Intrinsic::round:
2043 ISD = ISD::FROUND;
2044 break;
2045 case Intrinsic::roundeven:
2046 ISD = ISD::FROUNDEVEN;
2047 break;
2048 case Intrinsic::pow:
2049 ISD = ISD::FPOW;
2050 break;
2051 case Intrinsic::fma:
2052 ISD = ISD::FMA;
2053 break;
2054 case Intrinsic::fmuladd:
2055 ISD = ISD::FMA;
2056 break;
2057 case Intrinsic::experimental_constrained_fmuladd:
2058 ISD = ISD::STRICT_FMA;
2059 break;
2060 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2061 case Intrinsic::lifetime_start:
2062 case Intrinsic::lifetime_end:
2063 case Intrinsic::sideeffect:
2064 case Intrinsic::pseudoprobe:
2065 case Intrinsic::arithmetic_fence:
2066 return 0;
2067 case Intrinsic::masked_store: {
2068 Type *Ty = Tys[0];
2069 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2070 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2071 CostKind);
2072 }
2073 case Intrinsic::masked_load: {
2074 Type *Ty = RetTy;
2075 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2076 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2077 CostKind);
2078 }
2079 case Intrinsic::vector_reduce_add:
2080 case Intrinsic::vector_reduce_mul:
2081 case Intrinsic::vector_reduce_and:
2082 case Intrinsic::vector_reduce_or:
2083 case Intrinsic::vector_reduce_xor:
2084 return thisT()->getArithmeticReductionCost(
2085 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2086 CostKind);
2087 case Intrinsic::vector_reduce_fadd:
2088 case Intrinsic::vector_reduce_fmul:
2089 return thisT()->getArithmeticReductionCost(
2090 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2091 case Intrinsic::vector_reduce_smax:
2092 case Intrinsic::vector_reduce_smin:
2093 case Intrinsic::vector_reduce_umax:
2094 case Intrinsic::vector_reduce_umin:
2095 case Intrinsic::vector_reduce_fmax:
2096 case Intrinsic::vector_reduce_fmin:
2097 case Intrinsic::vector_reduce_fmaximum:
2098 case Intrinsic::vector_reduce_fminimum:
2099 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2100 VecOpTy, ICA.getFlags(), CostKind);
2101 case Intrinsic::abs: {
2102 // abs(X) = select(icmp(X,0),X,sub(0,X))
2103 Type *CondTy = RetTy->getWithNewBitWidth(1);
2104 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2105 InstructionCost Cost = 0;
2106 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2107 Pred, CostKind);
2108 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2109 Pred, CostKind);
2110 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2111 Cost += thisT()->getArithmeticInstrCost(
2112 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
2113 return Cost;
2114 }
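// For example, for <4 x i32> the generic expansion being priced is
//   %neg = sub <4 x i32> zeroinitializer, %x
//   %pos = icmp sgt <4 x i32> %x, zeroinitializer
//   %abs = select <4 x i1> %pos, <4 x i32> %x, <4 x i32> %neg
// i.e. one compare, one select and one subtract from zero.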
2115 case Intrinsic::smax:
2116 case Intrinsic::smin:
2117 case Intrinsic::umax:
2118 case Intrinsic::umin: {
2119 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2120 Type *CondTy = RetTy->getWithNewBitWidth(1);
2121 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2122 CmpInst::Predicate Pred =
2123 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2124 InstructionCost Cost = 0;
2125 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2126 Pred, CostKind);
2127 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2128 Pred, CostKind);
2129 return Cost;
2130 }
2131 case Intrinsic::sadd_sat:
2132 case Intrinsic::ssub_sat: {
2133 Type *CondTy = RetTy->getWithNewBitWidth(1);
2134
2135 Type *OpTy = StructType::create({RetTy, CondTy});
2136 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2137 ? Intrinsic::sadd_with_overflow
2138 : Intrinsic::ssub_with_overflow;
2139 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2140
2141 // SatMax -> Overflow && SumDiff < 0
2142 // SatMin -> Overflow && SumDiff >= 0
2143 InstructionCost Cost = 0;
2144 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2145 nullptr, ScalarizationCostPassed);
2146 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2147 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2148 Pred, CostKind);
2149 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2150 CondTy, Pred, CostKind);
2151 return Cost;
2152 }
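// Sketch of the modelled sequence for sadd.sat (ssub.sat is analogous):
//   {%sum, %ovf} = sadd.with.overflow(X, Y)
//   %sat = select (icmp slt %sum, 0), SMAX, SMIN   ; saturate towards the sign
//   %res = select %ovf, %sat, %sum
// i.e. the overflow intrinsic plus one icmp and two selects, as summed above.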
2153 case Intrinsic::uadd_sat:
2154 case Intrinsic::usub_sat: {
2155 Type *CondTy = RetTy->getWithNewBitWidth(1);
2156
2157 Type *OpTy = StructType::create({RetTy, CondTy});
2158 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2159 ? Intrinsic::uadd_with_overflow
2160 : Intrinsic::usub_with_overflow;
2161
2162 InstructionCost Cost = 0;
2163 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2164 nullptr, ScalarizationCostPassed);
2165 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2166 Cost +=
2167 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2168 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2169 return Cost;
2170 }
2171 case Intrinsic::smul_fix:
2172 case Intrinsic::umul_fix: {
2173 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2174 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2175
2176 unsigned ExtOp =
2177 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2178 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2179
2180 InstructionCost Cost = 0;
2181 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2182 Cost +=
2183 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2184 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2185 CCH, CostKind);
2186 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2187 CostKind,
2188 {TTI::OK_AnyValue, TTI::OP_None},
2189 {TTI::OK_UniformConstantValue, TTI::OP_None});
2190 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2191 {TTI::OK_AnyValue, TTI::OP_None},
2192 {TTI::OK_UniformConstantValue, TTI::OP_None});
2193 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2194 return Cost;
2195 }
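// Roughly: the fixed-point multiply is priced as widening both operands,
// one double-width multiply, two truncates, and an lshr + shl + or to
// reassemble the scaled result.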
2196 case Intrinsic::sadd_with_overflow:
2197 case Intrinsic::ssub_with_overflow: {
2198 Type *SumTy = RetTy->getContainedType(0);
2199 Type *OverflowTy = RetTy->getContainedType(1);
2200 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2201 ? BinaryOperator::Add
2202 : BinaryOperator::Sub;
2203
2204 // Add:
2205 // Overflow -> (Result < LHS) ^ (RHS < 0)
2206 // Sub:
2207 // Overflow -> (Result < LHS) ^ (RHS > 0)
2208 InstructionCost Cost = 0;
2209 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2210 Cost += 2 * thisT()->getCmpSelInstrCost(
2211 Instruction::ICmp, SumTy, OverflowTy,
2212 CmpInst::ICMP_SGT, CostKind);
2213 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2214 CostKind);
2215 return Cost;
2216 }
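// E.g. for i32 sadd.with.overflow the modelled sequence is
//   %sum = add i32 %x, %y
//   %c1  = icmp slt i32 %sum, %x
//   %c2  = icmp slt i32 %y, 0        ; icmp sgt for ssub.with.overflow
//   %ovf = xor i1 %c1, %c2
// matching the add, two compares and one xor added above.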
2217 case Intrinsic::uadd_with_overflow:
2218 case Intrinsic::usub_with_overflow: {
2219 Type *SumTy = RetTy->getContainedType(0);
2220 Type *OverflowTy = RetTy->getContainedType(1);
2221 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2222 ? BinaryOperator::Add
2223 : BinaryOperator::Sub;
2224 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2225 ? CmpInst::ICMP_ULT
2226 : CmpInst::ICMP_UGT;
2227
2228 InstructionCost Cost = 0;
2229 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2230 Cost +=
2231 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2232 Pred, CostKind);
2233 return Cost;
2234 }
2235 case Intrinsic::smul_with_overflow:
2236 case Intrinsic::umul_with_overflow: {
2237 Type *MulTy = RetTy->getContainedType(0);
2238 Type *OverflowTy = RetTy->getContainedType(1);
2239 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2240 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2241 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2242
2243 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2244 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2245
2246 InstructionCost Cost = 0;
2247 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2248 Cost +=
2249 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2250 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2251 CCH, CostKind);
2252 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2253 CostKind,
2254 {TTI::OK_AnyValue, TTI::OP_None},
2255 {TTI::OK_UniformConstantValue, TTI::OP_None});
2256
2257 if (IsSigned)
2258 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2259 CostKind,
2260 {TTI::OK_AnyValue, TTI::OP_None},
2261 {TTI::OK_UniformConstantValue, TTI::OP_None});
2262
2263 Cost += thisT()->getCmpSelInstrCost(
2264 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2265 return Cost;
2266 }
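// Roughly: widen both operands, multiply at double width, truncate back and
// shift the high half down, then compare it (icmp ne) against the low half's
// sign bits (the extra ashr term in the signed case) to produce the
// overflow flag.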
2267 case Intrinsic::fptosi_sat:
2268 case Intrinsic::fptoui_sat: {
2269 if (Tys.empty())
2270 break;
2271 Type *FromTy = Tys[0];
2272 bool IsSigned = IID == Intrinsic::fptosi_sat;
2273
2274 InstructionCost Cost = 0;
2275 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2276 {FromTy, FromTy});
2277 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2278 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2279 {FromTy, FromTy});
2280 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2281 Cost += thisT()->getCastInstrCost(
2282 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2283 TTI::CastContextHint::None, CostKind);
2284 if (IsSigned) {
2285 Type *CondTy = RetTy->getWithNewBitWidth(1);
2286 Cost += thisT()->getCmpSelInstrCost(
2287 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2288 Cost += thisT()->getCmpSelInstrCost(
2289 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2290 }
2291 return Cost;
2292 }
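// The saturating conversion is thus priced as a clamp (minnum + maxnum) plus
// the plain fptosi/fptoui; the signed form adds an fcmp uno and a select that
// force NaN inputs to zero.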
2293 case Intrinsic::ctpop:
2294 ISD = ISD::CTPOP;
2295 // In case of legalization use TCC_Expensive. This is cheaper than a
2296 // library call but still not a cheap instruction.
2297 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2298 break;
2299 case Intrinsic::ctlz:
2300 ISD = ISD::CTLZ;
2301 break;
2302 case Intrinsic::cttz:
2303 ISD = ISD::CTTZ;
2304 break;
2305 case Intrinsic::bswap:
2306 ISD = ISD::BSWAP;
2307 break;
2308 case Intrinsic::bitreverse:
2309 ISD = ISD::BITREVERSE;
2310 break;
2311 }
2312
2313 const TargetLoweringBase *TLI = getTLI();
2314 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2315
2316 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2317 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2318 TLI->isFAbsFree(LT.second)) {
2319 return 0;
2320 }
2321
2322 // The operation is legal. Assume it costs 1.
2323 // If the type is split to multiple registers, assume that there is some
2324 // overhead to this.
2325 // TODO: Once we have extract/insert subvector cost we need to use them.
2326 if (LT.first > 1)
2327 return (LT.first * 2);
2328 else
2329 return (LT.first * 1);
2330 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2331 // If the operation is custom lowered then assume
2332 // that the code is twice as expensive.
2333 return (LT.first * 2);
2334 }
2335
2336 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2337 // point mul followed by an add.
2338 if (IID == Intrinsic::fmuladd)
2339 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2340 CostKind) +
2341 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2342 CostKind);
2343 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2344 IntrinsicCostAttributes FMulAttrs(
2345 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2346 IntrinsicCostAttributes FAddAttrs(
2347 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2348 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2349 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2350 }
2351
2352 // Else, assume that we need to scalarize this intrinsic. For math builtins
2353 // this will emit a costly libcall, adding call overhead and spills. Make it
2354 // very expensive.
2355 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2356 // Scalable vectors cannot be scalarized, so return Invalid.
2357 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2358 return isa<ScalableVectorType>(Ty);
2359 }))
2360 return InstructionCost::getInvalid();
2361
2362 InstructionCost ScalarizationCost =
2363 SkipScalarizationCost
2364 ? ScalarizationCostPassed
2365 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2366 /*Extract*/ false, CostKind);
2367
2368 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2369 SmallVector<Type *, 4> ScalarTys;
2370 for (Type *Ty : Tys) {
2371 if (Ty->isVectorTy())
2372 Ty = Ty->getScalarType();
2373 ScalarTys.push_back(Ty);
2374 }
2375 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2376 InstructionCost ScalarCost =
2377 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2378 for (Type *Ty : Tys) {
2379 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2380 if (!ICA.skipScalarizationCost())
2381 ScalarizationCost += getScalarizationOverhead(
2382 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2383 ScalarCalls = std::max(ScalarCalls,
2384 cast<FixedVectorType>(VTy)->getNumElements());
2385 }
2386 }
2387 return ScalarCalls * ScalarCost + ScalarizationCost;
2388 }
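// For example, under this fallback a <4 x float> call to a math builtin is
// priced as four scalar calls plus the scalarization overhead of inserting
// the four results and extracting the lanes of each vector operand.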
2389
2390 // This is going to be turned into a library call, make it expensive.
2391 return SingleCallCost;
2392 }
2393
2394 /// Compute a cost of the given call instruction.
2395 ///
2396 /// Compute the cost of calling function F with return type RetTy and
2397 /// argument types Tys. F might be nullptr, in this case the cost of an
2398 /// arbitrary call with the specified signature will be returned.
2399 /// This is used, for instance, when we estimate call of a vector
2400 /// counterpart of the given function.
2401 /// \param F Called function, might be nullptr.
2402 /// \param RetTy Return value types.
2403 /// \param Tys Argument types.
2404 /// \returns The cost of Call instruction.
2405 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2406 ArrayRef<Type *> Tys,
2407 TTI::TargetCostKind CostKind) {
2408 return 10;
2409 }
2410
2411 unsigned getNumberOfParts(Type *Tp) {
2412 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2413 return LT.first.isValid() ? *LT.first.getValue() : 0;
2414 }
2415
2416 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2417 const SCEV *) {
2418 return 0;
2419 }
2420
2421 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2422 /// We're assuming that reduction operation are performing the following way:
2423 ///
2424 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2425 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2426 /// \----------------v-------------/ \----------v------------/
2427 /// n/2 elements n/2 elements
2428 /// %red1 = op <n x t> %val, <n x t> val1
2429 /// After this operation we have a vector %red1 where only the first n/2
2430 /// elements are meaningful, the second n/2 elements are undefined and can be
2431 /// dropped. All other operations are actually working with the vector of
2432 /// length n/2, not n, though the real vector length is still n.
2433 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2434 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2435 /// \----------------v-------------/ \----------v------------/
2436 /// n/4 elements 3*n/4 elements
2437 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2438 /// length n/2, the resulting vector has length n/4 etc.
2439 ///
2440 /// The cost model should take into account that the actual length of the
2441 /// vector is reduced on each iteration.
2442 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2443 TTI::TargetCostKind CostKind) {
2444 // Targets must implement a default value for the scalable case, since
2445 // we don't know how many lanes the vector has.
2446 if (isa<ScalableVectorType>(Ty))
2447 return InstructionCost::getInvalid();
2448
2449 Type *ScalarTy = Ty->getElementType();
2450 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2451 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2452 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2453 NumVecElts >= 2) {
2454 // Or reduction for i1 is represented as:
2455 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2456 // %res = cmp ne iReduxWidth %val, 0
2457 // And reduction for i1 is represented as:
2458 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2459 // %res = cmp eq iReduxWidth %val, 11111
2460 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2461 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2462 TTI::CastContextHint::None, CostKind) +
2463 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2464 CmpInst::makeCmpResultType(ValTy),
2465 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2466 }
2467 unsigned NumReduxLevels = Log2_32(NumVecElts);
2468 InstructionCost ArithCost = 0;
2469 InstructionCost ShuffleCost = 0;
2470 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2471 unsigned LongVectorCount = 0;
2472 unsigned MVTLen =
2473 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2474 while (NumVecElts > MVTLen) {
2475 NumVecElts /= 2;
2476 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2477 ShuffleCost +=
2478 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2479 CostKind, NumVecElts, SubTy);
2480 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2481 Ty = SubTy;
2482 ++LongVectorCount;
2483 }
2484
2485 NumReduxLevels -= LongVectorCount;
2486
2487 // The minimal length of the vector is limited by the real length of vector
2488 // operations performed on the current platform. That's why several final
2489 // reduction operations are performed on the vectors with the same
2490 // architecture-dependent length.
2491
2492 // By default reductions need one shuffle per reduction level.
2493 ShuffleCost +=
2494 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2495 std::nullopt, CostKind, 0, Ty);
2496 ArithCost +=
2497 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2498 return ShuffleCost + ArithCost +
2499 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2500 CostKind, 0, nullptr, nullptr);
2501 }
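// Worked example, assuming unit cost for every shuffle and arithmetic op:
// reducing <8 x i32> when the widest legal vector is 4 x i32 costs one
// extract-subvector + add to reach <4 x i32>, then log2(4) = 2 more
// shuffle + add levels, plus the final extractelement: 3 adds, 3 shuffles
// and 1 extract in total.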
2502
2503 /// Try to calculate the cost of performing strict (in-order) reductions,
2504 /// which involves doing a sequence of floating point additions in lane
2505 /// order, starting with an initial value. For example, consider a scalar
2506 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2507 ///
2508 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2509 ///
2510 /// %add1 = %InitVal + %v0
2511 /// %add2 = %add1 + %v1
2512 /// %add3 = %add2 + %v2
2513 /// %add4 = %add3 + %v3
2514 ///
2515 /// As a simple estimate we can say the cost of such a reduction is 4 times
2516 /// the cost of a scalar FP addition. We can only estimate the costs for
2517 /// fixed-width vectors here because for scalable vectors we do not know the
2518 /// runtime number of operations.
2519 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2520 TTI::TargetCostKind CostKind) {
2521 // Targets must implement a default value for the scalable case, since
2522 // we don't know how many lanes the vector has.
2523 if (isa<ScalableVectorType>(Ty))
2524 return InstructionCost::getInvalid();
2525
2526 auto *VTy = cast<FixedVectorType>(Ty);
2527 InstructionCost ExtractCost = getScalarizationOverhead(
2528 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2529 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2530 Opcode, VTy->getElementType(), CostKind);
2531 ArithCost *= VTy->getNumElements();
2532
2533 return ExtractCost + ArithCost;
2534 }
2535
2535
2536 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2537 std::optional<FastMathFlags> FMF,
2538 TTI::TargetCostKind CostKind) {
2539 assert(Ty && "Unknown reduction vector type");
2540 if (TTI::requiresOrderedReduction(FMF))
2541 return getOrderedReductionCost(Opcode, Ty, CostKind);
2542 return getTreeReductionCost(Opcode, Ty, CostKind);
2543 }
2544
2545 /// Try to calculate op costs for min/max reduction operations.
2546 /// \param CondTy Conditional type for the Select instruction.
2547 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2548 FastMathFlags FMF,
2549 TTI::TargetCostKind CostKind) {
2550 // Targets must implement a default value for the scalable case, since
2551 // we don't know how many lanes the vector has.
2552 if (isa<ScalableVectorType>(Ty))
2553 return InstructionCost::getInvalid();
2554
2555 Type *ScalarTy = Ty->getElementType();
2556 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2557 unsigned NumReduxLevels = Log2_32(NumVecElts);
2558 InstructionCost MinMaxCost = 0;
2559 InstructionCost ShuffleCost = 0;
2560 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2561 unsigned LongVectorCount = 0;
2562 unsigned MVTLen =
2563 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2564 while (NumVecElts > MVTLen) {
2565 NumVecElts /= 2;
2566 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2567
2568 ShuffleCost +=
2569 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2570 CostKind, NumVecElts, SubTy);
2571
2572 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2573 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2574 Ty = SubTy;
2575 ++LongVectorCount;
2576 }
2577
2578 NumReduxLevels -= LongVectorCount;
2579
2580 // The minimal length of the vector is limited by the real length of vector
2581 // operations performed on the current platform. That's why several final
2582 // reduction operations are performed on the vectors with the same
2583 // architecture-dependent length.
2584 ShuffleCost +=
2585 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2586 std::nullopt, CostKind, 0, Ty);
2587 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2588 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2589 // The last min/max should be in vector registers and we counted it above.
2590 // So just need a single extractelement.
2591 return ShuffleCost + MinMaxCost +
2592 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2593 CostKind, 0, nullptr, nullptr);
2594 }
2595
2596 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2597 Type *ResTy, VectorType *Ty,
2598 FastMathFlags FMF,
2599 TTI::TargetCostKind CostKind) {
2600 // Without any native support, this is equivalent to the cost of
2601 // vecreduce.opcode(ext(Ty A)).
2602 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2603 InstructionCost RedCost =
2604 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2605 InstructionCost ExtCost = thisT()->getCastInstrCost(
2606 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2607 TTI::CastContextHint::None, CostKind);
2608
2609 return RedCost + ExtCost;
2610 }
2611
2612 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2613 VectorType *Ty,
2614 TTI::TargetCostKind CostKind) {
2615 // Without any native support, this is equivalent to the cost of
2616 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2617 // vecreduce.add(mul(A, B)).
2618 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2619 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2620 Instruction::Add, ExtTy, std::nullopt, CostKind);
2621 InstructionCost ExtCost = thisT()->getCastInstrCost(
2622 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2623 TTI::CastContextHint::None, CostKind);
2624
2625 InstructionCost MulCost =
2626 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2627
2628 return RedCost + MulCost + 2 * ExtCost;
2629 }
2630
2631 InstructionCost getVectorSplitCost() { return 1; }
2632
2633 /// @}
2634};
2635
2636/// Concrete BasicTTIImpl that can be used if no further customization
2637/// is needed.
2638class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2639 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2640
2641 friend class BasicTTIImplBase<BasicTTIImpl>;
2642
2643 const TargetSubtargetInfo *ST;
2644 const TargetLoweringBase *TLI;
2645
2646 const TargetSubtargetInfo *getST() const { return ST; }
2647 const TargetLoweringBase *getTLI() const { return TLI; }
2648
2649public:
2650 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2651};
2652
2653} // end namespace llvm
2654
2655#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1309
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1180
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1109
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:179
an instruction to allocate memory on the stack
Definition: Instructions.h:60
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:81
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:434
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:287
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:728
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:588
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:561
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:895
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:757
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:762
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:424
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:668
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:675
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:748
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:336
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:428
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:740
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:761
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:439
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:515
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:582
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:450
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:377
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:402
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:389
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:690
bool shouldFoldTerminatingConditionAfterLSR() const
Definition: BasicTTIImpl.h:397
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:732
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:281
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:383
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:309
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:814
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:768
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:353
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:444
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:547
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:708
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:410
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:285
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:680
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:273
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:357
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:798
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:714
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:744
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:323
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:300
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:551
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:861
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:540
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:319
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:580
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:332
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:843
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:763
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:684
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:291
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:328
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:295
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:724
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:305
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:283
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:586
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:697
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:313
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:264
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:581
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:406
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:323
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:902
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
The core instruction combiner logic.
Definition: InstCombiner.h:47
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
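getTypeAction and getTypeConversion drive the iterative type-legalization walk that cost code performs. A simplified sketch (not the exact cost formula, which also weights splitting and expansion steps):
  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Simplified sketch: count how many conversion steps it takes until the
  // type becomes legal for the target.
  static unsigned countLegalizationSteps(const TargetLoweringBase &TLI,
                                         LLVMContext &Ctx, EVT VT) {
    unsigned Steps = 0;
    while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal) {
      TargetLoweringBase::LegalizeKind LK = TLI.getTypeConversion(Ctx, VT);
      VT = LK.second; // continue with whatever the type legalizes to
      ++Steps;
    }
    return Steps;
  }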
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
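getOperandInfo is how callers summarize an operand before asking for an arithmetic cost; a small sketch (the helper name is hypothetical, V stands for any llvm::Value):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Classify an operand once, then hand the summary to a cost query.
  static bool isPowerOfTwoConstantOperand(const Value *V) {
    TargetTransformInfo::OperandValueInfo Info =
        TargetTransformInfo::getOperandInfo(V);
    return Info.isConstant() && Info.isPowerOf2(); // e.g. the 8 in 'x * 8'
  }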
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
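The cost-kind enumerators select which axis a query measures, and the TCC_* constants give the scale answers are expressed in. A hypothetical query (the helper name and the <4 x i32> type are illustrative, not from this file):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Hypothetical query: reciprocal-throughput cost of a <4 x i32> add,
  // treated as cheap if it costs no more than one typical add (TCC_Basic).
  static bool isVectorAddCheap(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
    auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
    InstructionCost C = TTI.getArithmeticInstrCost(
        Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
    return C.isValid() && C <= TargetTransformInfo::TCC_Basic;
  }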
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
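The shuffle kinds above are the first argument to getShuffleCost; for instance (a hedged sketch, the <8 x float> type is arbitrary):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // What does the target charge for broadcasting lane 0 of an <8 x float>
  // vector to all lanes (the SK_Broadcast pattern)?
  static InstructionCost broadcastCost(const TargetTransformInfo &TTI,
                                       LLVMContext &Ctx) {
    auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
    return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
  }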
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
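A CastContextHint is threaded into getCastInstrCost so the target can tell, say, an extension that folds into a load apart from a free-standing one; a small sketch (the i8-to-i32 zext is just an example):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Cost of zero-extending i8 to i32 when the source comes from a normal load.
  static InstructionCost zextFromLoadCost(const TargetTransformInfo &TTI,
                                          LLVMContext &Ctx) {
    return TTI.getCastInstrCost(
        Instruction::ZExt, Type::getInt32Ty(Ctx), Type::getInt8Ty(Ctx),
        TargetTransformInfo::CastContextHint::Normal,
        TargetTransformInfo::TCK_RecipThroughput);
  }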
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1661
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
static bool isVPBinOp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
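The VPIntrinsic helpers above let cost code treat a vector-predicated call like its unpredicated counterpart; for example (a sketch using llvm.vp.add):
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/Intrinsics.h"
  #include <optional>
  using namespace llvm;

  // Map llvm.vp.add back to the plain IR opcode it corresponds to.
  static bool vpAddMatchesAdd() {
    std::optional<unsigned> Opc =
        VPIntrinsic::getFunctionalOpcodeForVP(Intrinsic::vp_add);
    return Opc && *Opc == Instruction::Add;
  }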
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:215
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1073
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:508
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1503
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:921
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
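The math helpers above are worth a worked example (values computed by hand):
  #include "llvm/Support/MathExtras.h"

  // isPowerOf2_32(64) -> true; Log2_32(40) -> 5 (floor); divideCeil(40, 16) -> 3.
  static bool mathHelperExamples() {
    return llvm::isPowerOf2_32(64) && llvm::Log2_32(40) == 5 &&
           llvm::divideCeil(40, 16) == 3;
  }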
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:274
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
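EVTs can be built either from IR types or directly from a bit width; a short sketch (the helper name is invented, Ctx is assumed to be an LLVMContext):
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Build an i32 EVT directly, and derive an EVT from an IR type; a simple
  // EVT can be narrowed to its MVT for switch-style dispatch.
  static MVT exampleEVT(LLVMContext &Ctx) {
    EVT I32 = EVT::getIntegerVT(Ctx, 32);
    EVT FromIR = EVT::getEVT(Type::getInt64Ty(Ctx)); // MVT::i64
    return I32.isSimple() ? I32.getSimpleVT() : FromIR.getSimpleVT();
  }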
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
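The peeling knobs above live in TTI::PeelingPreferences and are typically set from a target's getPeelingPreferences hook; a sketch with invented values:
  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Hypothetical tuning: force two peeled iterations and keep profile-guided
  // peeling enabled.
  static void tunePeeling(TargetTransformInfo::PeelingPreferences &PP) {
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
    PP.PeelCount = 2; // invented value, for illustration only
  }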
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
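The unrolling knobs above belong to TTI::UnrollingPreferences and are normally filled in by a target's getUnrollingPreferences override; a final sketch (thresholds invented for illustration):
  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Hypothetical tuning: enable partial and runtime unrolling with a modest
  // size budget, and refuse to grow loops when optimizing for size.
  static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
    UP.Partial = true;
    UP.Runtime = true;
    UP.UpperBound = true;
    UP.PartialThreshold = 200; // invented budget
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;
  }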