BasicTTIImpl.h (LLVM 20.0.0git)
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/BitVector.h"
33#include "llvm/IR/BasicBlock.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Operator.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Value.h"
52#include <algorithm>
53#include <cassert>
54#include <cstdint>
55#include <limits>
56#include <optional>
57#include <utility>
58
59namespace llvm {
60
61class Function;
62class GlobalValue;
63class LLVMContext;
64class ScalarEvolution;
65class SCEV;
66class TargetMachine;
67
68extern cl::opt<unsigned> PartialUnrollingThreshold;
69
70/// Base class which can be used to help build a TTI implementation.
71///
72/// This class provides as much implementation of the TTI interface as is
73/// possible using the target independent parts of the code generator.
74///
75/// In order to subclass it, your class must implement a getST() method to
76/// return the subtarget, and a getTLI() method to return the target lowering.
77/// We need these methods implemented in the derived class so that this class
78/// doesn't have to duplicate storage for them.
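/// A minimal sketch of a conforming subclass (illustrative names only, not a
/// real target; constructors and the remaining boilerplate are omitted):
/// \code
///   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
///     const MyTargetSubtarget *ST;
///     const MyTargetLowering *TLI;
///
///   public:
///     const MyTargetSubtarget *getST() const { return ST; }
///     const MyTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode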
79template <typename T>
81private:
84
85 /// Helper function to access this as a T.
86 T *thisT() { return static_cast<T *>(this); }
87
88 /// Estimate a cost of Broadcast as an extract and sequence of insert
89 /// operations.
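/// For example, broadcasting lane 0 of a <4 x float> is modeled here as one
/// extractelement plus four insertelement operations.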
90 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
91 TTI::TargetCostKind CostKind) {
92 InstructionCost Cost = 0;
93 // Broadcast cost is equal to the cost of extracting the zero'th element
94 // plus the cost of inserting it into every element of the result vector.
95 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
96 CostKind, 0, nullptr, nullptr);
97
98 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
99 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
100 CostKind, i, nullptr, nullptr);
101 }
102 return Cost;
103 }
104
105 /// Estimate a cost of shuffle as a sequence of extract and insert
106 /// operations.
107 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
108 TTI::TargetCostKind CostKind) {
109 InstructionCost Cost = 0;
110 // Shuffle cost is equal to the cost of extracting the elements from its
111 // arguments plus the cost of inserting them into the result vector.
112
113 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
114 // index 0 of the first vector, index 1 of the second vector, index 2 of the
115 // first vector and finally index 3 of the second vector, and insert them at
116 // indices <0,1,2,3> of the result vector.
117 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
118 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
119 CostKind, i, nullptr, nullptr);
120 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
121 CostKind, i, nullptr, nullptr);
122 }
123 return Cost;
124 }
125
126 /// Estimate a cost of subvector extraction as a sequence of extract and
127 /// insert operations.
128 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
129 TTI::TargetCostKind CostKind,
130 int Index,
131 FixedVectorType *SubVTy) {
132 assert(VTy && SubVTy &&
133 "Can only extract subvectors from vectors");
134 int NumSubElts = SubVTy->getNumElements();
135 assert((!isa<FixedVectorType>(VTy) ||
136 (Index + NumSubElts) <=
137 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
138 "SK_ExtractSubvector index out of range");
139
140 InstructionCost Cost = 0;
141 // Subvector extraction cost is equal to the cost of extracting the elements
142 // from the source type plus the cost of inserting them into the result
143 // vector type.
144 for (int i = 0; i != NumSubElts; ++i) {
145 Cost +=
146 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
147 CostKind, i + Index, nullptr, nullptr);
148 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
149 CostKind, i, nullptr, nullptr);
150 }
151 return Cost;
152 }
153
154 /// Estimate a cost of subvector insertion as a sequence of extract and
155 /// insert operations.
156 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
157 TTI::TargetCostKind CostKind,
158 int Index,
159 FixedVectorType *SubVTy) {
160 assert(VTy && SubVTy &&
161 "Can only insert subvectors into vectors");
162 int NumSubElts = SubVTy->getNumElements();
163 assert((!isa<FixedVectorType>(VTy) ||
164 (Index + NumSubElts) <=
165 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
166 "SK_InsertSubvector index out of range");
167
168 InstructionCost Cost = 0;
169 // Subvector insertion cost is equal to the cost of extracting the elements
170 // from the source type plus the cost of inserting them into the result
171 // vector type.
172 for (int i = 0; i != NumSubElts; ++i) {
173 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
174 CostKind, i, nullptr, nullptr);
175 Cost +=
176 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
177 i + Index, nullptr, nullptr);
178 }
179 return Cost;
180 }
181
182 /// Local query method delegates up to T which *must* implement this!
183 const TargetSubtargetInfo *getST() const {
184 return static_cast<const T *>(this)->getST();
185 }
186
187 /// Local query method delegates up to T which *must* implement this!
188 const TargetLoweringBase *getTLI() const {
189 return static_cast<const T *>(this)->getTLI();
190 }
191
192 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
193 switch (M) {
194 case TTI::MIM_Unindexed:
195 return ISD::UNINDEXED;
196 case TTI::MIM_PreInc:
197 return ISD::PRE_INC;
198 case TTI::MIM_PreDec:
199 return ISD::PRE_DEC;
200 case TTI::MIM_PostInc:
201 return ISD::POST_INC;
202 case TTI::MIM_PostDec:
203 return ISD::POST_DEC;
204 }
205 llvm_unreachable("Unexpected MemIndexedMode");
206 }
207
208 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
209 Align Alignment,
210 bool VariableMask,
211 bool IsGatherScatter,
213 unsigned AddressSpace = 0) {
214 // We cannot scalarize scalable vectors, so return Invalid.
215 if (isa<ScalableVectorType>(DataTy))
216 return InstructionCost::getInvalid();
217
218 auto *VT = cast<FixedVectorType>(DataTy);
219 unsigned VF = VT->getNumElements();
220
221 // Assume the target does not have support for gather/scatter operations
222 // and provide a rough estimate.
223 //
224 // First, compute the cost of the individual memory operations.
225 InstructionCost AddrExtractCost =
226 IsGatherScatter
229 PointerType::get(VT->getElementType(), 0), VF),
230 /*Insert=*/false, /*Extract=*/true, CostKind)
231 : 0;
232
233 // The cost of the scalar loads/stores.
234 InstructionCost MemoryOpCost =
235 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
237
238 // Next, compute the cost of packing the result in a vector.
239 InstructionCost PackingCost =
240 getScalarizationOverhead(VT, Opcode != Instruction::Store,
241 Opcode == Instruction::Store, CostKind);
242
243 InstructionCost ConditionalCost = 0;
244 if (VariableMask) {
245 // Compute the cost of conditionally executing the memory operations with
246 // variable masks. This includes extracting the individual conditions,
247 // branches and PHIs to combine the results.
248 // NOTE: Estimating the cost of conditionally executing the memory
249 // operations accurately is quite difficult and the current solution
250 // provides a very rough estimate only.
251 ConditionalCost =
254 /*Insert=*/false, /*Extract=*/true, CostKind) +
255 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
256 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
257 }
258
259 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
260 }
261
262 /// Checks if the provided mask \p Mask is a splat mask, i.e. it contains only
263 /// -1 or the same non -1 index value, and that index value appears at least
264 /// twice. So, mask <0, -1,-1, -1> is not considered splat (it is just
265 /// identity), same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1>
266 /// is a splat with \p Index=2.
267 static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
268 // Check that the broadcast index occurs at least twice.
269 bool IsCompared = false;
270 if (int SplatIdx = PoisonMaskElem;
271 all_of(enumerate(Mask), [&](const auto &P) {
272 if (P.value() == PoisonMaskElem)
273 return P.index() != Mask.size() - 1 || IsCompared;
274 if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
275 return false;
276 if (SplatIdx == PoisonMaskElem) {
277 SplatIdx = P.value();
278 return P.index() != Mask.size() - 1;
279 }
280 IsCompared = true;
281 return SplatIdx == P.value();
282 })) {
283 Index = SplatIdx;
284 return true;
285 }
286 return false;
287 }
288
289protected:
290 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
291 : BaseT(DL) {}
292 virtual ~BasicTTIImplBase() = default;
293
295
296public:
297 /// \name Scalar TTI Implementations
298 /// @{
300 unsigned AddressSpace, Align Alignment,
301 unsigned *Fast) const {
302 EVT E = EVT::getIntegerVT(Context, BitWidth);
303 return getTLI()->allowsMisalignedMemoryAccesses(
305 }
306
307 bool areInlineCompatible(const Function *Caller,
308 const Function *Callee) const {
309 const TargetMachine &TM = getTLI()->getTargetMachine();
310
311 const FeatureBitset &CallerBits =
312 TM.getSubtargetImpl(*Caller)->getFeatureBits();
313 const FeatureBitset &CalleeBits =
314 TM.getSubtargetImpl(*Callee)->getFeatureBits();
315
316 // Inline a callee if its target-features are a subset of the caller's
317 // target-features.
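// For example (roughly speaking), on x86 a caller compiled with +avx2 (which
// implies +sse4.2 in its feature bits) can inline a callee that only requires
// +sse4.2, but a caller with only +sse4.2 cannot inline an +avx2 callee.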
318 return (CallerBits & CalleeBits) == CalleeBits;
319 }
320
321 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
322
323 bool isSourceOfDivergence(const Value *V) { return false; }
324
325 bool isAlwaysUniform(const Value *V) { return false; }
326
327 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
328 return false;
329 }
330
331 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
332 return true;
333 }
334
336 // Return an invalid address space.
337 return -1;
338 }
339
341 Intrinsic::ID IID) const {
342 return false;
343 }
344
345 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
346 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
347 }
348
349 unsigned getAssumedAddrSpace(const Value *V) const {
350 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
351 }
352
353 bool isSingleThreaded() const {
354 return getTLI()->getTargetMachine().Options.ThreadModel ==
356 }
357
358 std::pair<const Value *, unsigned>
360 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
361 }
362
364 Value *NewV) const {
365 return nullptr;
366 }
367
368 bool isLegalAddImmediate(int64_t imm) {
369 return getTLI()->isLegalAddImmediate(imm);
370 }
371
372 bool isLegalAddScalableImmediate(int64_t Imm) {
373 return getTLI()->isLegalAddScalableImmediate(Imm);
374 }
375
376 bool isLegalICmpImmediate(int64_t imm) {
377 return getTLI()->isLegalICmpImmediate(imm);
378 }
379
380 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
381 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
382 Instruction *I = nullptr,
383 int64_t ScalableOffset = 0) {
384 TargetLoweringBase::AddrMode AM;
385 AM.BaseGV = BaseGV;
386 AM.BaseOffs = BaseOffset;
387 AM.HasBaseReg = HasBaseReg;
388 AM.Scale = Scale;
389 AM.ScalableOffset = ScalableOffset;
390 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
391 }
392
393 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
394 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
395 }
396
397 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
398 Type *ScalarValTy) const {
399 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
400 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
401 EVT VT = getTLI()->getValueType(DL, SrcTy);
402 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
403 getTLI()->isOperationCustom(ISD::STORE, VT))
404 return true;
405
406 EVT ValVT =
407 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
408 EVT LegalizedVT =
409 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
410 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
411 };
412 while (VF > 2 && IsSupportedByTarget(VF))
413 VF /= 2;
414 return VF;
415 }
416
418 const DataLayout &DL) const {
419 EVT VT = getTLI()->getValueType(DL, Ty);
420 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
421 }
422
424 const DataLayout &DL) const {
425 EVT VT = getTLI()->getValueType(DL, Ty);
426 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
427 }
428
431 }
432
435 }
436
439 }
440
443 }
444
446 StackOffset BaseOffset, bool HasBaseReg,
447 int64_t Scale, unsigned AddrSpace) {
448 TargetLoweringBase::AddrMode AM;
449 AM.BaseGV = BaseGV;
450 AM.BaseOffs = BaseOffset.getFixed();
451 AM.HasBaseReg = HasBaseReg;
452 AM.Scale = Scale;
453 AM.ScalableOffset = BaseOffset.getScalable();
454 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
455 return 0;
456 return -1;
457 }
458
459 bool isTruncateFree(Type *Ty1, Type *Ty2) {
460 return getTLI()->isTruncateFree(Ty1, Ty2);
461 }
462
464 return getTLI()->isProfitableToHoist(I);
465 }
466
467 bool useAA() const { return getST()->useAA(); }
468
469 bool isTypeLegal(Type *Ty) {
470 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
471 return getTLI()->isTypeLegal(VT);
472 }
473
474 unsigned getRegUsageForType(Type *Ty) {
475 EVT ETy = getTLI()->getValueType(DL, Ty);
476 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
477 }
478
482 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
483 }
484
486 unsigned &JumpTableSize,
488 BlockFrequencyInfo *BFI) {
489 /// Try to find the estimated number of clusters. Note that the number of
490 /// clusters identified in this function could be different from the actual
491 /// numbers found in lowering. This function ignores switches that are
492 /// lowered with a mix of jump table / bit test / BTree. This function was
493 /// initially intended to be used when estimating the cost of a switch in
494 /// the inline cost heuristic, but it's a generic cost model to be used in
495 /// other places (e.g., in loop unrolling).
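// Worked example (exact numbers depend on the target's TLI hooks): a switch
// with ten contiguous cases 0..9 over many successors is typically dense
// enough for a jump table, so this returns 1 and sets JumpTableSize to 10; ten
// cases scattered across a huge range that suits neither a bit test nor a
// jump table returns N = 10.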
496 unsigned N = SI.getNumCases();
497 const TargetLoweringBase *TLI = getTLI();
498 const DataLayout &DL = this->getDataLayout();
499
500 JumpTableSize = 0;
501 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
502
503 // Early exit if both a jump table and bit test are not allowed.
504 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
505 return N;
506
507 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
508 APInt MinCaseVal = MaxCaseVal;
509 for (auto CI : SI.cases()) {
510 const APInt &CaseVal = CI.getCaseValue()->getValue();
511 if (CaseVal.sgt(MaxCaseVal))
512 MaxCaseVal = CaseVal;
513 if (CaseVal.slt(MinCaseVal))
514 MinCaseVal = CaseVal;
515 }
516
517 // Check if suitable for a bit test
518 if (N <= DL.getIndexSizeInBits(0u)) {
519 SmallPtrSet<const BasicBlock *, 4> Dests;
520 for (auto I : SI.cases())
521 Dests.insert(I.getCaseSuccessor());
522
523 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
524 DL))
525 return 1;
526 }
527
528 // Check if suitable for a jump table.
529 if (IsJTAllowed) {
530 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
531 return N;
532 uint64_t Range =
533 (MaxCaseVal - MinCaseVal)
534 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
535 // Check whether a range of clusters is dense enough for a jump table
536 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
537 JumpTableSize = Range;
538 return 1;
539 }
540 }
541 return N;
542 }
543
545 const TargetLoweringBase *TLI = getTLI();
546 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
547 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
548 }
549
551 const TargetMachine &TM = getTLI()->getTargetMachine();
552 // If non-PIC mode, do not generate a relative lookup table.
553 if (!TM.isPositionIndependent())
554 return false;
555
556 /// Relative lookup table entries consist of 32-bit offsets.
557 /// Do not generate relative lookup tables for large code models
558 /// in 64-bit architectures where 32-bit offsets might not be enough.
559 if (TM.getCodeModel() == CodeModel::Medium ||
560 TM.getCodeModel() == CodeModel::Large)
561 return false;
562
563 const Triple &TargetTriple = TM.getTargetTriple();
564 if (!TargetTriple.isArch64Bit())
565 return false;
566
567 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
568 // there.
569 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
570 return false;
571
572 return true;
573 }
574
575 bool haveFastSqrt(Type *Ty) {
576 const TargetLoweringBase *TLI = getTLI();
577 EVT VT = TLI->getValueType(DL, Ty);
578 return TLI->isTypeLegal(VT) &&
580 }
581
583 return true;
584 }
585
587 // Check whether FADD is available, as a proxy for floating-point in
588 // general.
589 const TargetLoweringBase *TLI = getTLI();
590 EVT VT = TLI->getValueType(DL, Ty);
594 }
595
597 const Function &Fn) const {
598 switch (Inst.getOpcode()) {
599 default:
600 break;
601 case Instruction::SDiv:
602 case Instruction::SRem:
603 case Instruction::UDiv:
604 case Instruction::URem: {
605 if (!isa<ConstantInt>(Inst.getOperand(1)))
606 return false;
607 EVT VT = getTLI()->getValueType(DL, Inst.getType());
608 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
609 }
610 };
611
612 return false;
613 }
614
615 unsigned getInliningThresholdMultiplier() const { return 1; }
616 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
617 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
618 return 0;
619 }
620
621 int getInlinerVectorBonusPercent() const { return 150; }
622
626 // This unrolling functionality is target independent, but to provide some
627 // motivation for its intended use, for x86:
628
629 // According to the Intel 64 and IA-32 Architectures Optimization Reference
630 // Manual, Intel Core models and later have a loop stream detector (and
631 // associated uop queue) that can benefit from partial unrolling.
632 // The relevant requirements are:
633 // - The loop must have no more than 4 (8 for Nehalem and later) branches
634 // taken, and none of them may be calls.
635 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
636
637 // According to the Software Optimization Guide for AMD Family 15h
638 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
639 // and loop buffer which can benefit from partial unrolling.
640 // The relevant requirements are:
641 // - The loop must have fewer than 16 branches
642 // - The loop must have less than 40 uops in all executed loop branches
643
644 // The number of taken branches in a loop is hard to estimate here, and
645 // benchmarking has revealed that it is better not to be conservative when
646 // estimating the branch count. As a result, we'll ignore the branch limits
647 // until someone finds a case where it matters in practice.
648
649 unsigned MaxOps;
650 const TargetSubtargetInfo *ST = getST();
651 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
652 MaxOps = PartialUnrollingThreshold;
653 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
654 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
655 else
656 return;
657
658 // Scan the loop: don't unroll loops with calls.
659 for (BasicBlock *BB : L->blocks()) {
660 for (Instruction &I : *BB) {
661 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
662 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
663 if (!thisT()->isLoweredToCall(F))
664 continue;
665 }
666
667 if (ORE) {
668 ORE->emit([&]() {
669 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
670 L->getHeader())
671 << "advising against unrolling the loop because it "
672 "contains a "
673 << ore::NV("Call", &I);
674 });
675 }
676 return;
677 }
678 }
679 }
680
681 // Enable runtime and partial unrolling up to the specified size.
682 // Enable using trip count upper bound to unroll loops.
683 UP.Partial = UP.Runtime = UP.UpperBound = true;
684 UP.PartialThreshold = MaxOps;
685
686 // Avoid unrolling when optimizing for size.
687 UP.OptSizeThreshold = 0;
689
690 // Set number of instructions optimized when "back edge"
691 // becomes "fall through" to default value of 2.
692 UP.BEInsns = 2;
693 }
694
697 PP.PeelCount = 0;
698 PP.AllowPeeling = true;
699 PP.AllowLoopNestsPeeling = false;
700 PP.PeelProfiledIterations = true;
701 }
702
704 AssumptionCache &AC,
705 TargetLibraryInfo *LibInfo,
706 HardwareLoopInfo &HWLoopInfo) {
707 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
708 }
709
712 }
713
716 }
717
719 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
720 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
721 }
722
723 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
724 IntrinsicInst &II) {
726 }
727
728 std::optional<Value *>
730 APInt DemandedMask, KnownBits &Known,
731 bool &KnownBitsComputed) {
732 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
733 KnownBitsComputed);
734 }
735
737 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
738 APInt &UndefElts2, APInt &UndefElts3,
739 std::function<void(Instruction *, unsigned, APInt, APInt &)>
740 SimplifyAndSetOp) {
742 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
743 SimplifyAndSetOp);
744 }
745
746 virtual std::optional<unsigned>
748 return std::optional<unsigned>(
749 getST()->getCacheSize(static_cast<unsigned>(Level)));
750 }
751
752 virtual std::optional<unsigned>
754 std::optional<unsigned> TargetResult =
755 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
756
757 if (TargetResult)
758 return TargetResult;
759
760 return BaseT::getCacheAssociativity(Level);
761 }
762
763 virtual unsigned getCacheLineSize() const {
764 return getST()->getCacheLineSize();
765 }
766
767 virtual unsigned getPrefetchDistance() const {
768 return getST()->getPrefetchDistance();
769 }
770
771 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
772 unsigned NumStridedMemAccesses,
773 unsigned NumPrefetches,
774 bool HasCall) const {
775 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
776 NumPrefetches, HasCall);
777 }
778
779 virtual unsigned getMaxPrefetchIterationsAhead() const {
780 return getST()->getMaxPrefetchIterationsAhead();
781 }
782
783 virtual bool enableWritePrefetching() const {
784 return getST()->enableWritePrefetching();
785 }
786
787 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
788 return getST()->shouldPrefetchAddressSpace(AS);
789 }
790
791 /// @}
792
793 /// \name Vector TTI Implementations
794 /// @{
795
797 return TypeSize::getFixed(32);
798 }
799
800 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
801 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
802 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
803
804 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
805 /// are set if the demanded result elements need to be inserted and/or
806 /// extracted from vectors.
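/// For example, with DemandedElts = 0b0101 on a <4 x i32> vector and only
/// Insert set, lanes 0 and 2 are counted, which this base implementation
/// models as two insertelement operations.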
807 InstructionCost getScalarizationOverhead(VectorType *InTy,
808 const APInt &DemandedElts,
809 bool Insert, bool Extract,
810 TTI::TargetCostKind CostKind,
811 ArrayRef<Value *> VL = {}) {
812 /// FIXME: a bitfield is not a reasonable abstraction for talking about
813 /// which elements are needed from a scalable vector
814 if (isa<ScalableVectorType>(InTy))
815 return InstructionCost::getInvalid();
816 auto *Ty = cast<FixedVectorType>(InTy);
817
818 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
819 (VL.empty() || VL.size() == Ty->getNumElements()) &&
820 "Vector size mismatch");
821
823
824 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
825 if (!DemandedElts[i])
826 continue;
827 if (Insert) {
828 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
829 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
830 CostKind, i, nullptr, InsertedVal);
831 }
832 if (Extract)
833 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
834 CostKind, i, nullptr, nullptr);
835 }
836
837 return Cost;
838 }
839
841 return false;
842 }
843
845 unsigned ScalarOpdIdx) const {
846 return false;
847 }
848
850 int OpdIdx) const {
851 return OpdIdx == -1;
852 }
853
855 int RetIdx) const {
856 return RetIdx == 0;
857 }
858
859 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
861 bool Extract,
863 if (isa<ScalableVectorType>(InTy))
864 return InstructionCost::getInvalid();
865 auto *Ty = cast<FixedVectorType>(InTy);
866
867 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
868 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
869 CostKind);
870 }
871
872 /// Estimate the overhead of scalarizing an instruction's unique
873 /// non-constant operands. The (potentially vector) types to use for each
874 /// argument are passed via Tys.
879 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
880
882 SmallPtrSet<const Value*, 4> UniqueOperands;
883 for (int I = 0, E = Args.size(); I != E; I++) {
884 // Disregard things like metadata arguments.
885 const Value *A = Args[I];
886 Type *Ty = Tys[I];
887 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
888 !Ty->isPtrOrPtrVectorTy())
889 continue;
890
891 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
892 if (auto *VecTy = dyn_cast<VectorType>(Ty))
893 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
894 /*Extract*/ true, CostKind);
895 }
896 }
897
898 return Cost;
899 }
900
901 /// Estimate the overhead of scalarizing the inputs and outputs of an
902 /// instruction, with return type RetTy and arguments Args of type Tys. If
903 /// Args are unknown (empty), then the cost associated with one argument is
904 /// added as a heuristic.
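/// For example, for a <4 x float> result with no argument information, this
/// counts four result inserts plus the extracts for one assumed vector
/// argument of the same type (at whatever per-element cost the target
/// reports).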
905 InstructionCost getScalarizationOverhead(VectorType *RetTy,
906 ArrayRef<const Value *> Args,
907 ArrayRef<Type *> Tys,
908 TTI::TargetCostKind CostKind) {
909 InstructionCost Cost = getScalarizationOverhead(
910 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
911 if (!Args.empty())
912 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
913 else
914 // When no information on arguments is provided, we add the cost
915 // associated with one argument as a heuristic.
916 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
917 /*Extract*/ true, CostKind);
918
919 return Cost;
920 }
921
922 /// Estimate the cost of type-legalization and the legalized type.
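/// For example, on a hypothetical target whose widest legal vector register
/// is 128 bits, a <16 x i32> value is split twice (to <8 x i32>, then to
/// <4 x i32>), so this returns a cost of 4 together with MVT::v4i32.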
923 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
924 LLVMContext &C = Ty->getContext();
925 EVT MTy = getTLI()->getValueType(DL, Ty);
926
928 // We keep legalizing the type until we find a legal kind. We assume that
929 // the only operation that costs anything is the split. After splitting
930 // we need to handle two types.
931 while (true) {
933
935 // Ensure we return a sensible simple VT here, since many callers of
936 // this function require it.
937 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
938 return std::make_pair(InstructionCost::getInvalid(), VT);
939 }
940
941 if (LK.first == TargetLoweringBase::TypeLegal)
942 return std::make_pair(Cost, MTy.getSimpleVT());
943
944 if (LK.first == TargetLoweringBase::TypeSplitVector ||
946 Cost *= 2;
947
948 // Do not loop with f128 type.
949 if (MTy == LK.second)
950 return std::make_pair(Cost, MTy.getSimpleVT());
951
952 // Keep legalizing the type.
953 MTy = LK.second;
954 }
955 }
956
957 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
958
960 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
963 ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) {
964 // Check if any of the operands are vector operands.
965 const TargetLoweringBase *TLI = getTLI();
966 int ISD = TLI->InstructionOpcodeToISD(Opcode);
967 assert(ISD && "Invalid opcode");
968
969 // TODO: Handle more cost kinds.
971 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
972 Opd1Info, Opd2Info,
973 Args, CxtI);
974
975 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
976
977 bool IsFloat = Ty->isFPOrFPVectorTy();
978 // Assume that floating point arithmetic operations cost twice as much as
979 // integer operations.
980 InstructionCost OpCost = (IsFloat ? 2 : 1);
981
982 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
983 // The operation is legal. Assume it costs 1.
984 // TODO: Once we have extract/insert subvector cost we need to use them.
985 return LT.first * OpCost;
986 }
987
988 if (!TLI->isOperationExpand(ISD, LT.second)) {
989 // If the operation is custom lowered, then assume that the code is twice
990 // as expensive.
991 return LT.first * 2 * OpCost;
992 }
993
994 // An 'Expand' of URem and SRem is special because it may default
995 // to expanding the operation into a sequence of sub-operations
996 // i.e. X % Y -> X-(X/Y)*Y.
997 if (ISD == ISD::UREM || ISD == ISD::SREM) {
998 bool IsSigned = ISD == ISD::SREM;
999 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
1000 LT.second) ||
1001 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
1002 LT.second)) {
1003 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
1004 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
1005 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
1006 InstructionCost MulCost =
1007 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
1008 InstructionCost SubCost =
1009 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
1010 return DivCost + MulCost + SubCost;
1011 }
1012 }
1013
1014 // We cannot scalarize scalable vectors, so return Invalid.
1015 if (isa<ScalableVectorType>(Ty))
1016 return InstructionCost::getInvalid();
1017
1018 // Else, assume that we need to scalarize this op.
1019 // TODO: If one of the types get legalized by splitting, handle this
1020 // similarly to what getCastInstrCost() does.
1021 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1022 InstructionCost Cost = thisT()->getArithmeticInstrCost(
1023 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
1024 Args, CxtI);
1025 // Return the cost of multiple scalar invocations plus the cost of
1026 // inserting and extracting the values.
1027 SmallVector<Type *> Tys(Args.size(), Ty);
1028 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1029 VTy->getNumElements() * Cost;
1030 }
1031
1032 // We don't know anything about this scalar instruction.
1033 return OpCost;
1034 }
1035
1036 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
1037 ArrayRef<int> Mask,
1038 VectorType *Ty, int &Index,
1039 VectorType *&SubTy) const {
1040 if (Mask.empty())
1041 return Kind;
1042 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
1043 switch (Kind) {
1045 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
1046 return TTI::SK_Reverse;
1047 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
1048 return TTI::SK_Broadcast;
1049 if (isSplatMask(Mask, NumSrcElts, Index))
1050 return TTI::SK_Broadcast;
1051 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
1052 (Index + Mask.size()) <= (size_t)NumSrcElts) {
1053 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
1055 }
1056 break;
1057 }
1058 case TTI::SK_PermuteTwoSrc: {
1059 int NumSubElts;
1060 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
1061 Mask, NumSrcElts, NumSubElts, Index)) {
1062 if (Index + NumSubElts > NumSrcElts)
1063 return Kind;
1064 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
1066 }
1067 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1068 return TTI::SK_Select;
1069 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1070 return TTI::SK_Transpose;
1071 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1072 return TTI::SK_Splice;
1073 break;
1074 }
1075 case TTI::SK_Select:
1076 case TTI::SK_Reverse:
1077 case TTI::SK_Broadcast:
1078 case TTI::SK_Transpose:
1081 case TTI::SK_Splice:
1082 break;
1083 }
1084 return Kind;
1085 }
1086
1088 ArrayRef<int> Mask,
1090 VectorType *SubTp,
1091 ArrayRef<const Value *> Args = {},
1092 const Instruction *CxtI = nullptr) {
1093 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1094 case TTI::SK_Broadcast:
1095 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1096 return getBroadcastShuffleOverhead(FVT, CostKind);
1098 case TTI::SK_Select:
1099 case TTI::SK_Splice:
1100 case TTI::SK_Reverse:
1101 case TTI::SK_Transpose:
1104 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1105 return getPermuteShuffleOverhead(FVT, CostKind);
1108 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1109 cast<FixedVectorType>(SubTp));
1111 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1112 cast<FixedVectorType>(SubTp));
1113 }
1114 llvm_unreachable("Unknown TTI::ShuffleKind");
1115 }
1116
1117 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1120 const Instruction *I = nullptr) {
1121 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1122 return 0;
1123
1124 const TargetLoweringBase *TLI = getTLI();
1125 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1126 assert(ISD && "Invalid opcode");
1127 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1128 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1129
1130 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1131 TypeSize DstSize = DstLT.second.getSizeInBits();
1132 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1133 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1134
1135 switch (Opcode) {
1136 default:
1137 break;
1138 case Instruction::Trunc:
1139 // Check for NOOP conversions.
1140 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1141 return 0;
1142 [[fallthrough]];
1143 case Instruction::BitCast:
1144 // Bitcast between types that are legalized to the same type are free and
1145 // assume int to/from ptr of the same size is also free.
1146 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1147 SrcSize == DstSize)
1148 return 0;
1149 break;
1150 case Instruction::FPExt:
1151 if (I && getTLI()->isExtFree(I))
1152 return 0;
1153 break;
1154 case Instruction::ZExt:
1155 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1156 return 0;
1157 [[fallthrough]];
1158 case Instruction::SExt:
1159 if (I && getTLI()->isExtFree(I))
1160 return 0;
1161
1162 // If this is a zext/sext of a load, return 0 if the corresponding
1163 // extending load exists on target and the result type is legal.
1164 if (CCH == TTI::CastContextHint::Normal) {
1165 EVT ExtVT = EVT::getEVT(Dst);
1166 EVT LoadVT = EVT::getEVT(Src);
1167 unsigned LType =
1168 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1169 if (DstLT.first == SrcLT.first &&
1170 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1171 return 0;
1172 }
1173 break;
1174 case Instruction::AddrSpaceCast:
1175 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1176 Dst->getPointerAddressSpace()))
1177 return 0;
1178 break;
1179 }
1180
1181 auto *SrcVTy = dyn_cast<VectorType>(Src);
1182 auto *DstVTy = dyn_cast<VectorType>(Dst);
1183
1184 // If the cast is marked as legal (or promote) then assume low cost.
1185 if (SrcLT.first == DstLT.first &&
1186 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1187 return SrcLT.first;
1188
1189 // Handle scalar conversions.
1190 if (!SrcVTy && !DstVTy) {
1191 // Just check the op cost. If the operation is legal then assume it costs
1192 // 1.
1193 if (!TLI->isOperationExpand(ISD, DstLT.second))
1194 return 1;
1195
1196 // Assume that illegal scalar instructions are expensive.
1197 return 4;
1198 }
1199
1200 // Check vector-to-vector casts.
1201 if (DstVTy && SrcVTy) {
1202 // If the cast is between same-sized registers, then the check is simple.
1203 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1204
1205 // Assume that Zext is done using AND.
1206 if (Opcode == Instruction::ZExt)
1207 return SrcLT.first;
1208
1209 // Assume that sext is done using SHL and SRA.
1210 if (Opcode == Instruction::SExt)
1211 return SrcLT.first * 2;
1212
1213 // Just check the op cost. If the operation is legal then assume it
1214 // costs
1215 // 1 and multiply by the type-legalization overhead.
1216 if (!TLI->isOperationExpand(ISD, DstLT.second))
1217 return SrcLT.first * 1;
1218 }
1219
1220 // If we are legalizing by splitting, query the concrete TTI for the cost
1221 // of casting the original vector twice. We also need to factor in the
1222 // cost of the split itself. Count that as 1, to be consistent with
1223 // getTypeLegalizationCost().
1224 bool SplitSrc =
1225 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1227 bool SplitDst =
1228 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1230 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1231 DstVTy->getElementCount().isVector()) {
1232 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1233 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1234 T *TTI = static_cast<T *>(this);
1235 // If both types need to be split then the split is free.
1236 InstructionCost SplitCost =
1237 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1238 return SplitCost +
1239 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1240 CostKind, I));
1241 }
1242
1243 // Scalarization cost is Invalid, can't assume any num elements.
1244 if (isa<ScalableVectorType>(DstVTy))
1246
1247 // In other cases where the source or destination are illegal, assume
1248 // the operation will get scalarized.
1249 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1250 InstructionCost Cost = thisT()->getCastInstrCost(
1251 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1252
1253 // Return the cost of multiple scalar invocations plus the cost of
1254 // inserting and extracting the values.
1255 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1256 CostKind) +
1257 Num * Cost;
1258 }
1259
1260 // We already handled vector-to-vector and scalar-to-scalar conversions.
1261 // This is where we handle bitcast between vectors and scalars.
1262 // We need to assume that the conversion is scalarized
1263 // in one way or another.
1264 if (Opcode == Instruction::BitCast) {
1265 // Illegal bitcasts are done by storing and loading from a stack slot.
1266 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1267 /*Extract*/ true, CostKind)
1268 : 0) +
1269 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1270 /*Extract*/ false, CostKind)
1271 : 0);
1272 }
1273
1274 llvm_unreachable("Unhandled cast");
1275 }
1276
1278 VectorType *VecTy, unsigned Index) {
1280 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1281 CostKind, Index, nullptr, nullptr) +
1282 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1284 }
1285
1287 const Instruction *I = nullptr) {
1288 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1289 }
1290
1292 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1296 const Instruction *I = nullptr) {
1297 const TargetLoweringBase *TLI = getTLI();
1298 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1299 assert(ISD && "Invalid opcode");
1300
1301 // TODO: Handle other cost kinds.
1303 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1304 Op1Info, Op2Info, I);
1305
1306 // Selects on vectors are actually vector selects.
1307 if (ISD == ISD::SELECT) {
1308 assert(CondTy && "CondTy must exist");
1309 if (CondTy->isVectorTy())
1310 ISD = ISD::VSELECT;
1311 }
1312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1313
1314 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1315 !TLI->isOperationExpand(ISD, LT.second)) {
1316 // The operation is legal. Assume it costs 1. Multiply
1317 // by the type-legalization overhead.
1318 return LT.first * 1;
1319 }
1320
1321 // Otherwise, assume that the cast is scalarized.
1322 // TODO: If one of the types get legalized by splitting, handle this
1323 // similarly to what getCastInstrCost() does.
1324 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1325 if (isa<ScalableVectorType>(ValTy))
1326 return InstructionCost::getInvalid();
1327
1328 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1329 if (CondTy)
1330 CondTy = CondTy->getScalarType();
1331 InstructionCost Cost =
1332 thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy,
1333 VecPred, CostKind, Op1Info, Op2Info, I);
1334
1335 // Return the cost of multiple scalar invocations plus the cost of
1336 // inserting and extracting the values.
1337 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1338 /*Extract*/ false, CostKind) +
1339 Num * Cost;
1340 }
1341
1342 // Unknown scalar opcode.
1343 return 1;
1344 }
1345
1348 unsigned Index, Value *Op0, Value *Op1) {
1349 return getRegUsageForType(Val->getScalarType());
1350 }
1351
1352 /// \param ScalarUserAndIdx encodes the information about extracts from a
1353 /// vector with 'Scalar' being the value being extracted, 'User' being the
1354 /// user of the extract (nullptr if the user is not known before
1355 /// vectorization) and 'Idx' being the extract lane.
1357 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1358 Value *Scalar,
1359 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1360 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1361 nullptr);
1362 }
1363
1366 unsigned Index) {
1367 Value *Op0 = nullptr;
1368 Value *Op1 = nullptr;
1369 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1370 Op0 = IE->getOperand(0);
1371 Op1 = IE->getOperand(1);
1372 }
1373 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1374 Op1);
1375 }
1376
1377 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1378 int VF,
1379 const APInt &DemandedDstElts,
1381 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1382 "Unexpected size of DemandedDstElts.");
1383
1385
1386 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1387 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1388
1389 // The mask shuffling cost is that of extracting all the elements of the
1390 // mask and inserting each of them Factor times into the wide vector:
1391 //
1392 // E.g. an interleaved group with factor 3:
1393 // %mask = icmp ult <8 x i32> %vec1, %vec2
1394 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1395 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1396 // The cost is estimated as extract all mask elements from the <8xi1> mask
1397 // vector and insert them factor times into the <24xi1> shuffled mask
1398 // vector.
1399 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1400 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1401 /*Insert*/ false,
1402 /*Extract*/ true, CostKind);
1403 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1404 /*Insert*/ true,
1405 /*Extract*/ false, CostKind);
1406
1407 return Cost;
1408 }
1409
1411 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1414 const Instruction *I = nullptr) {
1415 assert(!Src->isVoidTy() && "Invalid type");
1416 // Assume types, such as structs, are expensive.
1417 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1418 return 4;
1419 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1420
1421 // Assuming that all loads of legal types cost 1.
1422 InstructionCost Cost = LT.first;
1424 return Cost;
1425
1426 const DataLayout &DL = this->getDataLayout();
1427 if (Src->isVectorTy() &&
1428 // In practice it's not currently possible to have a change in lane
1429 // length for extending loads or truncating stores so both types should
1430 // have the same scalable property.
1432 LT.second.getSizeInBits())) {
1433 // This is a vector load that legalizes to a larger type than the vector
1434 // itself. Unless the corresponding extending load or truncating store is
1435 // legal, then this will scalarize.
1437 EVT MemVT = getTLI()->getValueType(DL, Src);
1438 if (Opcode == Instruction::Store)
1439 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1440 else
1441 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1442
1443 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1444 // This is a vector load/store for some illegal type that is scalarized.
1445 // We must account for the cost of building or decomposing the vector.
1447 cast<VectorType>(Src), Opcode != Instruction::Store,
1448 Opcode == Instruction::Store, CostKind);
1449 }
1450 }
1451
1452 return Cost;
1453 }
1454
1456 Align Alignment, unsigned AddressSpace,
1458 // TODO: Pass on AddressSpace when we have test coverage.
1459 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1460 CostKind);
1461 }
1462
1464 const Value *Ptr, bool VariableMask,
1465 Align Alignment,
1467 const Instruction *I = nullptr) {
1468 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1469 true, CostKind);
1470 }
1471
1473 const Value *Ptr, bool VariableMask,
1474 Align Alignment,
1476 const Instruction *I) {
1477 // For a target without strided memory operations (or for an illegal
1478 // operation type on one which does), assume we lower to a gather/scatter
1479 // operation. (Which may in turn be scalarized.)
1480 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1481 Alignment, CostKind, I);
1482 }
1483
1485 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1486 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1487 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1488
1489 // We cannot scalarize scalable vectors, so return Invalid.
1490 if (isa<ScalableVectorType>(VecTy))
1491 return InstructionCost::getInvalid();
1492
1493 auto *VT = cast<FixedVectorType>(VecTy);
1494
1495 unsigned NumElts = VT->getNumElements();
1496 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1497
1498 unsigned NumSubElts = NumElts / Factor;
1499 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1500
1501 // Firstly, the cost of load/store operation.
1503 if (UseMaskForCond || UseMaskForGaps)
1504 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1506 else
1507 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1508 CostKind);
1509
1510 // Legalize the vector type, and get the legalized and unlegalized type
1511 // sizes.
1512 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1513 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1514 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1515
1516 // Scale the cost of the memory operation by the fraction of legalized
1517 // instructions that will actually be used. We shouldn't account for the
1518 // cost of dead instructions since they will be removed.
1519 //
1520 // E.g., An interleaved load of factor 8:
1521 // %vec = load <16 x i64>, <16 x i64>* %ptr
1522 // %v0 = shufflevector %vec, undef, <0, 8>
1523 //
1524 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1525 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1526 // type). The other loads are unused.
1527 //
1528 // TODO: Note that legalization can turn masked loads/stores into unmasked
1529 // (legalized) loads/stores. This can be reflected in the cost.
1530 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1531 // The number of loads of a legal type it will take to represent a load
1532 // of the unlegalized vector type.
1533 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1534
1535 // The number of elements of the unlegalized type that correspond to a
1536 // single legal instruction.
1537 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1538
1539 // Determine which legal instructions will be used.
1540 BitVector UsedInsts(NumLegalInsts, false);
1541 for (unsigned Index : Indices)
1542 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1543 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1544
1545 // Scale the cost of the load by the fraction of legal instructions that
1546 // will be used.
1547 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1548 }
1549
1550 // Then plus the cost of interleave operation.
1551 assert(Indices.size() <= Factor &&
1552 "Interleaved memory op has too many members");
1553
1554 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1555 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1556
1557 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1558 for (unsigned Index : Indices) {
1559 assert(Index < Factor && "Invalid index for interleaved memory op");
1560 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1561 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1562 }
1563
1564 if (Opcode == Instruction::Load) {
1565 // The interleave cost is similar to extracting the sub-vectors' elements
1566 // from the wide vector and inserting them into the sub-vectors.
1567 //
1568 // E.g. An interleaved load of factor 2 (with one member of index 0):
1569 // %vec = load <8 x i32>, <8 x i32>* %ptr
1570 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1571 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1572 // <8 x i32> vector and insert them into a <4 x i32> vector.
1573 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1574 SubVT, DemandedAllSubElts,
1575 /*Insert*/ true, /*Extract*/ false, CostKind);
1576 Cost += Indices.size() * InsSubCost;
1577 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1578 /*Insert*/ false,
1579 /*Extract*/ true, CostKind);
1580 } else {
1581 // The interleave cost is that of extracting the elements from the
1582 // sub-vectors and inserting them into the wide vector.
1583 //
1584 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1585 // (using VF=4):
1586 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1587 // %gaps.mask = <true, true, false, true, true, false,
1588 // true, true, false, true, true, false>
1589 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1590 // i32 Align, <12 x i1> %gaps.mask
1591 // The cost is estimated as extract all elements (of actual members,
1592 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1593 // i32> vector.
1594 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1595 SubVT, DemandedAllSubElts,
1596 /*Insert*/ false, /*Extract*/ true, CostKind);
1597 Cost += ExtSubCost * Indices.size();
1598 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1599 /*Insert*/ true,
1600 /*Extract*/ false, CostKind);
1601 }
1602
1603 if (!UseMaskForCond)
1604 return Cost;
1605
1606 Type *I8Type = Type::getInt8Ty(VT->getContext());
1607
1608 Cost += thisT()->getReplicationShuffleCost(
1609 I8Type, Factor, NumSubElts,
1610 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1611 CostKind);
1612
1613 // The Gaps mask is invariant and created outside the loop, therefore the
1614 // cost of creating it is not accounted for here. However if we have both
1615 // a MaskForGaps and some other mask that guards the execution of the
1616 // memory access, we need to account for the cost of And-ing the two masks
1617 // inside the loop.
1618 if (UseMaskForGaps) {
1619 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1620 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1621 CostKind);
1622 }
1623
1624 return Cost;
1625 }
1626
1627 /// Get intrinsic cost based on arguments.
1630 // Check for generically free intrinsics.
1632 return 0;
1633
1634 // Assume that target intrinsics are cheap.
1635 Intrinsic::ID IID = ICA.getID();
1638
1639 // VP Intrinsics should have the same cost as their non-vp counterpart.
1640 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1641 // counterpart when the vector length argument is smaller than the maximum
1642 // vector length.
1643 // TODO: Support other kinds of VPIntrinsics
1644 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1645 std::optional<unsigned> FOp =
1647 if (FOp) {
1648 if (ICA.getID() == Intrinsic::vp_load) {
1649 Align Alignment;
1650 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1651 Alignment = VPI->getPointerAlignment().valueOrOne();
1652 unsigned AS = 0;
1653 if (ICA.getArgTypes().size() > 1)
1654 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1655 AS = PtrTy->getAddressSpace();
1656 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1657 AS, CostKind);
1658 }
1659 if (ICA.getID() == Intrinsic::vp_store) {
1660 Align Alignment;
1661 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1662 Alignment = VPI->getPointerAlignment().valueOrOne();
1663 unsigned AS = 0;
1664 if (ICA.getArgTypes().size() >= 2)
1665 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1666 AS = PtrTy->getAddressSpace();
1667 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1668 AS, CostKind);
1669 }
1671 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1672 CostKind);
1673 }
1674 if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1675 return thisT()->getCastInstrCost(
1676 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1678 }
1679 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1680 // We can only handle vp_cmp intrinsics with underlying instructions.
1681 if (ICA.getInst()) {
1682 assert(FOp);
1683 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1684 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1685 ICA.getReturnType(),
1686 UI->getPredicate(), CostKind);
1687 }
1688 }
1689 }
1690
1691 std::optional<Intrinsic::ID> FID =
1693 if (FID) {
1694 // Non-vp version will have same arg types except mask and vector
1695 // length.
1696 assert(ICA.getArgTypes().size() >= 2 &&
1697 "Expected VPIntrinsic to have Mask and Vector Length args and "
1698 "types");
1700
1701 // VPReduction intrinsics have a start value argument that their non-vp
1702 // counterparts do not have, except for the fadd and fmul non-vp
1703 // counterpart.
1705 *FID != Intrinsic::vector_reduce_fadd &&
1706 *FID != Intrinsic::vector_reduce_fmul)
1707 NewTys = NewTys.drop_front();
1708
1709 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1710 ICA.getFlags());
1711 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1712 }
1713 }
1714
1715 if (ICA.isTypeBasedOnly())
1717
1718 Type *RetTy = ICA.getReturnType();
1719
1720 ElementCount RetVF =
1721 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1723 const IntrinsicInst *I = ICA.getInst();
1724 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1725 FastMathFlags FMF = ICA.getFlags();
1726 switch (IID) {
1727 default:
1728 break;
1729
1730 case Intrinsic::powi:
1731 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1732 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1733 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1734 ShouldOptForSize)) {
1735 // The cost is modeled on the expansion performed by ExpandPowI in
1736 // SelectionDAGBuilder.
1737 APInt Exponent = RHSC->getValue().abs();
1738 unsigned ActiveBits = Exponent.getActiveBits();
1739 unsigned PopCount = Exponent.popcount();
1740 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1741 thisT()->getArithmeticInstrCost(
1742 Instruction::FMul, RetTy, CostKind);
1743 if (RHSC->isNegative())
1744 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1745 CostKind);
1746 return Cost;
1747 }
1748 }
1749 break;
1750 case Intrinsic::cttz:
1751 // FIXME: If necessary, this should go in target-specific overrides.
1752 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1754 break;
1755
1756 case Intrinsic::ctlz:
1757 // FIXME: If necessary, this should go in target-specific overrides.
1758 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1760 break;
1761
1762 case Intrinsic::memcpy:
1763 return thisT()->getMemcpyCost(ICA.getInst());
1764
1765 case Intrinsic::masked_scatter: {
1766 const Value *Mask = Args[3];
1767 bool VarMask = !isa<Constant>(Mask);
1768 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1769 return thisT()->getGatherScatterOpCost(Instruction::Store,
1770 ICA.getArgTypes()[0], Args[1],
1771 VarMask, Alignment, CostKind, I);
1772 }
1773 case Intrinsic::masked_gather: {
1774 const Value *Mask = Args[2];
1775 bool VarMask = !isa<Constant>(Mask);
1776 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1777 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1778 VarMask, Alignment, CostKind, I);
1779 }
1780 case Intrinsic::experimental_vp_strided_store: {
1781 const Value *Data = Args[0];
1782 const Value *Ptr = Args[1];
1783 const Value *Mask = Args[3];
1784 const Value *EVL = Args[4];
1785 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1786 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1787 Align Alignment =
1788 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1789 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1790 Data->getType(), Ptr, VarMask,
1791 Alignment, CostKind, I);
1792 }
1793 case Intrinsic::experimental_vp_strided_load: {
1794 const Value *Ptr = Args[0];
1795 const Value *Mask = Args[2];
1796 const Value *EVL = Args[3];
1797 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1798 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1799 Align Alignment =
1800 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1801 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1802 VarMask, Alignment, CostKind, I);
1803 }
1804 case Intrinsic::stepvector: {
1805 if (isa<ScalableVectorType>(RetTy))
1807 // The cost of materialising a constant integer vector.
1809 }
1810 case Intrinsic::vector_extract: {
1811 // FIXME: Handle case where a scalable vector is extracted from a scalable
1812 // vector
1813 if (isa<ScalableVectorType>(RetTy))
1815 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1816 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1817 cast<VectorType>(Args[0]->getType()), {},
1818 CostKind, Index, cast<VectorType>(RetTy));
1819 }
1820 case Intrinsic::vector_insert: {
1821 // FIXME: Handle case where a scalable vector is inserted into a scalable
1822 // vector
1823 if (isa<ScalableVectorType>(Args[1]->getType()))
1825 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1826 return thisT()->getShuffleCost(
1827 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
1828 CostKind, Index, cast<VectorType>(Args[1]->getType()));
1829 }
1830 case Intrinsic::vector_reverse: {
1831 return thisT()->getShuffleCost(TTI::SK_Reverse,
1832 cast<VectorType>(Args[0]->getType()), {},
1833 CostKind, 0, cast<VectorType>(RetTy));
1834 }
1835 case Intrinsic::vector_splice: {
1836 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1837 return thisT()->getShuffleCost(TTI::SK_Splice,
1838 cast<VectorType>(Args[0]->getType()), {},
1839 CostKind, Index, cast<VectorType>(RetTy));
1840 }
1841 case Intrinsic::vector_reduce_add:
1842 case Intrinsic::vector_reduce_mul:
1843 case Intrinsic::vector_reduce_and:
1844 case Intrinsic::vector_reduce_or:
1845 case Intrinsic::vector_reduce_xor:
1846 case Intrinsic::vector_reduce_smax:
1847 case Intrinsic::vector_reduce_smin:
1848 case Intrinsic::vector_reduce_fmax:
1849 case Intrinsic::vector_reduce_fmin:
1850 case Intrinsic::vector_reduce_fmaximum:
1851 case Intrinsic::vector_reduce_fminimum:
1852 case Intrinsic::vector_reduce_umax:
1853 case Intrinsic::vector_reduce_umin: {
1854 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1856 }
1857 case Intrinsic::vector_reduce_fadd:
1858 case Intrinsic::vector_reduce_fmul: {
1860 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1862 }
1863 case Intrinsic::fshl:
1864 case Intrinsic::fshr: {
1865 const Value *X = Args[0];
1866 const Value *Y = Args[1];
1867 const Value *Z = Args[2];
1870 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1871 const TTI::OperandValueInfo OpInfoBW =
1873 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1874 : TTI::OP_None};
1875
1876 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1877 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
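 // As a rough tally of the terms added below: a funnel shift is costed as
 // or + sub + shl + lshr, plus a urem when the shift amount Z is not a
 // constant, plus an icmp and a select to guard shift-by-zero when this is
 // not a rotate (X != Y); rotates drop the compare and select.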
1879 Cost +=
1880 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1881 Cost +=
1882 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1883 Cost += thisT()->getArithmeticInstrCost(
1884 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1885 {OpInfoZ.Kind, TTI::OP_None});
1886 Cost += thisT()->getArithmeticInstrCost(
1887 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1888 {OpInfoZ.Kind, TTI::OP_None});
1889 // Non-constant shift amounts require a modulo.
1890 if (!OpInfoZ.isConstant())
1891 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1892 CostKind, OpInfoZ, OpInfoBW);
1893 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1894 if (X != Y) {
1895 Type *CondTy = RetTy->getWithNewBitWidth(1);
1896 Cost +=
1897 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1899 Cost +=
1900 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1902 }
1903 return Cost;
1904 }
1905 case Intrinsic::get_active_lane_mask: {
1906 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1907 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1908
1909 // If we're not expanding the intrinsic then we assume this is cheap
1910 // to implement.
1911 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1912 return getTypeLegalizationCost(RetTy).first;
1913 }
1914
1915 // Create the expanded types that will be used to calculate the uadd_sat
1916 // operation.
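 // Roughly, the expansion being costed is (illustrative operand names, with
 // %base and %tc the two scalar arguments):
 //   %lin = uadd.sat(splat(%base), stepvector)   ; <N x iBW>
 //   %res = icmp ult %lin, splat(%tc)            ; <N x i1>
 // i.e. one uadd_sat on the widened vector type plus one compare.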
1917 Type *ExpRetTy = VectorType::get(
1918 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1919 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1921 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1922 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1924 return Cost;
1925 }
1926 case Intrinsic::experimental_cttz_elts: {
1927 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1928
1929 // If we're not expanding the intrinsic then we assume this is cheap
1930 // to implement.
1931 if (!getTLI()->shouldExpandCttzElements(ArgType))
1932 return getTypeLegalizationCost(RetTy).first;
1933
1934 // TODO: The costs below reflect the expansion code in
1935 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1936 // favour of compile time.
1937
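 // Roughly, the sequence whose pieces are costed below is (illustrative
 // only, with %m the source mask and N its element count):
 //   %step = stepvector                       ; <N x iK> = <0, 1, 2, ...>
 //   %val  = sub (splat N), %step             ; lane i holds N - i
 //   %ext  = sext <N x i1> %m to <N x iK>
 //   %sel  = and %ext, %val                   ; keep N - i where %m is set
 //   %max  = vector.reduce.umax(%sel)
 //   %res  = sub N, %max                      ; first set index, or N if none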
1938 // Find the smallest "sensible" element type to use for the expansion.
1939 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1940 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1941 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1942 VScaleRange = getVScaleRange(I->getCaller(), 64);
1943
1944 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1945 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1946 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1947
1948 // Create the new vector type & get the vector length
1949 Type *NewVecTy = VectorType::get(
1950 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1951
1952 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
1953 FMF);
1955 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1956
1957 Cost +=
1958 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1959 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1960 Args[0]->getType(),
1962 Cost +=
1963 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1964
1965 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1966 NewEltTy, NewVecTy, FMF, I, 1);
1967 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1968 Cost +=
1969 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1970
1971 return Cost;
1972 }
1973 case Intrinsic::experimental_vector_match:
1974 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1975 }
1976
1977 // Assume that we need to scalarize this intrinsic.
1978 // Compute the scalarization overhead based on Args for a vector
1979 // intrinsic.
1980 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1981 if (RetVF.isVector() && !RetVF.isScalable()) {
1982 ScalarizationCost = 0;
1983 if (!RetTy->isVoidTy())
1984 ScalarizationCost += getScalarizationOverhead(
1985 cast<VectorType>(RetTy),
1986 /*Insert*/ true, /*Extract*/ false, CostKind);
1987 ScalarizationCost +=
1989 }
1990
1991 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1992 ScalarizationCost);
1993 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1994 }
1995
1996 /// Get intrinsic cost based on argument types.
1997 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1998 /// cost of scalarizing the arguments and the return value will be computed
1999 /// based on types.
2003 Intrinsic::ID IID = ICA.getID();
2004 Type *RetTy = ICA.getReturnType();
2005 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2006 FastMathFlags FMF = ICA.getFlags();
2007 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2008 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2009
2010 VectorType *VecOpTy = nullptr;
2011 if (!Tys.empty()) {
2012 // The vector reduction operand is operand 0 except for fadd/fmul.
2013 // Their operand 0 is a scalar start value, so the vector op is operand 1.
2014 unsigned VecTyIndex = 0;
2015 if (IID == Intrinsic::vector_reduce_fadd ||
2016 IID == Intrinsic::vector_reduce_fmul)
2017 VecTyIndex = 1;
2018 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2019 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2020 }
2021
2022 // Library call cost - other than size, make it expensive.
2023 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2024 unsigned ISD = 0;
2025 switch (IID) {
2026 default: {
2027 // Scalable vectors cannot be scalarized, so return Invalid.
2028 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2029 return isa<ScalableVectorType>(Ty);
2030 }))
2032
2033 // Assume that we need to scalarize this intrinsic.
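 // For example, a hypothetical intrinsic returning <4 x float> with one
 // <4 x float> operand is modeled as inserting the four results, extracting
 // the four operand lanes, and paying four copies of the scalar intrinsic's
 // cost (the ScalarCalls * ScalarCost term computed below).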
2034 InstructionCost ScalarizationCost =
2035 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2036 unsigned ScalarCalls = 1;
2037 Type *ScalarRetTy = RetTy;
2038 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2039 if (!SkipScalarizationCost)
2040 ScalarizationCost = getScalarizationOverhead(
2041 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2042 ScalarCalls = std::max(ScalarCalls,
2043 cast<FixedVectorType>(RetVTy)->getNumElements());
2044 ScalarRetTy = RetTy->getScalarType();
2045 }
2046 SmallVector<Type *, 4> ScalarTys;
2047 for (Type *Ty : Tys) {
2048 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2049 if (!SkipScalarizationCost)
2050 ScalarizationCost += getScalarizationOverhead(
2051 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2052 ScalarCalls = std::max(ScalarCalls,
2053 cast<FixedVectorType>(VTy)->getNumElements());
2054 Ty = Ty->getScalarType();
2055 }
2056 ScalarTys.push_back(Ty);
2057 }
2058 if (ScalarCalls == 1)
2059 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2060
2061 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2062 InstructionCost ScalarCost =
2063 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2064
2065 return ScalarCalls * ScalarCost + ScalarizationCost;
2066 }
2067 // Look for intrinsics that can be lowered directly or turned into a scalar
2068 // intrinsic call.
2069 case Intrinsic::sqrt:
2070 ISD = ISD::FSQRT;
2071 break;
2072 case Intrinsic::sin:
2073 ISD = ISD::FSIN;
2074 break;
2075 case Intrinsic::cos:
2076 ISD = ISD::FCOS;
2077 break;
2078 case Intrinsic::sincos:
2079 ISD = ISD::FSINCOS;
2080 break;
2081 case Intrinsic::tan:
2082 ISD = ISD::FTAN;
2083 break;
2084 case Intrinsic::asin:
2085 ISD = ISD::FASIN;
2086 break;
2087 case Intrinsic::acos:
2088 ISD = ISD::FACOS;
2089 break;
2090 case Intrinsic::atan:
2091 ISD = ISD::FATAN;
2092 break;
2093 case Intrinsic::atan2:
2094 ISD = ISD::FATAN2;
2095 break;
2096 case Intrinsic::sinh:
2097 ISD = ISD::FSINH;
2098 break;
2099 case Intrinsic::cosh:
2100 ISD = ISD::FCOSH;
2101 break;
2102 case Intrinsic::tanh:
2103 ISD = ISD::FTANH;
2104 break;
2105 case Intrinsic::exp:
2106 ISD = ISD::FEXP;
2107 break;
2108 case Intrinsic::exp2:
2109 ISD = ISD::FEXP2;
2110 break;
2111 case Intrinsic::exp10:
2112 ISD = ISD::FEXP10;
2113 break;
2114 case Intrinsic::log:
2115 ISD = ISD::FLOG;
2116 break;
2117 case Intrinsic::log10:
2118 ISD = ISD::FLOG10;
2119 break;
2120 case Intrinsic::log2:
2121 ISD = ISD::FLOG2;
2122 break;
2123 case Intrinsic::fabs:
2124 ISD = ISD::FABS;
2125 break;
2126 case Intrinsic::canonicalize:
2127 ISD = ISD::FCANONICALIZE;
2128 break;
2129 case Intrinsic::minnum:
2130 ISD = ISD::FMINNUM;
2131 break;
2132 case Intrinsic::maxnum:
2133 ISD = ISD::FMAXNUM;
2134 break;
2135 case Intrinsic::minimum:
2136 ISD = ISD::FMINIMUM;
2137 break;
2138 case Intrinsic::maximum:
2139 ISD = ISD::FMAXIMUM;
2140 break;
2141 case Intrinsic::minimumnum:
2142 ISD = ISD::FMINIMUMNUM;
2143 break;
2144 case Intrinsic::maximumnum:
2145 ISD = ISD::FMAXIMUMNUM;
2146 break;
2147 case Intrinsic::copysign:
2148 ISD = ISD::FCOPYSIGN;
2149 break;
2150 case Intrinsic::floor:
2151 ISD = ISD::FFLOOR;
2152 break;
2153 case Intrinsic::ceil:
2154 ISD = ISD::FCEIL;
2155 break;
2156 case Intrinsic::trunc:
2157 ISD = ISD::FTRUNC;
2158 break;
2159 case Intrinsic::nearbyint:
2160 ISD = ISD::FNEARBYINT;
2161 break;
2162 case Intrinsic::rint:
2163 ISD = ISD::FRINT;
2164 break;
2165 case Intrinsic::lrint:
2166 ISD = ISD::LRINT;
2167 break;
2168 case Intrinsic::llrint:
2169 ISD = ISD::LLRINT;
2170 break;
2171 case Intrinsic::round:
2172 ISD = ISD::FROUND;
2173 break;
2174 case Intrinsic::roundeven:
2175 ISD = ISD::FROUNDEVEN;
2176 break;
2177 case Intrinsic::pow:
2178 ISD = ISD::FPOW;
2179 break;
2180 case Intrinsic::fma:
2181 ISD = ISD::FMA;
2182 break;
2183 case Intrinsic::fmuladd:
2184 ISD = ISD::FMA;
2185 break;
2186 case Intrinsic::experimental_constrained_fmuladd:
2187 ISD = ISD::STRICT_FMA;
2188 break;
2189 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2190 case Intrinsic::lifetime_start:
2191 case Intrinsic::lifetime_end:
2192 case Intrinsic::sideeffect:
2193 case Intrinsic::pseudoprobe:
2194 case Intrinsic::arithmetic_fence:
2195 return 0;
2196 case Intrinsic::masked_store: {
2197 Type *Ty = Tys[0];
2198 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2199 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2200 CostKind);
2201 }
2202 case Intrinsic::masked_load: {
2203 Type *Ty = RetTy;
2204 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2205 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2206 CostKind);
2207 }
2208 case Intrinsic::vector_reduce_add:
2209 case Intrinsic::vector_reduce_mul:
2210 case Intrinsic::vector_reduce_and:
2211 case Intrinsic::vector_reduce_or:
2212 case Intrinsic::vector_reduce_xor:
2213 return thisT()->getArithmeticReductionCost(
2214 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2215 CostKind);
2216 case Intrinsic::vector_reduce_fadd:
2217 case Intrinsic::vector_reduce_fmul:
2218 return thisT()->getArithmeticReductionCost(
2219 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2220 case Intrinsic::vector_reduce_smax:
2221 case Intrinsic::vector_reduce_smin:
2222 case Intrinsic::vector_reduce_umax:
2223 case Intrinsic::vector_reduce_umin:
2224 case Intrinsic::vector_reduce_fmax:
2225 case Intrinsic::vector_reduce_fmin:
2226 case Intrinsic::vector_reduce_fmaximum:
2227 case Intrinsic::vector_reduce_fminimum:
2228 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2229 VecOpTy, ICA.getFlags(), CostKind);
2230 case Intrinsic::experimental_vector_match: {
2231 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2232 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2233 unsigned SearchSize = NeedleTy->getNumElements();
2234
2235 // If we're not expanding the intrinsic then we assume this is cheap to
2236 // implement.
2237 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2238 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2239 return getTypeLegalizationCost(RetTy).first;
2240
2241 // Approximate the cost based on the expansion code in
2242 // SelectionDAGBuilder.
2244 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2245 CostKind, 1, nullptr, nullptr);
2246 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2247 CostKind, 0, nullptr, nullptr);
2248 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
2249 CostKind, 0, nullptr);
2250 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2252 Cost +=
2253 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2254 Cost *= SearchSize;
2255 Cost +=
2256 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2257 return Cost;
2258 }
2259 case Intrinsic::abs:
2260 ISD = ISD::ABS;
2261 break;
2262 case Intrinsic::smax:
2263 ISD = ISD::SMAX;
2264 break;
2265 case Intrinsic::smin:
2266 ISD = ISD::SMIN;
2267 break;
2268 case Intrinsic::umax:
2269 ISD = ISD::UMAX;
2270 break;
2271 case Intrinsic::umin:
2272 ISD = ISD::UMIN;
2273 break;
2274 case Intrinsic::sadd_sat:
2275 ISD = ISD::SADDSAT;
2276 break;
2277 case Intrinsic::ssub_sat:
2278 ISD = ISD::SSUBSAT;
2279 break;
2280 case Intrinsic::uadd_sat:
2281 ISD = ISD::UADDSAT;
2282 break;
2283 case Intrinsic::usub_sat:
2284 ISD = ISD::USUBSAT;
2285 break;
2286 case Intrinsic::smul_fix:
2287 ISD = ISD::SMULFIX;
2288 break;
2289 case Intrinsic::umul_fix:
2290 ISD = ISD::UMULFIX;
2291 break;
2292 case Intrinsic::sadd_with_overflow:
2293 ISD = ISD::SADDO;
2294 break;
2295 case Intrinsic::ssub_with_overflow:
2296 ISD = ISD::SSUBO;
2297 break;
2298 case Intrinsic::uadd_with_overflow:
2299 ISD = ISD::UADDO;
2300 break;
2301 case Intrinsic::usub_with_overflow:
2302 ISD = ISD::USUBO;
2303 break;
2304 case Intrinsic::smul_with_overflow:
2305 ISD = ISD::SMULO;
2306 break;
2307 case Intrinsic::umul_with_overflow:
2308 ISD = ISD::UMULO;
2309 break;
2310 case Intrinsic::fptosi_sat:
2311 ISD = ISD::FP_TO_SINT_SAT;
2312 break;
2313 case Intrinsic::fptoui_sat:
2314 ISD = ISD::FP_TO_UINT_SAT;
2315 break;
2316 case Intrinsic::ctpop:
2317 ISD = ISD::CTPOP;
2318 // In case of legalization use TCC_Expensive. This is cheaper than a
2319 // library call but still not a cheap instruction.
2320 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2321 break;
2322 case Intrinsic::ctlz:
2323 ISD = ISD::CTLZ;
2324 break;
2325 case Intrinsic::cttz:
2326 ISD = ISD::CTTZ;
2327 break;
2328 case Intrinsic::bswap:
2329 ISD = ISD::BSWAP;
2330 break;
2331 case Intrinsic::bitreverse:
2332 ISD = ISD::BITREVERSE;
2333 break;
2334 case Intrinsic::ucmp:
2335 ISD = ISD::UCMP;
2336 break;
2337 case Intrinsic::scmp:
2338 ISD = ISD::SCMP;
2339 break;
2340 }
2341
2342 auto *ST = dyn_cast<StructType>(RetTy);
2343 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2344 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2345
2346 const TargetLoweringBase *TLI = getTLI();
2347
2348 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2349 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2350 TLI->isFAbsFree(LT.second)) {
2351 return 0;
2352 }
2353
2354 // The operation is legal. Assume it costs 1.
2355 // If the type is split to multiple registers, assume that there is some
2356 // overhead to this.
2357 // TODO: Once we have extract/insert subvector cost we need to use them.
2358 if (LT.first > 1)
2359 return (LT.first * 2);
2360 else
2361 return (LT.first * 1);
2362 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2363 // If the operation is custom lowered then assume
2364 // that the code is twice as expensive.
2365 return (LT.first * 2);
2366 }
2367
2368 switch (IID) {
2369 case Intrinsic::fmuladd: {
2370 // If we can't lower fmuladd into an FMA, estimate the cost as a
2371 // floating-point mul followed by an add.
2372
2373 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2374 CostKind) +
2375 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2376 CostKind);
2377 }
2378 case Intrinsic::experimental_constrained_fmuladd: {
2379 IntrinsicCostAttributes FMulAttrs(
2380 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2381 IntrinsicCostAttributes FAddAttrs(
2382 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2383 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2384 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2385 }
2386 case Intrinsic::smin:
2387 case Intrinsic::smax:
2388 case Intrinsic::umin:
2389 case Intrinsic::umax: {
2390 // minmax(X,Y) = select(icmp(X,Y),X,Y)
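 // e.g. smax on i32 (illustrative IR):
 //   %c = icmp sgt i32 %x, %y
 //   %r = select i1 %c, i32 %x, i32 %y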
2391 Type *CondTy = RetTy->getWithNewBitWidth(1);
2392 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2393 CmpInst::Predicate Pred =
2394 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2396 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2397 Pred, CostKind);
2398 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2399 Pred, CostKind);
2400 return Cost;
2401 }
2402 case Intrinsic::sadd_with_overflow:
2403 case Intrinsic::ssub_with_overflow: {
2404 Type *SumTy = RetTy->getContainedType(0);
2405 Type *OverflowTy = RetTy->getContainedType(1);
2406 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2407 ? BinaryOperator::Add
2408 : BinaryOperator::Sub;
2409
2410 // Add:
2411 // Overflow -> (Result < LHS) ^ (RHS < 0)
2412 // Sub:
2413 // Overflow -> (Result < LHS) ^ (RHS > 0)
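 // e.g. for {i32, i1} llvm.sadd.with.overflow(i32 %a, i32 %b) this models
 // one add, two compares and one xor (illustrative IR):
 //   %sum = add i32 %a, %b
 //   %c0  = icmp slt i32 %sum, %a
 //   %c1  = icmp slt i32 %b, 0
 //   %ov  = xor i1 %c0, %c1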
2415 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2416 Cost +=
2417 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2419 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2420 CostKind);
2421 return Cost;
2422 }
2423 case Intrinsic::uadd_with_overflow:
2424 case Intrinsic::usub_with_overflow: {
2425 Type *SumTy = RetTy->getContainedType(0);
2426 Type *OverflowTy = RetTy->getContainedType(1);
2427 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2428 ? BinaryOperator::Add
2429 : BinaryOperator::Sub;
2430 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2433
2435 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2436 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2437 OverflowTy, Pred, CostKind);
2438 return Cost;
2439 }
2440 case Intrinsic::smul_with_overflow:
2441 case Intrinsic::umul_with_overflow: {
2442 Type *MulTy = RetTy->getContainedType(0);
2443 Type *OverflowTy = RetTy->getContainedType(1);
2444 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2445 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2446 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2447
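 // The expansion costed here widens to 2*BW bits, multiplies, and checks
 // that the high half is consistent with the low half, roughly (unsigned
 // i32 shown, illustrative IR):
 //   %aw = zext i32 %a to i64               ; likewise %bw
 //   %m  = mul i64 %aw, %bw
 //   %lo = trunc i64 %m to i32
 //   %hi = trunc i64 (lshr i64 %m, 32) to i32
 //   %ov = icmp ne i32 %hi, 0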
2448 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2450
2452 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2453 Cost +=
2454 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2455 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2456 CCH, CostKind);
2457 Cost += thisT()->getArithmeticInstrCost(
2458 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2460
2461 if (IsSigned)
2462 Cost += thisT()->getArithmeticInstrCost(
2463 Instruction::AShr, MulTy, CostKind,
2466
2467 Cost += thisT()->getCmpSelInstrCost(
2468 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2469 return Cost;
2470 }
2471 case Intrinsic::sadd_sat:
2472 case Intrinsic::ssub_sat: {
2473 // Assume a default expansion.
2474 Type *CondTy = RetTy->getWithNewBitWidth(1);
2475
2476 Type *OpTy = StructType::create({RetTy, CondTy});
2477 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2478 ? Intrinsic::sadd_with_overflow
2479 : Intrinsic::ssub_with_overflow;
2481
2482 // SatMax -> Overflow && SumDiff < 0
2483 // SatMin -> Overflow && SumDiff >= 0
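 // Roughly (illustrative IR for sadd.sat on i32): the overflow intrinsic,
 // one compare and two selects:
 //   %t   = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
 //   %neg = icmp slt i32 %sum, 0            ; %sum = wrapped result of %t
 //   %sat = select i1 %neg, i32 2147483647, i32 -2147483648
 //   %res = select i1 %ov, i32 %sat, i32 %sum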
2485 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2486 nullptr, ScalarizationCostPassed);
2487 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2488 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2489 Pred, CostKind);
2490 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2491 CondTy, Pred, CostKind);
2492 return Cost;
2493 }
2494 case Intrinsic::uadd_sat:
2495 case Intrinsic::usub_sat: {
2496 Type *CondTy = RetTy->getWithNewBitWidth(1);
2497
2498 Type *OpTy = StructType::create({RetTy, CondTy});
2499 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2500 ? Intrinsic::uadd_with_overflow
2501 : Intrinsic::usub_with_overflow;
2502
2504 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2505 nullptr, ScalarizationCostPassed);
2506 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2507 Cost +=
2508 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2510 return Cost;
2511 }
2512 case Intrinsic::smul_fix:
2513 case Intrinsic::umul_fix: {
2514 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2515 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2516
2517 unsigned ExtOp =
2518 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2520
2522 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2523 Cost +=
2524 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2525 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2526 CCH, CostKind);
2527 Cost += thisT()->getArithmeticInstrCost(
2528 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2530 Cost += thisT()->getArithmeticInstrCost(
2531 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2533 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2534 return Cost;
2535 }
2536 case Intrinsic::abs: {
2537 // abs(X) = select(icmp(X,0),X,sub(0,X))
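 // e.g. on i32 (illustrative IR):
 //   %neg = sub i32 0, %x
 //   %cmp = icmp slt i32 %x, 0
 //   %res = select i1 %cmp, i32 %neg, i32 %x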
2538 Type *CondTy = RetTy->getWithNewBitWidth(1);
2541 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2542 Pred, CostKind);
2543 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2544 Pred, CostKind);
2545 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2546 Cost += thisT()->getArithmeticInstrCost(
2547 BinaryOperator::Sub, RetTy, CostKind,
2549 return Cost;
2550 }
2551 case Intrinsic::fptosi_sat:
2552 case Intrinsic::fptoui_sat: {
2553 if (Tys.empty())
2554 break;
2555 Type *FromTy = Tys[0];
2556 bool IsSigned = IID == Intrinsic::fptosi_sat;
2557
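 // Roughly, the saturating conversion is costed as clamping the input to
 // the destination's representable range, converting, and (for the signed
 // form) selecting zero on NaN; MIN/MAX below stand for those bounds
 // (illustrative IR):
 //   %lo  = maxnum(%x, MIN)
 //   %hi  = minnum(%lo, MAX)
 //   %i   = fptosi %hi to iN
 //   %res = select (fcmp uno %x, %x), 0, %i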
2559 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2560 {FromTy, FromTy});
2561 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2562 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2563 {FromTy, FromTy});
2564 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2565 Cost += thisT()->getCastInstrCost(
2566 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2568 if (IsSigned) {
2569 Type *CondTy = RetTy->getWithNewBitWidth(1);
2570 Cost += thisT()->getCmpSelInstrCost(
2571 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2572 Cost += thisT()->getCmpSelInstrCost(
2573 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2574 }
2575 return Cost;
2576 }
2577 case Intrinsic::ucmp:
2578 case Intrinsic::scmp: {
2579 Type *CmpTy = Tys[0];
2580 Type *CondTy = RetTy->getWithNewBitWidth(1);
2582 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2584 CostKind) +
2585 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2587 CostKind);
2588
2589 EVT VT = TLI->getValueType(DL, CmpTy, true);
2590 if (TLI->shouldExpandCmpUsingSelects(VT)) {
2591 // x < y ? -1 : (x > y ? 1 : 0)
2592 Cost += 2 * thisT()->getCmpSelInstrCost(
2593 BinaryOperator::Select, RetTy, CondTy,
2595 } else {
2596 // zext(x > y) - zext(x < y)
2597 Cost +=
2598 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2600 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2601 CostKind);
2602 }
2603 return Cost;
2604 }
2605 default:
2606 break;
2607 }
2608
2609 // Else, assume that we need to scalarize this intrinsic. For math builtins
2610 // this will emit a costly libcall, adding call overhead and spills. Make it
2611 // very expensive.
2612 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2613 // Scalable vectors cannot be scalarized, so return Invalid.
2614 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2615 return isa<ScalableVectorType>(Ty);
2616 }))
2618
2619 InstructionCost ScalarizationCost =
2620 SkipScalarizationCost
2621 ? ScalarizationCostPassed
2622 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2623 /*Extract*/ false, CostKind);
2624
2625 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2626 SmallVector<Type *, 4> ScalarTys;
2627 for (Type *Ty : Tys) {
2628 if (Ty->isVectorTy())
2629 Ty = Ty->getScalarType();
2630 ScalarTys.push_back(Ty);
2631 }
2632 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2633 InstructionCost ScalarCost =
2634 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2635 for (Type *Ty : Tys) {
2636 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2637 if (!ICA.skipScalarizationCost())
2638 ScalarizationCost += getScalarizationOverhead(
2639 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2640 ScalarCalls = std::max(ScalarCalls,
2641 cast<FixedVectorType>(VTy)->getNumElements());
2642 }
2643 }
2644 return ScalarCalls * ScalarCost + ScalarizationCost;
2645 }
2646
2647 // This is going to be turned into a library call, so make it expensive.
2648 return SingleCallCost;
2649 }
2650
2651 /// Compute a cost of the given call instruction.
2652 ///
2653 /// Compute the cost of calling function F with return type RetTy and
2654 /// argument types Tys. F might be nullptr; in this case the cost of an
2655 /// arbitrary call with the specified signature will be returned.
2656 /// This is used, for instance, when we estimate the cost of calling a
2657 /// vector counterpart of the given function.
2658 /// \param F Called function, might be nullptr.
2659 /// \param RetTy Return value types.
2660 /// \param Tys Argument types.
2661 /// \returns The cost of Call instruction.
2663 ArrayRef<Type *> Tys,
2665 return 10;
2666 }
2667
2668 unsigned getNumberOfParts(Type *Tp) {
2669 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2670 if (!LT.first.isValid())
2671 return 0;
2672 // Try to find actual number of parts for non-power-of-2 elements as
2673 // ceil(num-of-elements/num-of-subtype-elements).
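 // For example, on a target whose widest legal vector is v4i32
 // (illustrative), a <12 x i32> legalizes by widening to <16 x i32> and
 // reports 4 parts, whereas ceil(12 / 4) = 3 matches the registers
 // actually occupied.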
2674 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2675 FTp && LT.second.isFixedLengthVector() &&
2676 !has_single_bit(FTp->getNumElements())) {
2677 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2678 EVT(LT.second).getTypeForEVT(Tp->getContext()));
2679 SubTp && SubTp->getElementType() == FTp->getElementType())
2680 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2681 }
2682 return *LT.first.getValue();
2683 }
2684
2686 const SCEV *) {
2687 return 0;
2688 }
2689
2690 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2691 /// We're assuming that reduction operations are performed in the following way:
2692 ///
2693 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2694 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2695 /// \----------------v-------------/ \----------v------------/
2696 /// n/2 elements n/2 elements
2697 /// %red1 = op <n x t> %val, <n x t> val1
2698 /// After this operation we have a vector %red1 where only the first n/2
2699 /// elements are meaningful, the second n/2 elements are undefined and can be
2700 /// dropped. All other operations are actually working with the vector of
2701 /// length n/2, not n, though the real vector length is still n.
2702 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2703 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2704 /// \----------------v-------------/ \----------v------------/
2705 /// n/4 elements 3*n/4 elements
2706 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2707 /// length n/2, the resulting vector has length n/4 etc.
2708 ///
2709 /// The cost model should take into account that the actual length of the
2710 /// vector is reduced on each iteration.
2713 // Targets must implement a default value for the scalable case, since
2714 // we don't know how many lanes the vector has.
2715 if (isa<ScalableVectorType>(Ty))
2717
2718 Type *ScalarTy = Ty->getElementType();
2719 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2720 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2721 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2722 NumVecElts >= 2) {
2723 // Or reduction for i1 is represented as:
2724 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2725 // %res = cmp ne iReduxWidth %val, 0
2726 // And reduction for i1 is represented as:
2727 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2728 // %res = cmp eq iReduxWidth %val, -1 (all ones)
2729 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2730 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2732 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2735 }
2736 unsigned NumReduxLevels = Log2_32(NumVecElts);
2737 InstructionCost ArithCost = 0;
2738 InstructionCost ShuffleCost = 0;
2739 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2740 unsigned LongVectorCount = 0;
2741 unsigned MVTLen =
2742 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2743 while (NumVecElts > MVTLen) {
2744 NumVecElts /= 2;
2745 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2746 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2747 CostKind, NumVecElts, SubTy);
2748 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2749 Ty = SubTy;
2750 ++LongVectorCount;
2751 }
2752
2753 NumReduxLevels -= LongVectorCount;
2754
2755 // The minimal length of the vector is limited by the real length of vector
2756 // operations performed on the current platform. That's why several final
2757 // reduction operations are performed on the vectors with the same
2758 // architecture-dependent length.
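 // As a worked example (illustrative target with 4-wide i32 registers):
 // an add reduction of <16 x i32> is costed as two splitting levels (an
 // extract-subvector shuffle plus an add, first at <8 x i32> and then at
 // <4 x i32>), followed by Log2(4) = 2 in-register levels (one permute
 // shuffle plus one add each), and a final extractelement of lane 0.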
2759
2760 // By default reductions need one shuffle per reduction level.
2761 ShuffleCost +=
2762 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2763 {}, CostKind, 0, Ty);
2764 ArithCost +=
2765 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2766 return ShuffleCost + ArithCost +
2767 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2768 CostKind, 0, nullptr, nullptr);
2769 }
2770
2771 /// Try to calculate the cost of performing strict (in-order) reductions,
2772 /// which involves doing a sequence of floating point additions in lane
2773 /// order, starting with an initial value. For example, consider a scalar
2774 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2775 ///
2776 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2777 ///
2778 /// %add1 = %InitVal + %v0
2779 /// %add2 = %add1 + %v1
2780 /// %add3 = %add2 + %v2
2781 /// %add4 = %add3 + %v3
2782 ///
2783 /// As a simple estimate we can say the cost of such a reduction is 4 times
2784 /// the cost of a scalar FP addition. We can only estimate the costs for
2785 /// fixed-width vectors here because for scalable vectors we do not know the
2786 /// runtime number of operations.
2789 // Targets must implement a default value for the scalable case, since
2790 // we don't know how many lanes the vector has.
2791 if (isa<ScalableVectorType>(Ty))
2793
2794 auto *VTy = cast<FixedVectorType>(Ty);
2796 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2797 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2798 Opcode, VTy->getElementType(), CostKind);
2799 ArithCost *= VTy->getNumElements();
2800
2801 return ExtractCost + ArithCost;
2802 }
2803
2805 std::optional<FastMathFlags> FMF,
2807 assert(Ty && "Unknown reduction vector type");
2809 return getOrderedReductionCost(Opcode, Ty, CostKind);
2810 return getTreeReductionCost(Opcode, Ty, CostKind);
2811 }
2812
2813 /// Try to calculate op costs for min/max reduction operations.
2816 FastMathFlags FMF,
2818 // Targets must implement a default value for the scalable case, since
2819 // we don't know how many lanes the vector has.
2820 if (isa<ScalableVectorType>(Ty))
2822
2823 Type *ScalarTy = Ty->getElementType();
2824 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2825 unsigned NumReduxLevels = Log2_32(NumVecElts);
2826 InstructionCost MinMaxCost = 0;
2827 InstructionCost ShuffleCost = 0;
2828 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2829 unsigned LongVectorCount = 0;
2830 unsigned MVTLen =
2831 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2832 while (NumVecElts > MVTLen) {
2833 NumVecElts /= 2;
2834 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2835
2836 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2837 CostKind, NumVecElts, SubTy);
2838
2839 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2840 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2841 Ty = SubTy;
2842 ++LongVectorCount;
2843 }
2844
2845 NumReduxLevels -= LongVectorCount;
2846
2847 // The minimal length of the vector is limited by the real length of vector
2848 // operations performed on the current platform. That's why several final
2849 // reduction operations are performed on the vectors with the same
2850 // architecture-dependent length.
2851 ShuffleCost +=
2852 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2853 {}, CostKind, 0, Ty);
2854 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2855 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2856 // The last min/max should be in vector registers and we counted it above.
2857 // So we just need a single extractelement.
2858 return ShuffleCost + MinMaxCost +
2859 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2860 CostKind, 0, nullptr, nullptr);
2861 }
2862
2863 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2864 Type *ResTy, VectorType *Ty,
2865 FastMathFlags FMF,
2867 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
2868 FTy && IsUnsigned && Opcode == Instruction::Add &&
2869 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
2870 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2871 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
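 // e.g. i32 vector_reduce_add(zext <8 x i1> %m to <8 x i32>) is costed
 // roughly as (illustrative IR):
 //   %bits = bitcast <8 x i1> %m to i8
 //   %cnt  = call i8 @llvm.ctpop.i8(i8 %bits)
 //   %res  = zext i8 %cnt to i32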
2872 auto *IntTy =
2873 IntegerType::get(ResTy->getContext(), FTy->getNumElements());
2874 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
2875 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
2877 thisT()->getIntrinsicInstrCost(ICA, CostKind);
2878 }
2879 // Without any native support, this is equivalent to the cost of
2880 // vecreduce.opcode(ext(Ty A)).
2881 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2882 InstructionCost RedCost =
2883 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2884 InstructionCost ExtCost = thisT()->getCastInstrCost(
2885 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2887
2888 return RedCost + ExtCost;
2889 }
2890
2892 VectorType *Ty,
2894 // Without any native support, this is equivalent to the cost of
2895 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2896 // vecreduce.add(mul(A, B)).
2897 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2898 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2899 Instruction::Add, ExtTy, std::nullopt, CostKind);
2900 InstructionCost ExtCost = thisT()->getCastInstrCost(
2901 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2903
2904 InstructionCost MulCost =
2905 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2906
2907 return RedCost + MulCost + 2 * ExtCost;
2908 }
2909
2911
2912 /// @}
2913};
2914
2915/// Concrete BasicTTIImpl that can be used if no further customization
2916/// is needed.
2917class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2919
2920 friend class BasicTTIImplBase<BasicTTIImpl>;
2921
2922 const TargetSubtargetInfo *ST;
2923 const TargetLoweringBase *TLI;
2924
2925 const TargetSubtargetInfo *getST() const { return ST; }
2926 const TargetLoweringBase *getTLI() const { return TLI; }
2927
2928public:
2929 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2930};
2931
2932} // end namespace llvm
2933
2934#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
uint32_t Index
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
an instruction to allocate memory on the stack
Definition: Instructions.h:63
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:80
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:469
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:327
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:767
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:623
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:596
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:957
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:796
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:801
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:459
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:703
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:714
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:787
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:376
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:463
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:779
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:800
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:474
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:550
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:617
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:485
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:417
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:437
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:429
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:729
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:771
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:321
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:423
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:349
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:876
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
unsigned getEpilogueVectorizationMinVF()
Definition: BasicTTIImpl.h:710
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:393
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:479
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:582
bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const
Definition: BasicTTIImpl.h:840
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:747
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:445
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const
Definition: BasicTTIImpl.h:854
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:325
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:719
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:299
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:397
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:307
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:860
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:753
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:783
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:363
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:695
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:340
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:586
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:575
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:359
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:615
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:372
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:905
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:802
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:723
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:331
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:368
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:335
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:763
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:345
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:323
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const
Definition: BasicTTIImpl.h:844
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:621
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:736
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:353
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:290
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const
Definition: BasicTTIImpl.h:849
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:616
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:441
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
CmpInst::Predicate getLTPredicate() const
CmpInst::Predicate getGTPredicate() const
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
The core instruction combiner logic.
Definition: InstCombiner.h:48
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associativity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
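A minimal sketch of how these static classifiers are applied to a constant mask before choosing a TTI::ShuffleKind; the mask and element count are illustrative.
  #include "llvm/IR/Instructions.h"
  // {3,2,1,0} over a single <4 x T> source reverses the vector.
  int Mask[] = {3, 2, 1, 0};
  bool IsReverse = ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);
  int Index;
  bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(Mask, 4, Index);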
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
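Minimal sketch of the usual insert-to-deduplicate pattern; Worklist and visit() are hypothetical names standing in for caller state.
  #include "llvm/ADT/SmallPtrSet.h"
  SmallPtrSet<const Value *, 8> Seen;   // inline storage for up to 8 pointers
  for (const Value *V : Worklist)
    if (Seen.insert(V).second)          // .second is true on first insertion
      visit(V);                         // hypothetical callback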
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool shouldExpandCmpUsingSelects(EVT VT) const
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
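Hedged sketch: callers of the arithmetic-cost hooks gather operand properties this way; I is an assumed Instruction pointer.
  TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(1));
  // Op2Info.Kind is e.g. OK_UniformConstantValue for a splatted constant, and
  // Op2Info.Properties is OP_PowerOf2 when that constant is a power of two.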
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:383
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1681
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:568
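A minimal sketch of the Triple queries above on an illustrative triple string.
  #include "llvm/TargetParser/Triple.h"
  Triple T("arm64-apple-macosx14.0");
  bool Darwin64 = T.isOSDarwin() && T.isArch64Bit(); // true for this triple
  Triple::ArchType Arch = T.getArch();               // Triple::aarch64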
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
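Sketch of the Type queries listed above, assuming Ctx is an LLVMContext.
  Type *V4I16 = FixedVectorType::get(Type::getIntNTy(Ctx, 16), 4); // <4 x i16>
  unsigned EltBits = V4I16->getScalarSizeInBits();                 // 16
  Type *V4I32 = V4I16->getWithNewBitWidth(32);                     // <4 x i32>
  Type *Elt   = V4I16->getScalarType();                            // i16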
Value * getOperand(unsigned i) const
Definition: User.h:228
static bool isVPBinOp(Intrinsic::ID ID)
static bool isVPCast(Intrinsic::ID ID)
static bool isVPCmp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:531
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
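Sketch of the scalable-quantity queries above, applied to a fixed element count.
  ElementCount EC = ElementCount::getFixed(4);
  bool Scalable = EC.isScalable();          // false: plain <4 x T>
  unsigned MinElts = EC.getKnownMinValue(); // 4
  bool LT = ElementCount::isKnownLT(EC, ElementCount::getFixed(8)); // true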
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:374
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:705
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1551
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
Definition: Intrinsics.cpp:617
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
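Minimal sketch of all_of and enumerate over a small container, assuming the usual llvm namespace; the values are illustrative.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  SmallVector<int, 4> Vals = {2, 4, 8};
  bool AllEven = all_of(Vals, [](int V) { return V % 2 == 0; }); // true
  for (const auto &En : enumerate(Vals))
    (void)En.index(); // 0, 1, 2 alongside En.value() == 2, 4, 8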
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:960
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
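Sketch of the math helpers above with concrete values.
  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  static_assert(isPowerOf2_32(64), "64 is a power of two");
  static_assert(has_single_bit(64u), "exactly one bit set");
  unsigned FloorLog = Log2_32(64);       // 6
  unsigned Pieces   = divideCeil(10, 4); // 3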
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).