LLVM 18.0.0git
BasicTTIImpl.h
//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zeroth element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }
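
  // Worked example (added for illustration; not from the upstream header):
  // for a hypothetical <4 x float> broadcast on a target whose per-element
  // insert/extract cost is 1, the estimate above is
  //   1 (extract lane 0) + 4 (insert into each result lane) = 5.
  // Targets with a native splat instruction are expected to override the
  // shuffle or vector-instruction hooks so this fallback is not used.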

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // source vector plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with a mask of <0,5,2,7> needs to extract
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting the
    // elements from the source type plus the cost of inserting them into the
    // result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting the
    // elements from the source type plus the cost of inserting them into the
    // result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 CostKind, -1, nullptr, nullptr)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // plus the branches and PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }
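
  // Worked example (illustration only; not part of the upstream header): a
  // masked gather of <4 x i32> with a variable mask decomposes above into
  //   4 x (extract pointer + scalar load)              -> LoadCost
  //   + inserting the 4 loaded values into a vector    -> PackingCost
  //   + 4 x (extract mask bit + branch + PHI)          -> ConditionalCost
  // With unit costs that is roughly 4*2 + 4 + 4*3 = 24, which is why targets
  // with real gather/scatter support override this hook.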

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  bool isSingleThreaded() const {
    return getTLI()->getTargetMachine().Options.ThreadModel ==
           ThreadModel::Single;
  }

  std::pair<const Value *, unsigned>
  getPredicatedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }
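
  // Example (illustrative, not from the upstream header): a query for an
  // address of the form  base + 4*index + 16  fills in AM as
  //   BaseGV = nullptr, BaseOffs = 16, HasBaseReg = true, Scale = 4
  // and the answer comes from the target's TargetLowering::
  // isLegalAddressingMode override.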

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
      EVT LegalizedVT =
          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
    };
    while (VF > 2 && IsSupportedByTarget(VF))
      VF /= 2;
    return VF;
  }
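
  // Sketch of the search above (illustration only): starting from VF = 8 with
  // i32 elements, the lambda asks whether a <4 x i32> store (VF / 2 elements)
  // is legal or custom; if so, VF becomes 4 and the <2 x i32> store is checked
  // next. Halving stops at the smallest VF whose half-width store the target
  // cannot handle, and that VF is returned.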

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldFoldTerminatingConditionAfterLSR() const {
    return TargetTransformInfoImplBase::
        shouldFoldTerminatingConditionAfterLSR();
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of switch in
    /// inline cost heuristic, but it's a generic cost model to be used in other
    /// places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }
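
  // Worked example (illustrative): a switch over cases {0, 1, ..., 9} with a
  // common default is dense, so isSuitableForJumpTable normally accepts it;
  // the function then sets JumpTableSize = 10 and reports a single cluster.
  // A switch over {0, 1000000, 2000000000} is too sparse for either a table
  // or a bit test, so it falls through and reports N = 3 clusters.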

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isExpensiveToSpeculativelyExecute(const Instruction *I) {
    return true;
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isOperationLegalOrCustom(ISD::FADD, VT) &&
           TLI->isTypeLegal(VT);
  }

  unsigned getInliningThresholdMultiplier() const { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
    return 0;
  }

  int getInlinerVectorBonusPercent() const { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          if (ORE) {
            ORE->emit([&]() {
              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
                                        L->getHeader())
                     << "advising against unrolling the loop because it "
                        "contains a "
                     << ore::NV("Call", &I);
            });
          }
          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return BaseT::preferPredicateOverEpilogue(TFI);
  }

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
    return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  std::optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  virtual std::optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return std::optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual std::optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    std::optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
    return getST()->shouldPrefetchAddressSpace(AS);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
  std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
  bool isVScaleKnownToBeAPowerOfTwo() const { return false; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract,
                                           TTI::TargetCostKind CostKind) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
                                            CostKind, i, nullptr, nullptr);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                            CostKind, i, nullptr, nullptr);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract,
                                           TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                             CostKind);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each of
  /// the arguments are passed via Tys.
  InstructionCost
  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                   ArrayRef<Type *> Tys,
                                   TTI::TargetCostKind CostKind) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys,
                                           TTI::TargetCostKind CostKind) {
    InstructionCost Cost = getScalarizationOverhead(
        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
                                       /*Extract*/ true, CostKind);

    return Cost;
  }

  /// Estimate the cost of type-legalization and the legalized type.
  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
    LLVMContext &C = Ty->getContext();
    EVT MTy = getTLI()->getValueType(DL, Ty);

    InstructionCost Cost = 1;
    // We keep legalizing the type until we find a legal kind. We assume that
    // the only operation that costs anything is the split. After splitting
    // we need to handle two types.
    while (true) {
      TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);

      if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
        // Ensure we return a sensible simple VT here, since many callers of
        // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());

      if (LK.first == TargetLoweringBase::TypeSplitVector ||
          LK.first == TargetLoweringBase::TypeExpandInteger)
        Cost *= 2;

      // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());

      // Keep legalizing the type.
      MTy = LK.second;
    }
  }
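
  // Worked example (illustration only): on a target whose widest legal vector
  // is 128 bits, a <8 x i64> type is split twice (<8 x i64> -> 2 x <4 x i64>
  // -> 4 x <2 x i64>), so the returned cost is 4 with MVT::v2i64 as the
  // legalized type; callers multiply their per-operation estimate by that
  // factor.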

  unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Args, CxtI);
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
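
  // Worked example (illustrative): for a urem that the target can only
  // expand but whose udiv is legal, the path above charges
  //   cost(udiv) + cost(mul) + cost(sub)
  // modelling X % Y as X - (X / Y) * Y, roughly matching how the node is
  // expanded during selection.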

  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask,
                                              VectorType *Ty, int &Index,
                                              VectorType *&SubTy) const {
    int Limit = Mask.size() * 2;
    if (Mask.empty() ||
        // Extra check required by isSingleSourceMaskImpl function (called by
        // ShuffleVectorInst::isSingleSourceMask).
        any_of(Mask, [Limit](int I) { return I >= Limit; }))
      return Kind;
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()))
        return TTI::SK_Broadcast;
      break;
    case TTI::SK_PermuteTwoSrc: {
      int NumSubElts;
      if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                                 Mask, Mask.size(), NumSubElts, Index)) {
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
        return TTI::SK_InsertSubvector;
      }
      if (ShuffleVectorInst::isSelectMask(Mask, Mask.size()))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask, Mask.size()))
        return TTI::SK_Transpose;
      if (ShuffleVectorInst::isSpliceMask(Mask, Mask.size(), Index))
        return TTI::SK_Splice;
      break;
    }
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::BitCast:
      // Bitcast between types that are legalized to the same type are free and
      // assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      [[fallthrough]];
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
                                      CostKind) +
             Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
                                                /*Extract*/ true, CostKind)
                     : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
                                                /*Extract*/ false, CostKind)
                     : 0);
    }

    llvm_unreachable("Unhandled cast");
  }
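
  // Illustrative examples (not part of the upstream header):
  //  * trunc i64 -> i32 on a 64-bit target is typically free (isTruncateFree).
  //  * zext i32 -> i64 of a load costs 0 when the target has a legal
  //    extending load (ISD::ZEXTLOAD) for that type pair.
  //  * a conversion on <8 x i32> when only 128-bit vectors are legal is
  //    costed as two casts of <4 x i32> plus one split, via the
  //    SplitSrc / SplitDst path above.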

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       CostKind, Index, nullptr, nullptr) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None, CostKind);
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }

  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                     CmpInst::Predicate VecPred,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the cast is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      if (isa<ScalableVectorType>(ValTy))
        return InstructionCost::getInvalid();

      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      InstructionCost Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, /*Insert*/ true,
                                      /*Extract*/ false, CostKind) +
             Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1) {
    return getRegUsageForType(Val->getScalarType());
  }

  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index) {
    Value *Op0 = nullptr;
    Value *Op1 = nullptr;
    if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
      Op0 = IE->getOperand(0);
      Op1 = IE->getOperand(1);
    }
    return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
                                       Op1);
  }

  InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                            int VF,
                                            const APInt &DemandedDstElts,
                                            TTI::TargetCostKind CostKind) {
    assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
           "Unexpected size of DemandedDstElts.");

    InstructionCost Cost;

    auto *SrcVT = FixedVectorType::get(EltTy, VF);
    auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);

    // The Mask shuffling cost is extract all the elements of the Mask
    // and insert each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //    %mask = icmp ult <8 x i32> %vec1, %vec2
    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extract all mask elements from the <8xi1> mask
    // vector and insert them factor times into the <24xi1> shuffled mask
    // vector.
    APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
    Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
                                              /*Insert*/ false,
                                              /*Extract*/ true, CostKind);
    Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
                                              /*Insert*/ true,
                                              /*Extract*/ false, CostKind);

    return Cost;
  }

  InstructionCost
  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

    // Assuming that all loads of legal types cost 1.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(
            cast<VectorType>(Src), Opcode != Instruction::Store,
            Opcode == Instruction::Store, CostKind);
      }
    }

    return Cost;
  }
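
  // Illustrative example: a load of <4 x i64> when only v2i64 is legal
  // legalizes to two loads, so the returned cost is LT.first = 2. If the
  // vector instead legalizes to a wider type and the matching extending load
  // or truncating store is not legal, the scalarization overhead above is
  // added on top.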

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                        Align Alignment, unsigned AddressSpace,
                                        TTI::TargetCostKind CostKind) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
                                       CostKind);
  }

  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I = nullptr) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                       true, CostKind);
  }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(VecTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // Firstly, the cost of load/store operation.
    InstructionCost Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // TODO: Note that legalization can turn masked loads/stores into unmasked
    // (legalized) loads/stores. This can be reflected in the cost.
    if (Cost.isValid() && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
    }

    // Then plus the cost of interleave operation.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");

    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extract sub vectors' elements
      // from the wide vector, and insert them into sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.
      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ true, /*Extract*/ false, CostKind);
      Cost += Indices.size() * InsSubCost;
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ false,
                                                /*Extract*/ true, CostKind);
    } else {
      // The interleave cost is extract elements from sub vectors, and
      // insert them into the wide vector.
      //
      // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
      // (using VF=4):
      //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
      //    %gaps.mask = <true, true, false, true, true, false,
      //                  true, true, false, true, true, false>
      //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
      //         i32 Align, <12 x i1> %gaps.mask
      // The cost is estimated as extract all elements (of actual members,
      // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
      // i32> vector.
      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
          SubVT, DemandedAllSubElts,
          /*Insert*/ false, /*Extract*/ true, CostKind);
      Cost += ExtSubCost * Indices.size();
      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                /*Insert*/ true,
                                                /*Extract*/ false, CostKind);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());

    Cost += thisT()->getReplicationShuffleCost(
        I8Type, Factor, NumSubElts,
        UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);
    }

    return Cost;
  }
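
  // Worked example (illustration only): an interleaved load of factor 2 from
  // <8 x i32> with both members used is costed as the wide load, plus
  // inserting 4 lanes into each of the two <4 x i32> sub-vectors, plus
  // extracting the 8 demanded lanes from the wide vector; the mask-handling
  // terms are added only when UseMaskForCond / UseMaskForGaps are set.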

  /// Get intrinsic cost based on arguments.
  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind) {
    // Check for generically free intrinsics.
    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // Assume that target intrinsics are cheap.
    Intrinsic::ID IID = ICA.getID();
    if (Function::isTargetIntrinsic(IID))
      return TargetTransformInfo::TCC_Basic;

    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();

    ElementCount RetVF =
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    switch (IID) {
    default:
      break;

    case Intrinsic::powi:
      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
        bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
        if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
                                               ShouldOptForSize)) {
          // The cost is modeled on the expansion performed by ExpandPowI in
          // SelectionDAGBuilder.
          APInt Exponent = RHSC->getValue().abs();
          unsigned ActiveBits = Exponent.getActiveBits();
          unsigned PopCount = Exponent.popcount();
          InstructionCost Cost = (ActiveBits + PopCount - 2) *
                                 thisT()->getArithmeticInstrCost(
                                     Instruction::FMul, RetTy, CostKind);
          if (RHSC->isNegative())
            Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
                                                    CostKind);
          return Cost;
        }
      }
      break;
    case Intrinsic::cttz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::ctlz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::memcpy:
      return thisT()->getMemcpyCost(ICA.getInst());

    case Intrinsic::masked_scatter: {
      const Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Store,
                                             ICA.getArgTypes()[0], Args[1],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::masked_gather: {
      const Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::experimental_stepvector: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      // The cost of materialising a constant integer vector.
      return TargetTransformInfo::TCC_Basic;
    }
    case Intrinsic::vector_extract: {
      // FIXME: Handle case where a scalable vector is extracted from a scalable
      // vector
      if (isa<ScalableVectorType>(RetTy))
        return InstructionCost::getInvalid();
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_insert: {
      // FIXME: Handle case where a scalable vector is inserted into a scalable
      // vector
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return InstructionCost::getInvalid();
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
    }
    case Intrinsic::experimental_vector_reverse: {
      return thisT()->getShuffleCost(
          TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, 0, cast<VectorType>(RetTy));
    }
    case Intrinsic::experimental_vector_splice: {
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
    case Intrinsic::vector_reduce_fmaximum:
    case Intrinsic::vector_reduce_fminimum:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      IntrinsicCostAttributes Attrs(
          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      const Value *X = Args[0];
      const Value *Y = Args[1];
      const Value *Z = Args[2];
      const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
      const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
      const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
      const TTI::OperandValueInfo OpInfoBW =
          {TTI::OK_UniformConstantValue,
           isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                       : TTI::OP_None};

      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
      InstructionCost Cost = 0;
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
          {OpInfoZ.Kind, TTI::OP_None});
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
          {OpInfoZ.Kind, TTI::OP_None});
      // Non-constant shift amounts require a modulo.
      if (!OpInfoZ.isConstant())
        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                CostKind, OpInfoZ, OpInfoBW);
      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                        CmpInst::ICMP_EQ, CostKind);
      }
      return Cost;
    }
    case Intrinsic::get_active_lane_mask: {
      EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);

      // If we're not expanding the intrinsic then we assume this is cheap
      // to implement.
      if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
        return getTypeLegalizationCost(RetTy).first;
      }

      // Create the expanded types that will be used to calculate the uadd_sat
      // operation.
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
      InstructionCost Cost =
          thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
      Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
                                          CmpInst::ICMP_ULT, CostKind);
      return Cost;
    }
    }

    // Assume that we need to scalarize this intrinsic.
    // Compute the scalarization overhead based on Args for a vector
    // intrinsic.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF.isVector() && !RetVF.isScalable()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost += getScalarizationOverhead(
            cast<VectorType>(RetTy),
            /*Insert*/ true, /*Extract*/ false, CostKind);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  InstructionCost
  getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                 TTI::TargetCostKind CostKind) {
    Intrinsic::ID IID = ICA.getID();
    Type *RetTy = ICA.getReturnType();
    const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
    FastMathFlags FMF = ICA.getFlags();
    InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
    bool SkipScalarizationCost = ICA.skipScalarizationCost();

    VectorType *VecOpTy = nullptr;
    if (!Tys.empty()) {
      // The vector reduction operand is operand 0 except for fadd/fmul.
      // Their operand 0 is a scalar start value, so the vector op is operand 1.
      unsigned VecTyIndex = 0;
      if (IID == Intrinsic::vector_reduce_fadd ||
          IID == Intrinsic::vector_reduce_fmul)
        VecTyIndex = 1;
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
    }

    // Library call cost - other than size, make it expensive.
    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
    unsigned ISD = 0;
    switch (IID) {
    default: {
      // Scalable vectors cannot be scalarized, so return Invalid.
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
          }))
        return InstructionCost::getInvalid();

      // Assume that we need to scalarize this intrinsic.
      InstructionCost ScalarizationCost =
          SkipScalarizationCost ? ScalarizationCostPassed : 0;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
        if (!SkipScalarizationCost)
          ScalarizationCost = getScalarizationOverhead(
              RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
        ScalarCalls = std::max(ScalarCalls,
                               cast<FixedVectorType>(RetVTy)->getNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
          if (!SkipScalarizationCost)
            ScalarizationCost += getScalarizationOverhead(
                VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
      InstructionCost ScalarCost =
          thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
1781 // Look for intrinsics that can be lowered directly or turned into a scalar
1782 // intrinsic call.
1783 case Intrinsic::sqrt:
1784 ISD = ISD::FSQRT;
1785 break;
1786 case Intrinsic::sin:
1787 ISD = ISD::FSIN;
1788 break;
1789 case Intrinsic::cos:
1790 ISD = ISD::FCOS;
1791 break;
1792 case Intrinsic::exp:
1793 ISD = ISD::FEXP;
1794 break;
1795 case Intrinsic::exp2:
1796 ISD = ISD::FEXP2;
1797 break;
1798 case Intrinsic::exp10:
1799 ISD = ISD::FEXP10;
1800 break;
1801 case Intrinsic::log:
1802 ISD = ISD::FLOG;
1803 break;
1804 case Intrinsic::log10:
1805 ISD = ISD::FLOG10;
1806 break;
1807 case Intrinsic::log2:
1808 ISD = ISD::FLOG2;
1809 break;
1810 case Intrinsic::fabs:
1811 ISD = ISD::FABS;
1812 break;
1813 case Intrinsic::canonicalize:
1814 ISD = ISD::FCANONICALIZE;
1815 break;
1816 case Intrinsic::minnum:
1817 ISD = ISD::FMINNUM;
1818 break;
1819 case Intrinsic::maxnum:
1820 ISD = ISD::FMAXNUM;
1821 break;
1822 case Intrinsic::minimum:
1823 ISD = ISD::FMINIMUM;
1824 break;
1825 case Intrinsic::maximum:
1826 ISD = ISD::FMAXIMUM;
1827 break;
1828 case Intrinsic::copysign:
1829 ISD = ISD::FCOPYSIGN;
1830 break;
1831 case Intrinsic::floor:
1832 ISD = ISD::FFLOOR;
1833 break;
1834 case Intrinsic::ceil:
1835 ISD = ISD::FCEIL;
1836 break;
1837 case Intrinsic::trunc:
1838 ISD = ISD::FTRUNC;
1839 break;
1840 case Intrinsic::nearbyint:
1841 ISD = ISD::FNEARBYINT;
1842 break;
1843 case Intrinsic::rint:
1844 ISD = ISD::FRINT;
1845 break;
1846 case Intrinsic::round:
1847 ISD = ISD::FROUND;
1848 break;
1849 case Intrinsic::roundeven:
1850 ISD = ISD::FROUNDEVEN;
1851 break;
1852 case Intrinsic::pow:
1853 ISD = ISD::FPOW;
1854 break;
1855 case Intrinsic::fma:
1856 ISD = ISD::FMA;
1857 break;
1858 case Intrinsic::fmuladd:
1859 ISD = ISD::FMA;
1860 break;
1861 case Intrinsic::experimental_constrained_fmuladd:
1862 ISD = ISD::STRICT_FMA;
1863 break;
1864 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1865 case Intrinsic::lifetime_start:
1866 case Intrinsic::lifetime_end:
1867 case Intrinsic::sideeffect:
1868 case Intrinsic::pseudoprobe:
1869 case Intrinsic::arithmetic_fence:
1870 return 0;
1871 case Intrinsic::masked_store: {
1872 Type *Ty = Tys[0];
1873 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1874 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1875 CostKind);
1876 }
1877 case Intrinsic::masked_load: {
1878 Type *Ty = RetTy;
1879 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1880 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1881 CostKind);
1882 }
1883 case Intrinsic::vector_reduce_add:
1884 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1885 std::nullopt, CostKind);
1886 case Intrinsic::vector_reduce_mul:
1887 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1888 std::nullopt, CostKind);
1889 case Intrinsic::vector_reduce_and:
1890 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1891 std::nullopt, CostKind);
1892 case Intrinsic::vector_reduce_or:
1893 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1894 std::nullopt, CostKind);
1895 case Intrinsic::vector_reduce_xor:
1896 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1897 std::nullopt, CostKind);
1898 case Intrinsic::vector_reduce_fadd:
1899 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1900 FMF, CostKind);
1901 case Intrinsic::vector_reduce_fmul:
1902 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1903 FMF, CostKind);
1904 case Intrinsic::vector_reduce_smax:
1905 return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy,
1906 ICA.getFlags(), CostKind);
1907 case Intrinsic::vector_reduce_smin:
1908 return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy,
1909 ICA.getFlags(), CostKind);
1910 case Intrinsic::vector_reduce_umax:
1911 return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy,
1912 ICA.getFlags(), CostKind);
1913 case Intrinsic::vector_reduce_umin:
1914 return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy,
1915 ICA.getFlags(), CostKind);
1916 case Intrinsic::vector_reduce_fmax:
1917 return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy,
1918 ICA.getFlags(), CostKind);
1919 case Intrinsic::vector_reduce_fmin:
1920 return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy,
1921 ICA.getFlags(), CostKind);
1922 case Intrinsic::vector_reduce_fmaximum:
1923 return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy,
1924 ICA.getFlags(), CostKind);
1925 case Intrinsic::vector_reduce_fminimum:
1926 return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy,
1927 ICA.getFlags(), CostKind);
1928 case Intrinsic::abs: {
1929 // abs(X) = select(icmp(X,0),X,sub(0,X))
1930 Type *CondTy = RetTy->getWithNewBitWidth(1);
1931 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1932 InstructionCost Cost = 0;
1933 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1934 Pred, CostKind);
1935 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1936 Pred, CostKind);
1937 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1938 Cost += thisT()->getArithmeticInstrCost(
1939 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
1940 return Cost;
1941 }
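// Illustrative sketch of the expansion this cost models (assuming an i32
// input %x; actual lowering varies by target):
//   %neg = sub i32 0, %x
//   %cmp = icmp sgt i32 %x, 0
//   %abs = select i1 %cmp, i32 %x, i32 %neg
// i.e. the icmp + select + sub costs summed above.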
1942 case Intrinsic::smax:
1943 case Intrinsic::smin:
1944 case Intrinsic::umax:
1945 case Intrinsic::umin: {
1946 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1947 Type *CondTy = RetTy->getWithNewBitWidth(1);
1948 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1949 CmpInst::Predicate Pred =
1950 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1951 InstructionCost Cost = 0;
1952 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1953 Pred, CostKind);
1954 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1955 Pred, CostKind);
1956 return Cost;
1957 }
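// Illustrative sketch of the modeled expansion (assuming i32 operands and
// IID == Intrinsic::smax; the other variants only change the predicate):
//   %cmp = icmp sgt i32 %x, %y
//   %max = select i1 %cmp, i32 %x, i32 %y
// which is why the cost is a single compare plus a single select.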
1958 case Intrinsic::sadd_sat:
1959 case Intrinsic::ssub_sat: {
1960 Type *CondTy = RetTy->getWithNewBitWidth(1);
1961
1962 Type *OpTy = StructType::create({RetTy, CondTy});
1963 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1964 ? Intrinsic::sadd_with_overflow
1965 : Intrinsic::ssub_with_overflow;
1966 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1967
1968 // SatMax -> Overflow && SumDiff < 0
1969 // SatMin -> Overflow && SumDiff >= 0
1970 InstructionCost Cost = 0;
1971 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1972 nullptr, ScalarizationCostPassed);
1973 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1974 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1975 Pred, CostKind);
1976 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1977 CondTy, Pred, CostKind);
1978 return Cost;
1979 }
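// Rough sketch of the expansion being costed (assuming i32 @llvm.sadd.sat;
// the constants shown are the i32 saturation bounds, for illustration only):
//   %o   = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %x, i32 %y)
//   %sum = extractvalue {i32, i1} %o, 0
//   %ovf = extractvalue {i32, i1} %o, 1
//   %neg = icmp slt i32 %sum, 0
//   %sat = select i1 %neg, i32 2147483647, i32 -2147483648
//   %res = select i1 %ovf, i32 %sat, i32 %sum
// hence one overflow intrinsic, one compare, and two selects.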
1980 case Intrinsic::uadd_sat:
1981 case Intrinsic::usub_sat: {
1982 Type *CondTy = RetTy->getWithNewBitWidth(1);
1983
1984 Type *OpTy = StructType::create({RetTy, CondTy});
1985 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1986 ? Intrinsic::uadd_with_overflow
1987 : Intrinsic::usub_with_overflow;
1988
1989 InstructionCost Cost = 0;
1990 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1991 nullptr, ScalarizationCostPassed);
1992 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1993 Cost +=
1994 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1995 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1996 return Cost;
1997 }
1998 case Intrinsic::smul_fix:
1999 case Intrinsic::umul_fix: {
2000 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2001 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2002
2003 unsigned ExtOp =
2004 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2005 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2006
2007 InstructionCost Cost = 0;
2008 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2009 Cost +=
2010 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2011 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2012 CCH, CostKind);
2013 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2014 CostKind,
2015 {TTI::OK_AnyValue, TTI::OP_None},
2016 {TTI::OK_UniformConstantValue, TTI::OP_None});
2017 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2018 {TTI::OK_AnyValue, TTI::OP_None},
2019 {TTI::OK_UniformConstantValue, TTI::OP_None});
2020 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2021 return Cost;
2022 }
2023 case Intrinsic::sadd_with_overflow:
2024 case Intrinsic::ssub_with_overflow: {
2025 Type *SumTy = RetTy->getContainedType(0);
2026 Type *OverflowTy = RetTy->getContainedType(1);
2027 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2028 ? BinaryOperator::Add
2029 : BinaryOperator::Sub;
2030
2031 // Add:
2032 // Overflow -> (Result < LHS) ^ (RHS < 0)
2033 // Sub:
2034 // Overflow -> (Result < LHS) ^ (RHS > 0)
2035 InstructionCost Cost = 0;
2036 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2037 Cost += 2 * thisT()->getCmpSelInstrCost(
2038 Instruction::ICmp, SumTy, OverflowTy,
2039 CmpInst::ICMP_SGT, CostKind);
2040 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2041 CostKind);
2042 return Cost;
2043 }
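// Sketch of the overflow check modeled above (assuming i32 operands and
// IID == Intrinsic::sadd_with_overflow):
//   %sum = add i32 %lhs, %rhs
//   %c0  = icmp slt i32 %sum, %lhs
//   %c1  = icmp slt i32 %rhs, 0
//   %ovf = xor i1 %c0, %c1
// i.e. one add, two compares, and one xor.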
2044 case Intrinsic::uadd_with_overflow:
2045 case Intrinsic::usub_with_overflow: {
2046 Type *SumTy = RetTy->getContainedType(0);
2047 Type *OverflowTy = RetTy->getContainedType(1);
2048 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2049 ? BinaryOperator::Add
2050 : BinaryOperator::Sub;
2051 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2052 ? CmpInst::ICMP_ULT
2053 : CmpInst::ICMP_UGT;
2054
2055 InstructionCost Cost = 0;
2056 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2057 Cost +=
2058 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2059 Pred, CostKind);
2060 return Cost;
2061 }
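// Sketch of the unsigned variant (assuming i32 @llvm.uadd.with.overflow):
//   %sum = add i32 %lhs, %rhs
//   %ovf = icmp ult i32 %sum, %lhs   ; wrapped iff the sum fell below an input
// so only one add and one compare are costed.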
2062 case Intrinsic::smul_with_overflow:
2063 case Intrinsic::umul_with_overflow: {
2064 Type *MulTy = RetTy->getContainedType(0);
2065 Type *OverflowTy = RetTy->getContainedType(1);
2066 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2067 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2068 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2069
2070 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2071 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2072
2073 InstructionCost Cost = 0;
2074 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2075 Cost +=
2076 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2077 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2078 CCH, CostKind);
2079 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2080 CostKind,
2081 {TTI::OK_AnyValue, TTI::OP_None},
2082 {TTI::OK_UniformConstantValue, TTI::OP_None});
2083
2084 if (IsSigned)
2085 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2086 CostKind,
2087 {TTI::OK_AnyValue, TTI::OP_None},
2088 {TTI::OK_UniformConstantValue, TTI::OP_None});
2089
2090 Cost += thisT()->getCmpSelInstrCost(
2091 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2092 return Cost;
2093 }
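// Sketch of the widened-multiply check being costed (assuming i32 operands
// and the signed variant; illustrative only):
//   %a    = sext i32 %lhs to i64
//   %b    = sext i32 %rhs to i64
//   %m    = mul i64 %a, %b
//   %lo   = trunc i64 %m to i32
//   %s    = lshr i64 %m, 32
//   %hi   = trunc i64 %s to i32
//   %sign = ashr i32 %lo, 31
//   %ovf  = icmp ne i32 %hi, %sign
// matching the extend, multiply, truncate, shift, and compare costs above.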
2094 case Intrinsic::fptosi_sat:
2095 case Intrinsic::fptoui_sat: {
2096 if (Tys.empty())
2097 break;
2098 Type *FromTy = Tys[0];
2099 bool IsSigned = IID == Intrinsic::fptosi_sat;
2100
2101 InstructionCost Cost = 0;
2102 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2103 {FromTy, FromTy});
2104 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2105 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2106 {FromTy, FromTy});
2107 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2108 Cost += thisT()->getCastInstrCost(
2109 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2110 TTI::CastContextHint::None, CostKind);
2111 if (IsSigned) {
2112 Type *CondTy = RetTy->getWithNewBitWidth(1);
2113 Cost += thisT()->getCmpSelInstrCost(
2114 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2115 Cost += thisT()->getCmpSelInstrCost(
2116 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2117 }
2118 return Cost;
2119 }
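// Sketch of the saturating conversion being costed (assuming float -> i32,
// signed; the clamp constants and NaN result below are illustrative only):
//   %lo  = call float @llvm.maxnum.f32(float %x, float -2.14748365e+09)
//   %hi  = call float @llvm.minnum.f32(float %lo, float 2.14748352e+09)
//   %cvt = fptosi float %hi to i32
//   %nan = fcmp uno float %x, %x
//   %res = select i1 %nan, i32 0, i32 %cvt
// i.e. a min, a max, and a conversion, plus a compare and select to handle
// NaN inputs in the signed case.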
2120 case Intrinsic::ctpop:
2121 ISD = ISD::CTPOP;
2122 // In case of legalization use TCC_Expensive. This is cheaper than a
2123 // library call but still not a cheap instruction.
2124 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2125 break;
2126 case Intrinsic::ctlz:
2127 ISD = ISD::CTLZ;
2128 break;
2129 case Intrinsic::cttz:
2130 ISD = ISD::CTTZ;
2131 break;
2132 case Intrinsic::bswap:
2133 ISD = ISD::BSWAP;
2134 break;
2135 case Intrinsic::bitreverse:
2136 ISD = ISD::BITREVERSE;
2137 break;
2138 }
2139
2140 const TargetLoweringBase *TLI = getTLI();
2141 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2142
2143 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2144 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2145 TLI->isFAbsFree(LT.second)) {
2146 return 0;
2147 }
2148
2149 // The operation is legal. Assume it costs 1.
2150 // If the type is split to multiple registers, assume that there is some
2151 // overhead to this.
2152 // TODO: Once we have extract/insert subvector cost we need to use them.
2153 if (LT.first > 1)
2154 return (LT.first * 2);
2155 else
2156 return (LT.first * 1);
2157 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2158 // If the operation is custom lowered then assume
2159 // that the code is twice as expensive.
2160 return (LT.first * 2);
2161 }
2162
2163 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2164 // point mul followed by an add.
2165 if (IID == Intrinsic::fmuladd)
2166 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2167 CostKind) +
2168 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2169 CostKind);
2170 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2171 IntrinsicCostAttributes FMulAttrs(
2172 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2173 IntrinsicCostAttributes FAddAttrs(
2174 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2175 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2176 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2177 }
2178
2179 // Else, assume that we need to scalarize this intrinsic. For math builtins
2180 // this will emit a costly libcall, adding call overhead and spills. Make it
2181 // very expensive.
2182 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2183 // Scalable vectors cannot be scalarized, so return Invalid.
2184 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2185 return isa<ScalableVectorType>(Ty);
2186 }))
2187 return InstructionCost::getInvalid();
2188
2189 InstructionCost ScalarizationCost =
2190 SkipScalarizationCost
2191 ? ScalarizationCostPassed
2192 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2193 /*Extract*/ false, CostKind);
2194
2195 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2196 SmallVector<Type *, 4> ScalarTys;
2197 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2198 Type *Ty = Tys[i];
2199 if (Ty->isVectorTy())
2200 Ty = Ty->getScalarType();
2201 ScalarTys.push_back(Ty);
2202 }
2203 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2204 InstructionCost ScalarCost =
2205 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2206 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2207 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2208 if (!ICA.skipScalarizationCost())
2209 ScalarizationCost += getScalarizationOverhead(
2210 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2211 ScalarCalls = std::max(ScalarCalls,
2212 cast<FixedVectorType>(VTy)->getNumElements());
2213 }
2214 }
2215 return ScalarCalls * ScalarCost + ScalarizationCost;
2216 }
2217
2218 // This is going to be turned into a library call, make it expensive.
2219 return SingleCallCost;
2220 }
2221
2222 /// Compute a cost of the given call instruction.
2223 ///
2224 /// Compute the cost of calling function F with return type RetTy and
2225 /// argument types Tys. F might be nullptr; in that case the cost of an
2226 /// arbitrary call with the specified signature will be returned.
2227 /// This is used, for instance, when we estimate the cost of a call to a
2228 /// vector counterpart of the given function.
2229 /// \param F Called function, might be nullptr.
2230 /// \param RetTy Return value type.
2231 /// \param Tys Argument types.
2232 /// \returns The cost of Call instruction.
2233 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2234 ArrayRef<Type *> Tys,
2235 TTI::TargetCostKind CostKind) {
2236 return 10;
2237 }
2238
2239 unsigned getNumberOfParts(Type *Tp) {
2240 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2241 return LT.first.isValid() ? *LT.first.getValue() : 0;
2242 }
2243
2244 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2245 const SCEV *) {
2246 return 0;
2247 }
2248
2249 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2250 /// We're assuming that reduction operations are performed in the following way:
2251 ///
2252 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2253 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2254 /// \----------------v-------------/ \----------v------------/
2255 /// n/2 elements n/2 elements
2256 /// %red1 = op <n x t> %val, <n x t> val1
2257 /// After this operation we have a vector %red1 where only the first n/2
2258 /// elements are meaningful, the second n/2 elements are undefined and can be
2259 /// dropped. All other operations are actually working with the vector of
2260 /// length n/2, not n, though the real vector length is still n.
2261 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2262 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2263 /// \----------------v-------------/ \----------v------------/
2264 /// n/4 elements 3*n/4 elements
2265 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2266 /// length n/2, the resulting vector has length n/4 etc.
2267 ///
2268 /// The cost model should take into account that the actual length of the
2269 /// vector is reduced on each iteration.
2270 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2271 TTI::TargetCostKind CostKind) {
2272 // Targets must implement a default value for the scalable case, since
2273 // we don't know how many lanes the vector has.
2274 if (isa<ScalableVectorType>(Ty))
2275 return InstructionCost::getInvalid();
2276
2277 Type *ScalarTy = Ty->getElementType();
2278 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2279 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2280 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2281 NumVecElts >= 2) {
2282 // Or reduction for i1 is represented as:
2283 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2284 // %res = cmp ne iReduxWidth %val, 0
2285 // And reduction for i1 is represented as:
2286 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2287 // %res = cmp eq iReduxWidth %val, 11111
2288 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2289 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2290 TTI::CastContextHint::None, CostKind) +
2291 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2292 CmpInst::makeCmpResultType(ValTy),
2293 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2294 }
2295 unsigned NumReduxLevels = Log2_32(NumVecElts);
2296 InstructionCost ArithCost = 0;
2297 InstructionCost ShuffleCost = 0;
2298 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2299 unsigned LongVectorCount = 0;
2300 unsigned MVTLen =
2301 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2302 while (NumVecElts > MVTLen) {
2303 NumVecElts /= 2;
2304 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2305 ShuffleCost +=
2306 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2307 CostKind, NumVecElts, SubTy);
2308 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2309 Ty = SubTy;
2310 ++LongVectorCount;
2311 }
2312
2313 NumReduxLevels -= LongVectorCount;
2314
2315 // The minimal length of the vector is limited by the real length of vector
2316 // operations performed on the current platform. That's why several final
2317 // reduction operations are performed on the vectors with the same
2318 // architecture-dependent length.
2319
2320 // By default reductions need one shuffle per reduction level.
2321 ShuffleCost +=
2322 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2323 std::nullopt, CostKind, 0, Ty);
2324 ArithCost +=
2325 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2326 return ShuffleCost + ArithCost +
2327 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2328 CostKind, 0, nullptr, nullptr);
2329 }
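// Worked example (assuming a hypothetical target whose widest legal vector
// type is <4 x i32>): an add reduction of <16 x i32> is costed as two
// extract-subvector shuffles plus two adds while halving 16 -> 8 -> 4 (the
// loop above), then Log2(4) = 2 single-source shuffle + add levels on the
// legal <4 x i32> type, plus one final extractelement of lane 0.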
2330
2331 /// Try to calculate the cost of performing strict (in-order) reductions,
2332 /// which involves doing a sequence of floating point additions in lane
2333 /// order, starting with an initial value. For example, consider a scalar
2334 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2335 ///
2336 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2337 ///
2338 /// %add1 = %InitVal + %v0
2339 /// %add2 = %add1 + %v1
2340 /// %add3 = %add2 + %v2
2341 /// %add4 = %add3 + %v3
2342 ///
2343 /// As a simple estimate we can say the cost of such a reduction is 4 times
2344 /// the cost of a scalar FP addition. We can only estimate the costs for
2345 /// fixed-width vectors here because for scalable vectors we do not know the
2346 /// runtime number of operations.
2347 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2348 TTI::TargetCostKind CostKind) {
2349 // Targets must implement a default value for the scalable case, since
2350 // we don't know how many lanes the vector has.
2351 if (isa<ScalableVectorType>(Ty))
2352 return InstructionCost::getInvalid();
2353
2354 auto *VTy = cast<FixedVectorType>(Ty);
2355 InstructionCost ExtractCost = getScalarizationOverhead(
2356 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2357 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2358 Opcode, VTy->getElementType(), CostKind);
2359 ArithCost *= VTy->getNumElements();
2360
2361 return ExtractCost + ArithCost;
2362 }
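// Worked example: an in-order fadd reduction of <4 x float> is estimated as
// the cost of extracting all four lanes plus four scalar fadds, mirroring
// the chained sequence shown in the comment above.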
2363
2364 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2365 std::optional<FastMathFlags> FMF,
2366 TTI::TargetCostKind CostKind) {
2367 assert(Ty && "Unknown reduction vector type");
2368 if (TTI::requiresOrderedReduction(FMF))
2369 return getOrderedReductionCost(Opcode, Ty, CostKind);
2370 return getTreeReductionCost(Opcode, Ty, CostKind);
2371 }
2372
2373 /// Try to calculate op costs for min/max reduction operations.
2374 /// \param CondTy Conditional type for the Select instruction.
2375 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2376 FastMathFlags FMF,
2377 TTI::TargetCostKind CostKind) {
2378 // Targets must implement a default value for the scalable case, since
2379 // we don't know how many lanes the vector has.
2380 if (isa<ScalableVectorType>(Ty))
2381 return InstructionCost::getInvalid();
2382
2383 Type *ScalarTy = Ty->getElementType();
2384 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2385 unsigned NumReduxLevels = Log2_32(NumVecElts);
2386 InstructionCost MinMaxCost = 0;
2387 InstructionCost ShuffleCost = 0;
2388 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2389 unsigned LongVectorCount = 0;
2390 unsigned MVTLen =
2391 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2392 while (NumVecElts > MVTLen) {
2393 NumVecElts /= 2;
2394 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2395
2396 ShuffleCost +=
2397 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2398 CostKind, NumVecElts, SubTy);
2399
2400 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2401 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2402 Ty = SubTy;
2403 ++LongVectorCount;
2404 }
2405
2406 NumReduxLevels -= LongVectorCount;
2407
2408 // The minimal length of the vector is limited by the real length of vector
2409 // operations performed on the current platform. That's why several final
2410 // reduction operations are performed on the vectors with the same
2411 // architecture-dependent length.
2412 ShuffleCost +=
2413 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2414 std::nullopt, CostKind, 0, Ty);
2415 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2416 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2417 // The last min/max should be in vector registers and we counted it above.
2418 // So just need a single extractelement.
2419 return ShuffleCost + MinMaxCost +
2420 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2421 CostKind, 0, nullptr, nullptr);
2422 }
2423
2424 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2425 Type *ResTy, VectorType *Ty,
2426 FastMathFlags FMF,
2427 TTI::TargetCostKind CostKind) {
2428 // Without any native support, this is equivalent to the cost of
2429 // vecreduce.opcode(ext(Ty A)).
2430 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2431 InstructionCost RedCost =
2432 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2433 InstructionCost ExtCost = thisT()->getCastInstrCost(
2434 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2435 TTI::CastContextHint::None, CostKind);
2436
2437 return RedCost + ExtCost;
2438 }
2439
2440 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2441 VectorType *Ty,
2442 TTI::TargetCostKind CostKind) {
2443 // Without any native support, this is equivalent to the cost of
2444 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2445 // vecreduce.add(mul(A, B)).
2446 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2447 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2448 Instruction::Add, ExtTy, std::nullopt, CostKind);
2449 InstructionCost ExtCost = thisT()->getCastInstrCost(
2450 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2451 TTI::CastContextHint::None, CostKind);
2452
2453 InstructionCost MulCost =
2454 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2455
2456 return RedCost + MulCost + 2 * ExtCost;
2457 }
2458
2459 InstructionCost getVectorSplitCost() { return 1; }
2460
2461 /// @}
2462};
2463
2464/// Concrete BasicTTIImpl that can be used if no further customization
2465/// is needed.
2466class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2467 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2468
2469 friend class BasicTTIImplBase<BasicTTIImpl>;
2470
2471 const TargetSubtargetInfo *ST;
2472 const TargetLoweringBase *TLI;
2473
2474 const TargetSubtargetInfo *getST() const { return ST; }
2475 const TargetLoweringBase *getTLI() const { return TLI; }
2476
2477public:
2478 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2479};
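// Usage sketch: an out-of-tree target would mirror the concrete BasicTTIImpl
// above, deriving via CRTP and supplying getST()/getTLI() (the MyTarget names
// below are hypothetical):
//
//   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
//     friend class BasicTTIImplBase<MyTargetTTIImpl>;
//     const TargetSubtargetInfo *ST;
//     const TargetLoweringBase *TLI;
//     const TargetSubtargetInfo *getST() const { return ST; }
//     const TargetLoweringBase *getTLI() const { return TLI; }
//   public:
//     explicit MyTargetTTIImpl(const TargetMachine *TM, const Function &F);
//   };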
2480
2481} // end namespace llvm
2482
2483#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1302
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1173
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1433
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1102
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
an instruction to allocate memory on the stack
Definition: Instructions.h:58
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:79
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:412
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:285
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:687
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:389
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:547
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:854
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:716
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:721
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:402
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:627
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:856
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:634
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:707
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:330
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:406
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:699
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:720
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:934
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:417
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:493
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:541
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:978
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:428
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:365
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:377
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:649
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:691
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:279
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:371
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:307
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:773
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:727
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:422
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:525
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:667
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:283
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:334
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:639
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:271
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:345
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:757
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:673
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:703
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:321
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:619
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:298
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:529
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:820
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:518
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:317
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:539
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:802
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:722
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:643
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:289
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:326
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:293
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:683
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:303
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:281
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:545
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:656
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:311
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:262
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:540
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:385
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1190
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1058
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:711
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:734
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:738
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:736
@ ICMP_EQ
equal
Definition: InstrTypes.h:732
@ ICMP_NE
not equal
Definition: InstrTypes.h:733
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:721
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:306
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:536
unsigned getNumElements() const
Definition: DerivedTypes.h:579
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:693
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:849
The core instruction combiner logic.
Definition: InstCombiner.h:46
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:279
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isSelectMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isReverseMask(ArrayRef< int > Mask)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isZeroEltSplatMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isSpliceMask(ArrayRef< int > Mask, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isTransposeMask(ArrayRef< int > Mask)
Return true if this shuffle mask is a transpose mask.
size_type size() const
Definition: SmallPtrSet.h:93
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:514
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
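A minimal sketch, assuming an existing TargetTransformInfo instance and the defaulted mask, cost-kind, and subvector parameters of getShuffleCost, that prices two of the shuffle kinds listed above; the helper name shuffleCosts is illustrative.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

static InstructionCost shuffleCosts(const TargetTransformInfo &TTI,
                                    FixedVectorType *VTy) {
  // Splat lane 0 across the whole vector.
  InstructionCost Broadcast =
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy);
  // Reverse the lane order of the same vector.
  InstructionCost Reverse =
      TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VTy);
  return Broadcast + Reverse;
}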
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
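A minimal sketch, assuming an existing TargetTransformInfo instance and LLVMContext, showing how the cast-context hint changes a getCastInstrCost query: the same zext is priced once as a free-standing cast (None) and once as a cast folded into a normal load (Normal); the helper name castCosts is illustrative.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static void castCosts(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *Src = Type::getInt8Ty(Ctx);
  Type *Dst = Type::getInt32Ty(Ctx);

  InstructionCost Standalone = TTI.getCastInstrCost(
      Instruction::ZExt, Dst, Src, TargetTransformInfo::CastContextHint::None,
      TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost FoldedIntoLoad = TTI.getCastInstrCost(
      Instruction::ZExt, Dst, Src,
      TargetTransformInfo::CastContextHint::Normal,
      TargetTransformInfo::TCK_RecipThroughput);
  (void)Standalone;
  (void)FoldedIntoLoad;
}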
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:355
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1463
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, or DriverKit).
Definition: Triple.h:518
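A minimal sketch of the Triple accessors listed above, assuming the LLVM 17+ header layout (llvm/TargetParser/Triple.h); the triple string is illustrative.

#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"

using namespace llvm;

int main() {
  Triple T("arm64-apple-macosx13.0");
  if (T.isOSDarwin() && T.isArch64Bit())
    outs() << "64-bit Darwin target, arch = "
           << Triple::getArchTypeName(T.getArch()) << "\n";
  return 0;
}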
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitWidth, whilst keeping the old number of lanes.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isFPOrFPVectorTy() const
Return true if this is an FP type or a vector of FP types.
Definition: Type.h:216
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
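A minimal sketch, assuming an existing LLVMContext, that exercises the Type queries listed above (isVectorTy, isIntOrIntVectorTy, getScalarType, getScalarSizeInBits, getWithNewBitWidth); the helper name inspectType is illustrative.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

using namespace llvm;

static void inspectType(LLVMContext &Ctx) {
  // <4 x i32>
  Type *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  assert(VecTy->isVectorTy() && VecTy->isIntOrIntVectorTy());

  Type *Scalar = VecTy->getScalarType();        // i32
  unsigned Bits = VecTy->getScalarSizeInBits(); // 32

  // Same lane count, lanes widened to i64: <4 x i64>.
  Type *Widened = VecTy->getWithNewBitWidth(Bits * 2);
  (void)Scalar;
  (void)Widened;
}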
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:400
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same element type.
Definition: DerivedTypes.h:504
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:677
Type * getElementType() const
Definition: DerivedTypes.h:433
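A minimal sketch of the VectorType helpers listed above, assuming an existing LLVMContext; the helper name halveVector is illustrative.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

static void halveVector(LLVMContext &Ctx) {
  // <8 x float>, built through the primary VectorType::get entry point.
  VectorType *V8 =
      VectorType::get(Type::getFloatTy(Ctx), ElementCount::getFixed(8));

  // <4 x float>: half the lanes, same element type.
  VectorType *V4 = VectorType::getHalfElementsVectorType(V8);
  assert(V4->getElementType() == V8->getElementType());
  (void)V4;
}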
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:198
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
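A minimal sketch of the TypeSize helpers listed above (getFixed, isScalable, isKnownLT); the 128-bit limit and the helper name fitsInOneRegister are illustrative.

#include "llvm/Support/TypeSize.h"

using namespace llvm;

static bool fitsInOneRegister(TypeSize Size) {
  // A fixed 128-bit quantity used as the comparison anchor.
  TypeSize Limit = TypeSize::getFixed(128);

  // Scalable sizes are scaled by vscale at run time, so a compile-time
  // "known less than" answer is not generally available for them.
  if (Size.isScalable())
    return false;
  return TypeSize::isKnownLT(Size, Limit);
}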
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2986
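A minimal sketch of APIntOps::ScaleBitMask widening a demanded-elements style mask; the expected result in the comment assumes the splat-when-widening behavior described above.

#include "llvm/ADT/APInt.h"

using namespace llvm;

static void scaleMask() {
  APInt Narrow(4, 0b1010); // lanes 1 and 3 demanded
  // Widening 4 -> 8 bits splats each source bit over two result bits,
  // which should give 0b11001100 (bits 2,3 and 6,7 set).
  APInt Wide = APIntOps::ScaleBitMask(Narrow, 8);
  (void)Wide;
}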
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1047
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1051
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:966
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result.
Definition: ISDOpcodes.h:736
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:979
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1452
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:414
AddressSpace
Definition: NVPTXBaseInfo.h:21
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
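A minimal sketch of the free-standing helpers listed above (divideCeil, any_of, Log2_32, isPowerOf2_32) applied to plain values; the helper name mathHelpers is illustrative.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <vector>

using namespace llvm;

static void mathHelpers() {
  uint64_t Chunks = divideCeil(100, 32); // ceil(100 / 32) == 4
  unsigned Shift = Log2_32(64);          // floor(log2(64)) == 6
  bool Pow2 = isPowerOf2_32(64);         // true

  std::vector<int> Widths = {8, 16, 24};
  // Range-based wrapper around std::any_of; false here, since every width
  // is a multiple of 8.
  bool HasOdd = any_of(Widths, [](int W) { return W % 8 != 0; });
  (void)Chunks;
  (void)Shift;
  (void)Pow2;
  (void)HasOdd;
}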
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:184
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:616
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:299
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
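A minimal sketch, assuming an existing LLVMContext, of the EVT helpers listed above: an i32 round-trips through a simple MVT, while an i48 stays an extended (non-simple) EVT; the helper name classifyEVT is illustrative.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static void classifyEVT(LLVMContext &Ctx) {
  // i32 maps to the simple value type MVT::i32.
  EVT IntVT = EVT::getEVT(Type::getInt32Ty(Ctx));
  if (IntVT.isSimple()) {
    MVT Simple = IntVT.getSimpleVT(); // MVT::i32
    (void)Simple;
  }

  // i48 has no MVT equivalent, so it remains an extended EVT.
  EVT OddVT = EVT::getIntegerVT(Ctx, 48);
  bool IsSimple = OddVT.isSimple(); // false
  (void)IsSimple;
}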
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null, there is no BaseGV.
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on the profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before the loop body).
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
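A minimal sketch, assuming a target-specific TTI implementation, of how the unrolling and peeling knobs listed above are typically filled in from getUnrollingPreferences / getPeelingPreferences overrides; the helper name tunePreferences and the threshold values are illustrative, not tuned defaults.

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

static void tunePreferences(TargetTransformInfo::UnrollingPreferences &UP,
                            TargetTransformInfo::PeelingPreferences &PP) {
  // Permit partial and runtime unrolling, but stay conservative when
  // optimizing for size.
  UP.Partial = true;
  UP.Runtime = true;
  UP.UpperBound = true;
  UP.PartialThreshold = 200; // illustrative value
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;

  // Allow profile-driven peeling, but do not force a peel count.
  PP.AllowPeeling = true;
  PP.AllowLoopNestsPeeling = false;
  PP.PeelProfiledIterations = true;
  PP.PeelCount = 0;
}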