BasicTTIImpl.h
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/IR/Type.h"
43#include "llvm/IR/Value.h"
51#include <algorithm>
52#include <cassert>
53#include <cstdint>
54#include <limits>
55#include <optional>
56#include <utility>
57
58namespace llvm {
59
60class Function;
61class GlobalValue;
62class LLVMContext;
63class ScalarEvolution;
64class SCEV;
65class TargetMachine;
66
67extern cl::opt<unsigned> PartialUnrollingThreshold;
68
69/// Base class which can be used to help build a TTI implementation.
70///
71/// This class provides as much implementation of the TTI interface as is
72/// possible using the target independent parts of the code generator.
73///
74/// In order to subclass it, your class must implement a getST() method to
75/// return the subtarget, and a getTLI() method to return the target lowering.
76/// We need these methods implemented in the derived class so that this class
77/// doesn't have to duplicate storage for them.
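78///
79/// A minimal illustrative sketch of such a derived class (the MyTarget*
80/// names below are assumptions for the example, not real LLVM classes):
81/// \code
82///   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
83///     using BaseT = BasicTTIImplBase<MyTargetTTIImpl>;
84///     const MyTargetSubtarget *ST;
85///     const MyTargetLowering *TLI;
86///
87///   public:
88///     explicit MyTargetTTIImpl(const MyTargetMachine *TM, const Function &F)
89///         : BaseT(TM, F.getParent()->getDataLayout()),
90///           ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
91///
92///     const MyTargetSubtarget *getST() const { return ST; }
93///     const MyTargetLowering *getTLI() const { return TLI; }
94///   };
95/// \endcode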
78template <typename T>
79class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
80private:
81 using BaseT = TargetTransformInfoImplCRTPBase<T>;
82 using TTI = TargetTransformInfo;
83
84 /// Helper function to access this as a T.
85 T *thisT() { return static_cast<T *>(this); }
86
87 /// Estimate a cost of Broadcast as an extract and sequence of insert
88 /// operations.
89 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
90 TTI::TargetCostKind CostKind) {
91 InstructionCost Cost = 0;
92 // Broadcast cost is equal to the cost of extracting the zero'th element
93 // plus the cost of inserting it into every element of the result vector.
94 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
95 CostKind, 0, nullptr, nullptr);
96
97 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
98 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
99 CostKind, i, nullptr, nullptr);
100 }
101 return Cost;
102 }
103
104 /// Estimate a cost of shuffle as a sequence of extract and insert
105 /// operations.
106 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
107 TTI::TargetCostKind CostKind) {
108 InstructionCost Cost = 0;
109 // Shuffle cost is equal to the cost of extracting elements from its
110 // arguments plus the cost of inserting them into the result vector.
111
112 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
113 // index 0 of first vector, index 1 of second vector, index 2 of first
114 // vector and finally index 3 of second vector and insert them at index
115 // <0,1,2,3> of result vector.
116 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
117 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
118 CostKind, i, nullptr, nullptr);
119 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 }
122 return Cost;
123 }
124
125 /// Estimate a cost of subvector extraction as a sequence of extract and
126 /// insert operations.
127 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
128 TTI::TargetCostKind CostKind,
129 int Index,
130 FixedVectorType *SubVTy) {
131 assert(VTy && SubVTy &&
132 "Can only extract subvectors from vectors");
133 int NumSubElts = SubVTy->getNumElements();
134 assert((!isa<FixedVectorType>(VTy) ||
135 (Index + NumSubElts) <=
136 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
137 "SK_ExtractSubvector index out of range");
138
139 InstructionCost Cost = 0;
140 // Subvector extraction cost is equal to the cost of extracting elements
141 // from the source type plus the cost of inserting them into the result
142 // vector type.
143 for (int i = 0; i != NumSubElts; ++i) {
144 Cost +=
145 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
146 CostKind, i + Index, nullptr, nullptr);
147 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
148 CostKind, i, nullptr, nullptr);
149 }
150 return Cost;
151 }
152
153 /// Estimate a cost of subvector insertion as a sequence of extract and
154 /// insert operations.
155 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
156 TTI::TargetCostKind CostKind,
157 int Index,
158 FixedVectorType *SubVTy) {
159 assert(VTy && SubVTy &&
160 "Can only insert subvectors into vectors");
161 int NumSubElts = SubVTy->getNumElements();
162 assert((!isa<FixedVectorType>(VTy) ||
163 (Index + NumSubElts) <=
164 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
165 "SK_InsertSubvector index out of range");
166
167 InstructionCost Cost = 0;
168 // Subvector insertion cost is equal to the cost of extracting elements
169 // from the source type plus the cost of inserting them into the result
170 // vector type.
171 for (int i = 0; i != NumSubElts; ++i) {
172 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
173 CostKind, i, nullptr, nullptr);
174 Cost +=
175 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
176 i + Index, nullptr, nullptr);
177 }
178 return Cost;
179 }
180
181 /// Local query method delegates up to T which *must* implement this!
182 const TargetSubtargetInfo *getST() const {
183 return static_cast<const T *>(this)->getST();
184 }
185
186 /// Local query method delegates up to T which *must* implement this!
187 const TargetLoweringBase *getTLI() const {
188 return static_cast<const T *>(this)->getTLI();
189 }
190
191 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
192 switch (M) {
194 return ISD::UNINDEXED;
195 case TTI::MIM_PreInc:
196 return ISD::PRE_INC;
197 case TTI::MIM_PreDec:
198 return ISD::PRE_DEC;
199 case TTI::MIM_PostInc:
200 return ISD::POST_INC;
201 case TTI::MIM_PostDec:
202 return ISD::POST_DEC;
203 }
204 llvm_unreachable("Unexpected MemIndexedMode");
205 }
206
207 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
208 Align Alignment,
209 bool VariableMask,
210 bool IsGatherScatter,
212 // We cannot scalarize scalable vectors, so return Invalid.
213 if (isa<ScalableVectorType>(DataTy))
214 return InstructionCost::getInvalid();
215
216 auto *VT = cast<FixedVectorType>(DataTy);
217 // Assume the target does not have support for gather/scatter operations
218 // and provide a rough estimate.
219 //
220 // First, compute the cost of the individual memory operations.
221 InstructionCost AddrExtractCost =
222 IsGatherScatter
223 ? getVectorInstrCost(Instruction::ExtractElement,
225 PointerType::get(VT->getElementType(), 0),
226 VT->getNumElements()),
227 CostKind, -1, nullptr, nullptr)
228 : 0;
229 InstructionCost LoadCost =
230 VT->getNumElements() *
231 (AddrExtractCost +
232 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233
234 // Next, compute the cost of packing the result in a vector.
235 InstructionCost PackingCost =
236 getScalarizationOverhead(VT, Opcode != Instruction::Store,
237 Opcode == Instruction::Store, CostKind);
238
239 InstructionCost ConditionalCost = 0;
240 if (VariableMask) {
241 // Compute the cost of conditionally executing the memory operations with
242 // variable masks. This includes extracting the individual conditions,
243 // branches and PHIs to combine the results.
244 // NOTE: Estimating the cost of conditionally executing the memory
245 // operations accurately is quite difficult and the current solution
246 // provides a very rough estimate only.
247 ConditionalCost =
248 VT->getNumElements() *
250 Instruction::ExtractElement,
252 VT->getNumElements()),
253 CostKind, -1, nullptr, nullptr) +
254 getCFInstrCost(Instruction::Br, CostKind) +
255 getCFInstrCost(Instruction::PHI, CostKind));
256 }
257
258 return LoadCost + PackingCost + ConditionalCost;
259 }
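 // A worked sketch of the decomposition above (numbers assumed for
 // illustration, not taken from any real target): for a <4 x i32> gather
 // with a variable mask, the estimate is
 //   LoadCost        = 4 * (address extract + scalar load)
 //   PackingCost     = scalarization overhead of inserting the 4 results
 //   ConditionalCost = 4 * (mask-bit extract + br + phi)
 // and the function returns LoadCost + PackingCost + ConditionalCost.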
260
261protected:
262 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
263 : BaseT(DL) {}
264 virtual ~BasicTTIImplBase() = default;
265
266 using TargetTransformInfoImplBase::DL;
267
268public:
269 /// \name Scalar TTI Implementations
270 /// @{
272 unsigned AddressSpace, Align Alignment,
273 unsigned *Fast) const {
275 return getTLI()->allowsMisalignedMemoryAccesses(
277 }
278
279 bool hasBranchDivergence() { return false; }
280
281 bool useGPUDivergenceAnalysis() { return false; }
282
283 bool isSourceOfDivergence(const Value *V) { return false; }
284
285 bool isAlwaysUniform(const Value *V) { return false; }
286
288 // Return an invalid address space.
289 return -1;
290 }
291
293 Intrinsic::ID IID) const {
294 return false;
295 }
296
297 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
298 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
299 }
300
301 unsigned getAssumedAddrSpace(const Value *V) const {
302 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
303 }
304
305 bool isSingleThreaded() const {
306 return getTLI()->getTargetMachine().Options.ThreadModel ==
308 }
309
310 std::pair<const Value *, unsigned>
312 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
313 }
314
316 Value *NewV) const {
317 return nullptr;
318 }
319
320 bool isLegalAddImmediate(int64_t imm) {
321 return getTLI()->isLegalAddImmediate(imm);
322 }
323
324 bool isLegalICmpImmediate(int64_t imm) {
325 return getTLI()->isLegalICmpImmediate(imm);
326 }
327
328 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
329 bool HasBaseReg, int64_t Scale,
330 unsigned AddrSpace, Instruction *I = nullptr) {
332 AM.BaseGV = BaseGV;
333 AM.BaseOffs = BaseOffset;
334 AM.HasBaseReg = HasBaseReg;
335 AM.Scale = Scale;
336 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
337 }
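 // As an illustration (a sketch, not tied to any particular target): an
 // access of the form BaseReg + 16 + 4*IndexReg reaches the target hook
 // above roughly as
 //   AM.BaseGV = nullptr; AM.BaseOffs = 16; AM.HasBaseReg = true; AM.Scale = 4;
 // and a target without scaled addressing would reject it in its
 // TargetLowering::isLegalAddressingMode override.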
338
339 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
340 Type *ScalarValTy) const {
341 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
342 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
343 EVT VT = getTLI()->getValueType(DL, SrcTy);
344 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
345 getTLI()->isOperationCustom(ISD::STORE, VT))
346 return true;
347
348 EVT ValVT =
349 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
350 EVT LegalizedVT =
351 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
352 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
353 };
354 while (VF > 2 && IsSupportedByTarget(VF))
355 VF /= 2;
356 return VF;
357 }
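 // Example of the halving loop above (legality assumed for illustration
 // only): starting from VF = 8 with i32 elements, if <4 x i32> stores are
 // legal the loop halves VF to 4; if <2 x i32> stores (or the truncating
 // form that would be required) are not legal, the loop stops and 4 is
 // returned as the minimum profitable store VF.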
358
360 const DataLayout &DL) const {
361 EVT VT = getTLI()->getValueType(DL, Ty);
362 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
363 }
364
366 const DataLayout &DL) const {
367 EVT VT = getTLI()->getValueType(DL, Ty);
368 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
369 }
370
373 }
374
377 }
378
381 }
382
384 int64_t BaseOffset, bool HasBaseReg,
385 int64_t Scale, unsigned AddrSpace) {
387 AM.BaseGV = BaseGV;
388 AM.BaseOffs = BaseOffset;
389 AM.HasBaseReg = HasBaseReg;
390 AM.Scale = Scale;
391 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
392 return 0;
393 return -1;
394 }
395
396 bool isTruncateFree(Type *Ty1, Type *Ty2) {
397 return getTLI()->isTruncateFree(Ty1, Ty2);
398 }
399
401 return getTLI()->isProfitableToHoist(I);
402 }
403
404 bool useAA() const { return getST()->useAA(); }
405
406 bool isTypeLegal(Type *Ty) {
407 EVT VT = getTLI()->getValueType(DL, Ty);
408 return getTLI()->isTypeLegal(VT);
409 }
410
411 unsigned getRegUsageForType(Type *Ty) {
412 EVT ETy = getTLI()->getValueType(DL, Ty);
413 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
414 }
415
419 return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
420 }
421
423 unsigned &JumpTableSize,
425 BlockFrequencyInfo *BFI) {
426 /// Try to find the estimated number of clusters. Note that the number of
427 /// clusters identified in this function could be different from the actual
428 /// numbers found in lowering. This function ignores switches that are
429 /// lowered with a mix of jump table / bit test / BTree. This function was
430 /// initially intended to be used when estimating the cost of a switch in
431 /// the inline cost heuristic, but it's a generic cost model to be used in other
432 /// places (e.g., in loop unrolling).
433 unsigned N = SI.getNumCases();
434 const TargetLoweringBase *TLI = getTLI();
435 const DataLayout &DL = this->getDataLayout();
436
437 JumpTableSize = 0;
438 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
439
440 // Early exit if both a jump table and bit test are not allowed.
441 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
442 return N;
443
444 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
445 APInt MinCaseVal = MaxCaseVal;
446 for (auto CI : SI.cases()) {
447 const APInt &CaseVal = CI.getCaseValue()->getValue();
448 if (CaseVal.sgt(MaxCaseVal))
449 MaxCaseVal = CaseVal;
450 if (CaseVal.slt(MinCaseVal))
451 MinCaseVal = CaseVal;
452 }
453
454 // Check if suitable for a bit test
455 if (N <= DL.getIndexSizeInBits(0u)) {
457 for (auto I : SI.cases())
458 Dests.insert(I.getCaseSuccessor());
459
460 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
461 DL))
462 return 1;
463 }
464
465 // Check if suitable for a jump table.
466 if (IsJTAllowed) {
467 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
468 return N;
469 uint64_t Range =
470 (MaxCaseVal - MinCaseVal)
471 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
472 // Check whether a range of clusters is dense enough for a jump table
473 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
474 JumpTableSize = Range;
475 return 1;
476 }
477 }
478 return N;
479 }
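 // Worked example of the clustering estimate above (default target
 // parameters assumed): a switch over cases {0, 1, ..., 9} with distinct
 // successors has Range == N == 10, which is dense, so when jump tables are
 // allowed this returns 1 cluster with JumpTableSize == 10. A switch over
 // {0, 1000, 2000000} suits neither bit tests nor a jump table and simply
 // returns N == 3.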
480
482 const TargetLoweringBase *TLI = getTLI();
485 }
486
488 const TargetMachine &TM = getTLI()->getTargetMachine();
489 // If non-PIC mode, do not generate a relative lookup table.
490 if (!TM.isPositionIndependent())
491 return false;
492
493 /// Relative lookup table entries consist of 32-bit offsets.
494 /// Do not generate relative lookup tables for large code models
495 /// in 64-bit architectures where 32-bit offsets might not be enough.
496 if (TM.getCodeModel() == CodeModel::Medium ||
497 TM.getCodeModel() == CodeModel::Large)
498 return false;
499
500 Triple TargetTriple = TM.getTargetTriple();
501 if (!TargetTriple.isArch64Bit())
502 return false;
503
504 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
505 // there.
506 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
507 return false;
508
509 return true;
510 }
511
512 bool haveFastSqrt(Type *Ty) {
513 const TargetLoweringBase *TLI = getTLI();
514 EVT VT = TLI->getValueType(DL, Ty);
515 return TLI->isTypeLegal(VT) &&
517 }
518
520 return true;
521 }
522
524 // Check whether FADD is available, as a proxy for floating-point in
525 // general.
526 const TargetLoweringBase *TLI = getTLI();
527 EVT VT = TLI->getValueType(DL, Ty);
531 }
532
533 unsigned getInliningThresholdMultiplier() { return 1; }
534 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
535
536 int getInlinerVectorBonusPercent() { return 150; }
537
541 // This unrolling functionality is target independent, but to provide some
542 // motivation for its intended use, for x86:
543
544 // According to the Intel 64 and IA-32 Architectures Optimization Reference
545 // Manual, Intel Core models and later have a loop stream detector (and
546 // associated uop queue) that can benefit from partial unrolling.
547 // The relevant requirements are:
548 // - The loop must have no more than 4 (8 for Nehalem and later) branches
549 // taken, and none of them may be calls.
550 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
551
552 // According to the Software Optimization Guide for AMD Family 15h
553 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
554 // and loop buffer which can benefit from partial unrolling.
555 // The relevant requirements are:
556 // - The loop must have fewer than 16 branches
557 // - The loop must have less than 40 uops in all executed loop branches
558
559 // The number of taken branches in a loop is hard to estimate here, and
560 // benchmarking has revealed that it is better not to be conservative when
561 // estimating the branch count. As a result, we'll ignore the branch limits
562 // until someone finds a case where it matters in practice.
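 // For example (illustrative numbers): on a subtarget reporting
 // LoopMicroOpBufferSize == 28, and with PartialUnrollingThreshold not set
 // on the command line, MaxOps becomes 28 and, provided the loop body
 // contains no calls, UP.PartialThreshold is set to 28 below.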
563
564 unsigned MaxOps;
565 const TargetSubtargetInfo *ST = getST();
566 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
568 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
569 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
570 else
571 return;
572
573 // Scan the loop: don't unroll loops with calls.
574 for (BasicBlock *BB : L->blocks()) {
575 for (Instruction &I : *BB) {
576 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
577 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
578 if (!thisT()->isLoweredToCall(F))
579 continue;
580 }
581
582 if (ORE) {
583 ORE->emit([&]() {
584 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
585 L->getHeader())
586 << "advising against unrolling the loop because it "
587 "contains a "
588 << ore::NV("Call", &I);
589 });
590 }
591 return;
592 }
593 }
594 }
595
596 // Enable runtime and partial unrolling up to the specified size.
597 // Enable using trip count upper bound to unroll loops.
598 UP.Partial = UP.Runtime = UP.UpperBound = true;
599 UP.PartialThreshold = MaxOps;
600
601 // Avoid unrolling when optimizing for size.
602 UP.OptSizeThreshold = 0;
604
605 // Set number of instructions optimized when "back edge"
606 // becomes "fall through" to default value of 2.
607 UP.BEInsns = 2;
608 }
609
612 PP.PeelCount = 0;
613 PP.AllowPeeling = true;
614 PP.AllowLoopNestsPeeling = false;
615 PP.PeelProfiledIterations = true;
616 }
617
619 AssumptionCache &AC,
620 TargetLibraryInfo *LibInfo,
621 HardwareLoopInfo &HWLoopInfo) {
622 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
623 }
624
627 DominatorTree *DT,
630 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
631 }
632
634 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
635 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
636 }
637
638 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
639 IntrinsicInst &II) {
640 return BaseT::instCombineIntrinsic(IC, II);
641 }
642
643 std::optional<Value *>
645 APInt DemandedMask, KnownBits &Known,
646 bool &KnownBitsComputed) {
647 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
648 KnownBitsComputed);
649 }
650
652 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
653 APInt &UndefElts2, APInt &UndefElts3,
654 std::function<void(Instruction *, unsigned, APInt, APInt &)>
655 SimplifyAndSetOp) {
657 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
658 SimplifyAndSetOp);
659 }
660
661 virtual std::optional<unsigned>
663 return std::optional<unsigned>(
664 getST()->getCacheSize(static_cast<unsigned>(Level)));
665 }
666
667 virtual std::optional<unsigned>
669 std::optional<unsigned> TargetResult =
670 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
671
672 if (TargetResult)
673 return TargetResult;
674
675 return BaseT::getCacheAssociativity(Level);
676 }
677
678 virtual unsigned getCacheLineSize() const {
679 return getST()->getCacheLineSize();
680 }
681
682 virtual unsigned getPrefetchDistance() const {
683 return getST()->getPrefetchDistance();
684 }
685
686 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
687 unsigned NumStridedMemAccesses,
688 unsigned NumPrefetches,
689 bool HasCall) const {
690 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
691 NumPrefetches, HasCall);
692 }
693
694 virtual unsigned getMaxPrefetchIterationsAhead() const {
695 return getST()->getMaxPrefetchIterationsAhead();
696 }
697
698 virtual bool enableWritePrefetching() const {
699 return getST()->enableWritePrefetching();
700 }
701
702 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
703 return getST()->shouldPrefetchAddressSpace(AS);
704 }
705
706 /// @}
707
708 /// \name Vector TTI Implementations
709 /// @{
710
712 return TypeSize::getFixed(32);
713 }
714
715 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
716 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
717
718 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
719 /// are set if the demanded result elements need to be inserted and/or
720 /// extracted from vectors.
722 const APInt &DemandedElts,
723 bool Insert, bool Extract,
725 /// FIXME: a bitfield is not a reasonable abstraction for talking about
726 /// which elements are needed from a scalable vector
727 if (isa<ScalableVectorType>(InTy))
728 return InstructionCost::getInvalid();
729 auto *Ty = cast<FixedVectorType>(InTy);
730
731 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
732 "Vector size mismatch");
733
734 InstructionCost Cost = 0;
735
736 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
737 if (!DemandedElts[i])
738 continue;
739 if (Insert)
740 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
741 CostKind, i, nullptr, nullptr);
742 if (Extract)
743 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
744 CostKind, i, nullptr, nullptr);
745 }
746
747 return Cost;
748 }
749
750 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
752 bool Extract,
754 if (isa<ScalableVectorType>(InTy))
755 return InstructionCost::getInvalid();
756 auto *Ty = cast<FixedVectorType>(InTy);
757
758 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
759 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
760 CostKind);
761 }
762
763 /// Estimate the overhead of scalarizing an instruction's unique
764 /// non-constant operands. The (potentially vector) types to use for each
765 /// argument are passed via Tys.
770 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
771
772 InstructionCost Cost = 0;
773 SmallPtrSet<const Value*, 4> UniqueOperands;
774 for (int I = 0, E = Args.size(); I != E; I++) {
775 // Disregard things like metadata arguments.
776 const Value *A = Args[I];
777 Type *Ty = Tys[I];
778 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
779 !Ty->isPtrOrPtrVectorTy())
780 continue;
781
782 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
783 if (auto *VecTy = dyn_cast<VectorType>(Ty))
784 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
785 /*Extract*/ true, CostKind);
786 }
787 }
788
789 return Cost;
790 }
791
792 /// Estimate the overhead of scalarizing the inputs and outputs of an
793 /// instruction, with return type RetTy and arguments Args of type Tys. If
794 /// Args are unknown (empty), then the cost associated with one argument is
795 /// added as a heuristic.
801 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
802 if (!Args.empty())
804 else
805 // When no information on arguments is provided, we add the cost
806 // associated with one argument as a heuristic.
807 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
808 /*Extract*/ true, CostKind);
809
810 return Cost;
811 }
812
813 /// Estimate the cost of type-legalization and the legalized type.
814 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
815 LLVMContext &C = Ty->getContext();
816 EVT MTy = getTLI()->getValueType(DL, Ty);
817
818 InstructionCost Cost = 1;
819 // We keep legalizing the type until we find a legal kind. We assume that
820 // the only operation that costs anything is the split. After splitting
821 // we need to handle two types.
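 // Worked example (target-dependent, for illustration only): on a target
 // whose widest legal vector is 128 bits, <16 x i32> is split twice
 // (<16 x i32> -> 2 x <8 x i32> -> 4 x <4 x i32>), so the loop below doubles
 // the cost twice and returns {4, v4i32}.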
822 while (true) {
824
826 // Ensure we return a sensible simple VT here, since many callers of
827 // this function require it.
828 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
829 return std::make_pair(InstructionCost::getInvalid(), VT);
830 }
831
832 if (LK.first == TargetLoweringBase::TypeLegal)
833 return std::make_pair(Cost, MTy.getSimpleVT());
834
835 if (LK.first == TargetLoweringBase::TypeSplitVector ||
837 Cost *= 2;
838
839 // Do not loop with f128 type.
840 if (MTy == LK.second)
841 return std::make_pair(Cost, MTy.getSimpleVT());
842
843 // Keep legalizing the type.
844 MTy = LK.second;
845 }
846 }
847
848 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
849
851 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
854 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
855 const Instruction *CxtI = nullptr) {
856 // Check if any of the operands are vector operands.
857 const TargetLoweringBase *TLI = getTLI();
858 int ISD = TLI->InstructionOpcodeToISD(Opcode);
859 assert(ISD && "Invalid opcode");
860
861 // TODO: Handle more cost kinds.
863 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
864 Opd1Info, Opd2Info,
865 Args, CxtI);
866
867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
868
869 bool IsFloat = Ty->isFPOrFPVectorTy();
870 // Assume that floating point arithmetic operations cost twice as much as
871 // integer operations.
872 InstructionCost OpCost = (IsFloat ? 2 : 1);
873
874 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
875 // The operation is legal. Assume it costs 1.
876 // TODO: Once we have extract/insert subvector cost we need to use them.
877 return LT.first * OpCost;
878 }
879
880 if (!TLI->isOperationExpand(ISD, LT.second)) {
881 // If the operation is custom lowered, then assume that the code is twice
882 // as expensive.
883 return LT.first * 2 * OpCost;
884 }
885
886 // An 'Expand' of URem and SRem is special because it may default
887 // to expanding the operation into a sequence of sub-operations
888 // i.e. X % Y -> X-(X/Y)*Y.
889 if (ISD == ISD::UREM || ISD == ISD::SREM) {
890 bool IsSigned = ISD == ISD::SREM;
891 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
892 LT.second) ||
893 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
894 LT.second)) {
895 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
896 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
897 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
898 InstructionCost MulCost =
899 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
900 InstructionCost SubCost =
901 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
902 return DivCost + MulCost + SubCost;
903 }
904 }
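 // E.g. (illustrative): a urem whose udiv is legal is therefore priced as
 // cost(udiv) + cost(mul) + cost(sub), matching the X - (X / Y) * Y
 // expansion described above.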
905
906 // We cannot scalarize scalable vectors, so return Invalid.
907 if (isa<ScalableVectorType>(Ty))
908 return InstructionCost::getInvalid();
909
910 // Else, assume that we need to scalarize this op.
911 // TODO: If one of the types get legalized by splitting, handle this
912 // similarly to what getCastInstrCost() does.
913 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
914 InstructionCost Cost = thisT()->getArithmeticInstrCost(
915 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
916 Args, CxtI);
917 // Return the cost of multiple scalar invocations plus the cost of
918 // inserting and extracting the values.
919 SmallVector<Type *> Tys(Args.size(), Ty);
920 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
921 VTy->getNumElements() * Cost;
922 }
923
924 // We don't know anything about this scalar instruction.
925 return OpCost;
926 }
927
929 ArrayRef<int> Mask) const {
930 int Limit = Mask.size() * 2;
931 if (Mask.empty() ||
932 // Extra check required by isSingleSourceMaskImpl function (called by
933 // ShuffleVectorInst::isSingleSourceMask).
934 any_of(Mask, [Limit](int I) { return I >= Limit; }))
935 return Kind;
936 int Index;
937 switch (Kind) {
940 return TTI::SK_Reverse;
942 return TTI::SK_Broadcast;
943 break;
946 return TTI::SK_Select;
948 return TTI::SK_Transpose;
950 return TTI::SK_Splice;
951 break;
952 case TTI::SK_Select:
953 case TTI::SK_Reverse:
958 case TTI::SK_Splice:
959 break;
960 }
961 return Kind;
962 }
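 // For example, an SK_PermuteSingleSrc shuffle with mask <3, 2, 1, 0> is
 // re-classified above as the cheaper SK_Reverse, and an SK_PermuteTwoSrc
 // shuffle with mask <0, 5, 2, 7> (each lane picks from a fixed operand)
 // becomes SK_Select.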
963
965 ArrayRef<int> Mask,
967 VectorType *SubTp,
968 ArrayRef<const Value *> Args = std::nullopt) {
969
970 switch (improveShuffleKindFromMask(Kind, Mask)) {
972 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
973 return getBroadcastShuffleOverhead(FVT, CostKind);
975 case TTI::SK_Select:
976 case TTI::SK_Splice:
977 case TTI::SK_Reverse:
981 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
982 return getPermuteShuffleOverhead(FVT, CostKind);
985 return getExtractSubvectorOverhead(Tp, CostKind, Index,
986 cast<FixedVectorType>(SubTp));
988 return getInsertSubvectorOverhead(Tp, CostKind, Index,
989 cast<FixedVectorType>(SubTp));
990 }
991 llvm_unreachable("Unknown TTI::ShuffleKind");
992 }
993
994 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
997 const Instruction *I = nullptr) {
998 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
999 return 0;
1000
1001 const TargetLoweringBase *TLI = getTLI();
1002 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1003 assert(ISD && "Invalid opcode");
1004 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1005 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1006
1007 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1008 TypeSize DstSize = DstLT.second.getSizeInBits();
1009 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1010 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1011
1012 switch (Opcode) {
1013 default:
1014 break;
1015 case Instruction::Trunc:
1016 // Check for NOOP conversions.
1017 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1018 return 0;
1019 [[fallthrough]];
1020 case Instruction::BitCast:
1021 // Bitcast between types that are legalized to the same type are free and
1022 // assume int to/from ptr of the same size is also free.
1023 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1024 SrcSize == DstSize)
1025 return 0;
1026 break;
1027 case Instruction::FPExt:
1028 if (I && getTLI()->isExtFree(I))
1029 return 0;
1030 break;
1031 case Instruction::ZExt:
1032 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1033 return 0;
1034 [[fallthrough]];
1035 case Instruction::SExt:
1036 if (I && getTLI()->isExtFree(I))
1037 return 0;
1038
1039 // If this is a zext/sext of a load, return 0 if the corresponding
1040 // extending load exists on target and the result type is legal.
1041 if (CCH == TTI::CastContextHint::Normal) {
1042 EVT ExtVT = EVT::getEVT(Dst);
1043 EVT LoadVT = EVT::getEVT(Src);
1044 unsigned LType =
1045 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1046 if (DstLT.first == SrcLT.first &&
1047 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1048 return 0;
1049 }
1050 break;
1051 case Instruction::AddrSpaceCast:
1052 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1053 Dst->getPointerAddressSpace()))
1054 return 0;
1055 break;
1056 }
1057
1058 auto *SrcVTy = dyn_cast<VectorType>(Src);
1059 auto *DstVTy = dyn_cast<VectorType>(Dst);
1060
1061 // If the cast is marked as legal (or promote) then assume low cost.
1062 if (SrcLT.first == DstLT.first &&
1063 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1064 return SrcLT.first;
1065
1066 // Handle scalar conversions.
1067 if (!SrcVTy && !DstVTy) {
1068 // Just check the op cost. If the operation is legal then assume it costs
1069 // 1.
1070 if (!TLI->isOperationExpand(ISD, DstLT.second))
1071 return 1;
1072
1073 // Assume that illegal scalar instruction are expensive.
1074 return 4;
1075 }
1076
1077 // Check vector-to-vector casts.
1078 if (DstVTy && SrcVTy) {
1079 // If the cast is between same-sized registers, then the check is simple.
1080 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1081
1082 // Assume that Zext is done using AND.
1083 if (Opcode == Instruction::ZExt)
1084 return SrcLT.first;
1085
1086 // Assume that sext is done using SHL and SRA.
1087 if (Opcode == Instruction::SExt)
1088 return SrcLT.first * 2;
1089
1090 // Just check the op cost. If the operation is legal then assume it
1091 // costs
1092 // 1 and multiply by the type-legalization overhead.
1093 if (!TLI->isOperationExpand(ISD, DstLT.second))
1094 return SrcLT.first * 1;
1095 }
1096
1097 // If we are legalizing by splitting, query the concrete TTI for the cost
1098 // of casting the original vector twice. We also need to factor in the
1099 // cost of the split itself. Count that as 1, to be consistent with
1100 // getTypeLegalizationCost().
1101 bool SplitSrc =
1102 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1104 bool SplitDst =
1105 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1107 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1108 DstVTy->getElementCount().isVector()) {
1109 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1110 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1111 T *TTI = static_cast<T *>(this);
1112 // If both types need to be split then the split is free.
1113 InstructionCost SplitCost =
1114 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1115 return SplitCost +
1116 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1117 CostKind, I));
1118 }
1119
1120 // Scalarization cost is Invalid, can't assume any num elements.
1121 if (isa<ScalableVectorType>(DstVTy))
1122 return InstructionCost::getInvalid();
1123
1124 // In other cases where the source or destination are illegal, assume
1125 // the operation will get scalarized.
1126 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1127 InstructionCost Cost = thisT()->getCastInstrCost(
1128 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1129
1130 // Return the cost of multiple scalar invocations plus the cost of
1131 // inserting and extracting the values.
1132 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1133 CostKind) +
1134 Num * Cost;
1135 }
1136
1137 // We already handled vector-to-vector and scalar-to-scalar conversions.
1138 // This
1139 // is where we handle bitcast between vectors and scalars. We need to assume
1140 // that the conversion is scalarized in one way or another.
1141 if (Opcode == Instruction::BitCast) {
1142 // Illegal bitcasts are done by storing and loading from a stack slot.
1143 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1144 /*Extract*/ true, CostKind)
1145 : 0) +
1146 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1147 /*Extract*/ false, CostKind)
1148 : 0);
1149 }
1150
1151 llvm_unreachable("Unhandled cast");
1152 }
1153
1155 VectorType *VecTy, unsigned Index) {
1157 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1158 CostKind, Index, nullptr, nullptr) +
1159 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1161 }
1162
1164 const Instruction *I = nullptr) {
1165 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1166 }
1167
1168 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1169 CmpInst::Predicate VecPred,
1171 const Instruction *I = nullptr) {
1172 const TargetLoweringBase *TLI = getTLI();
1173 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1174 assert(ISD && "Invalid opcode");
1175
1176 // TODO: Handle other cost kinds.
1178 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1179 I);
1180
1181 // Selects on vectors are actually vector selects.
1182 if (ISD == ISD::SELECT) {
1183 assert(CondTy && "CondTy must exist");
1184 if (CondTy->isVectorTy())
1185 ISD = ISD::VSELECT;
1186 }
1187 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1188
1189 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1190 !TLI->isOperationExpand(ISD, LT.second)) {
1191 // The operation is legal. Assume it costs 1. Multiply
1192 // by the type-legalization overhead.
1193 return LT.first * 1;
1194 }
1195
1196 // Otherwise, assume that the cast is scalarized.
1197 // TODO: If one of the types get legalized by splitting, handle this
1198 // similarly to what getCastInstrCost() does.
1199 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1200 if (isa<ScalableVectorType>(ValTy))
1201 return InstructionCost::getInvalid();
1202
1203 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1204 if (CondTy)
1205 CondTy = CondTy->getScalarType();
1206 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1207 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1208
1209 // Return the cost of multiple scalar invocations plus the cost of
1210 // inserting and extracting the values.
1211 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1212 /*Extract*/ false, CostKind) +
1213 Num * Cost;
1214 }
1215
1216 // Unknown scalar opcode.
1217 return 1;
1218 }
1219
1222 unsigned Index, Value *Op0, Value *Op1) {
1223 return getRegUsageForType(Val->getScalarType());
1224 }
1225
1228 unsigned Index) {
1229 Value *Op0 = nullptr;
1230 Value *Op1 = nullptr;
1231 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1232 Op0 = IE->getOperand(0);
1233 Op1 = IE->getOperand(1);
1234 }
1235 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1236 Op1);
1237 }
1238
1239 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1240 int VF,
1241 const APInt &DemandedDstElts,
1243 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1244 "Unexpected size of DemandedDstElts.");
1245
1247
1248 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1249 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1250
1251 // The Mask shuffling cost is to extract all the elements of the Mask
1252 // and insert each of them Factor times into the wide vector:
1253 //
1254 // E.g. an interleaved group with factor 3:
1255 // %mask = icmp ult <8 x i32> %vec1, %vec2
1256 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1257 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1258 // The cost is estimated as extract all mask elements from the <8xi1> mask
1259 // vector and insert them factor times into the <24xi1> shuffled mask
1260 // vector.
1261 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1262 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1263 /*Insert*/ false,
1264 /*Extract*/ true, CostKind);
1265 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1266 /*Insert*/ true,
1267 /*Extract*/ false, CostKind);
1268
1269 return Cost;
1270 }
1271
1273 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1276 const Instruction *I = nullptr) {
1277 assert(!Src->isVoidTy() && "Invalid type");
1278 // Assume types, such as structs, are expensive.
1279 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1280 return 4;
1281 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1282
1283 // Assuming that all loads of legal types cost 1.
1284 InstructionCost Cost = LT.first;
1286 return Cost;
1287
1288 const DataLayout &DL = this->getDataLayout();
1289 if (Src->isVectorTy() &&
1290 // In practice it's not currently possible to have a change in lane
1291 // length for extending loads or truncating stores so both types should
1292 // have the same scalable property.
1294 LT.second.getSizeInBits())) {
1295 // This is a vector load that legalizes to a larger type than the vector
1296 // itself. Unless the corresponding extending load or truncating store is
1297 // legal, then this will scalarize.
1299 EVT MemVT = getTLI()->getValueType(DL, Src);
1300 if (Opcode == Instruction::Store)
1301 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1302 else
1303 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1304
1305 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1306 // This is a vector load/store for some illegal type that is scalarized.
1307 // We must account for the cost of building or decomposing the vector.
1309 cast<VectorType>(Src), Opcode != Instruction::Store,
1310 Opcode == Instruction::Store, CostKind);
1311 }
1312 }
1313
1314 return Cost;
1315 }
1316
1318 Align Alignment, unsigned AddressSpace,
1320 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1321 CostKind);
1322 }
1323
1325 const Value *Ptr, bool VariableMask,
1326 Align Alignment,
1328 const Instruction *I = nullptr) {
1329 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1330 true, CostKind);
1331 }
1332
1334 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1335 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1336 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1337
1338 // We cannot scalarize scalable vectors, so return Invalid.
1339 if (isa<ScalableVectorType>(VecTy))
1340 return InstructionCost::getInvalid();
1341
1342 auto *VT = cast<FixedVectorType>(VecTy);
1343
1344 unsigned NumElts = VT->getNumElements();
1345 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1346
1347 unsigned NumSubElts = NumElts / Factor;
1348 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1349
1350 // Firstly, the cost of load/store operation.
1352 if (UseMaskForCond || UseMaskForGaps)
1353 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1355 else
1356 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1357 CostKind);
1358
1359 // Legalize the vector type, and get the legalized and unlegalized type
1360 // sizes.
1361 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1362 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1363 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1364
1365 // Scale the cost of the memory operation by the fraction of legalized
1366 // instructions that will actually be used. We shouldn't account for the
1367 // cost of dead instructions since they will be removed.
1368 //
1369 // E.g., An interleaved load of factor 8:
1370 // %vec = load <16 x i64>, <16 x i64>* %ptr
1371 // %v0 = shufflevector %vec, undef, <0, 8>
1372 //
1373 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1374 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1375 // type). The other loads are unused.
1376 //
1377 // TODO: Note that legalization can turn masked loads/stores into unmasked
1378 // (legalized) loads/stores. This can be reflected in the cost.
1379 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1380 // The number of loads of a legal type it will take to represent a load
1381 // of the unlegalized vector type.
1382 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1383
1384 // The number of elements of the unlegalized type that correspond to a
1385 // single legal instruction.
1386 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1387
1388 // Determine which legal instructions will be used.
1389 BitVector UsedInsts(NumLegalInsts, false);
1390 for (unsigned Index : Indices)
1391 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1392 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1393
1394 // Scale the cost of the load by the fraction of legal instructions that
1395 // will be used.
1396 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1397 }
1398
1399 // Then plus the cost of interleave operation.
1400 assert(Indices.size() <= Factor &&
1401 "Interleaved memory op has too many members");
1402
1403 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1404 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1405
1406 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1407 for (unsigned Index : Indices) {
1408 assert(Index < Factor && "Invalid index for interleaved memory op");
1409 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1410 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1411 }
1412
1413 if (Opcode == Instruction::Load) {
1414 // The interleave cost is similar to extract sub vectors' elements
1415 // from the wide vector, and insert them into sub vectors.
1416 //
1417 // E.g. An interleaved load of factor 2 (with one member of index 0):
1418 // %vec = load <8 x i32>, <8 x i32>* %ptr
1419 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1420 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1421 // <8 x i32> vector and insert them into a <4 x i32> vector.
1422 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1423 SubVT, DemandedAllSubElts,
1424 /*Insert*/ true, /*Extract*/ false, CostKind);
1425 Cost += Indices.size() * InsSubCost;
1426 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1427 /*Insert*/ false,
1428 /*Extract*/ true, CostKind);
1429 } else {
1430 // The interleave cost is extract elements from sub vectors, and
1431 // insert them into the wide vector.
1432 //
1433 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1434 // (using VF=4):
1435 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1436 // %gaps.mask = <true, true, false, true, true, false,
1437 // true, true, false, true, true, false>
1438 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1439 // i32 Align, <12 x i1> %gaps.mask
1440 // The cost is estimated as extract all elements (of actual members,
1441 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1442 // i32> vector.
1443 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1444 SubVT, DemandedAllSubElts,
1445 /*Insert*/ false, /*Extract*/ true, CostKind);
1446 Cost += ExtSubCost * Indices.size();
1447 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1448 /*Insert*/ true,
1449 /*Extract*/ false, CostKind);
1450 }
1451
1452 if (!UseMaskForCond)
1453 return Cost;
1454
1455 Type *I8Type = Type::getInt8Ty(VT->getContext());
1456
1457 Cost += thisT()->getReplicationShuffleCost(
1458 I8Type, Factor, NumSubElts,
1459 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1460 CostKind);
1461
1462 // The Gaps mask is invariant and created outside the loop, therefore the
1463 // cost of creating it is not accounted for here. However if we have both
1464 // a MaskForGaps and some other mask that guards the execution of the
1465 // memory access, we need to account for the cost of And-ing the two masks
1466 // inside the loop.
1467 if (UseMaskForGaps) {
1468 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1469 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1470 CostKind);
1471 }
1472
1473 return Cost;
1474 }
1475
1476 /// Get intrinsic cost based on arguments.
1479 // Check for generically free intrinsics.
1481 return 0;
1482
1483 // Assume that target intrinsics are cheap.
1484 Intrinsic::ID IID = ICA.getID();
1487
1488 if (ICA.isTypeBasedOnly())
1490
1491 Type *RetTy = ICA.getReturnType();
1492
1493 ElementCount RetVF =
1494 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1496 const IntrinsicInst *I = ICA.getInst();
1497 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1498 FastMathFlags FMF = ICA.getFlags();
1499 switch (IID) {
1500 default:
1501 break;
1502
1503 case Intrinsic::powi:
1504 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1505 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1506 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1507 ShouldOptForSize)) {
1508 // The cost is modeled on the expansion performed by ExpandPowI in
1509 // SelectionDAGBuilder.
1510 APInt Exponent = RHSC->getValue().abs();
1511 unsigned ActiveBits = Exponent.getActiveBits();
1512 unsigned PopCount = Exponent.popcount();
1513 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1514 thisT()->getArithmeticInstrCost(
1515 Instruction::FMul, RetTy, CostKind);
1516 if (RHSC->getSExtValue() < 0)
1517 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1518 CostKind);
1519 return Cost;
1520 }
1521 }
1522 break;
1523 case Intrinsic::cttz:
1524 // FIXME: If necessary, this should go in target-specific overrides.
1525 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1527 break;
1528
1529 case Intrinsic::ctlz:
1530 // FIXME: If necessary, this should go in target-specific overrides.
1531 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1533 break;
1534
1535 case Intrinsic::memcpy:
1536 return thisT()->getMemcpyCost(ICA.getInst());
1537
1538 case Intrinsic::masked_scatter: {
1539 const Value *Mask = Args[3];
1540 bool VarMask = !isa<Constant>(Mask);
1541 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1542 return thisT()->getGatherScatterOpCost(Instruction::Store,
1543 ICA.getArgTypes()[0], Args[1],
1544 VarMask, Alignment, CostKind, I);
1545 }
1546 case Intrinsic::masked_gather: {
1547 const Value *Mask = Args[2];
1548 bool VarMask = !isa<Constant>(Mask);
1549 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1550 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1551 VarMask, Alignment, CostKind, I);
1552 }
1553 case Intrinsic::experimental_stepvector: {
1554 if (isa<ScalableVectorType>(RetTy))
1556 // The cost of materialising a constant integer vector.
1557 return TargetTransformInfo::TCC_Basic;
1558 }
1559 case Intrinsic::vector_extract: {
1560 // FIXME: Handle case where a scalable vector is extracted from a scalable
1561 // vector
1562 if (isa<ScalableVectorType>(RetTy))
1564 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1565 return thisT()->getShuffleCost(
1566 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1567 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1568 }
1569 case Intrinsic::vector_insert: {
1570 // FIXME: Handle case where a scalable vector is inserted into a scalable
1571 // vector
1572 if (isa<ScalableVectorType>(Args[1]->getType()))
1574 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1575 return thisT()->getShuffleCost(
1576 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1577 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1578 }
1579 case Intrinsic::experimental_vector_reverse: {
1580 return thisT()->getShuffleCost(
1581 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1582 CostKind, 0, cast<VectorType>(RetTy));
1583 }
1584 case Intrinsic::experimental_vector_splice: {
1585 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1586 return thisT()->getShuffleCost(
1587 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1588 CostKind, Index, cast<VectorType>(RetTy));
1589 }
1590 case Intrinsic::vector_reduce_add:
1591 case Intrinsic::vector_reduce_mul:
1592 case Intrinsic::vector_reduce_and:
1593 case Intrinsic::vector_reduce_or:
1594 case Intrinsic::vector_reduce_xor:
1595 case Intrinsic::vector_reduce_smax:
1596 case Intrinsic::vector_reduce_smin:
1597 case Intrinsic::vector_reduce_fmax:
1598 case Intrinsic::vector_reduce_fmin:
1599 case Intrinsic::vector_reduce_umax:
1600 case Intrinsic::vector_reduce_umin: {
1601 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1603 }
1604 case Intrinsic::vector_reduce_fadd:
1605 case Intrinsic::vector_reduce_fmul: {
1607 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1609 }
1610 case Intrinsic::fshl:
1611 case Intrinsic::fshr: {
1612 const Value *X = Args[0];
1613 const Value *Y = Args[1];
1614 const Value *Z = Args[2];
1617 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1618 const TTI::OperandValueInfo OpInfoBW =
1620 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1621 : TTI::OP_None};
1622
1623 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1624 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
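 // Worked example of the expansion being priced: for i8 operands,
 // fshl(0xAB, 0xCD, 3) keeps the high 8 bits of the shifted concatenation:
 //   (0xAB << 3) | (0xCD >> (8 - 3)) = 0x58 | 0x06 = 0x5E.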
1626 Cost +=
1627 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1628 Cost +=
1629 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1630 Cost += thisT()->getArithmeticInstrCost(
1631 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1632 {OpInfoZ.Kind, TTI::OP_None});
1633 Cost += thisT()->getArithmeticInstrCost(
1634 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1635 {OpInfoZ.Kind, TTI::OP_None});
1636 // Non-constant shift amounts require a modulo.
1637 if (!OpInfoZ.isConstant())
1638 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1639 CostKind, OpInfoZ, OpInfoBW);
1640 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1641 if (X != Y) {
1642 Type *CondTy = RetTy->getWithNewBitWidth(1);
1643 Cost +=
1644 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1646 Cost +=
1647 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1649 }
1650 return Cost;
1651 }
1652 case Intrinsic::get_active_lane_mask: {
1653 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1654 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1655
1656 // If we're not expanding the intrinsic then we assume this is cheap
1657 // to implement.
1658 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1659 return getTypeLegalizationCost(RetTy).first;
1660 }
1661
1662 // Create the expanded types that will be used to calculate the uadd_sat
1663 // operation.
1664 Type *ExpRetTy = VectorType::get(
1665 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1666 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1668 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1669 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1671 return Cost;
1672 }
1673 }
1674
1675 // Assume that we need to scalarize this intrinsic.
1676 // Compute the scalarization overhead based on Args for a vector
1677 // intrinsic.
1678 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1679 if (RetVF.isVector() && !RetVF.isScalable()) {
1680 ScalarizationCost = 0;
1681 if (!RetTy->isVoidTy())
1682 ScalarizationCost += getScalarizationOverhead(
1683 cast<VectorType>(RetTy),
1684 /*Insert*/ true, /*Extract*/ false, CostKind);
1685 ScalarizationCost +=
1687 }
1688
1689 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1690 ScalarizationCost);
1691 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1692 }
1693
1694 /// Get intrinsic cost based on argument types.
1695 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1696 /// cost of scalarizing the arguments and the return value will be computed
1697 /// based on types.
1701 Intrinsic::ID IID = ICA.getID();
1702 Type *RetTy = ICA.getReturnType();
1703 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1704 FastMathFlags FMF = ICA.getFlags();
1705 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1706 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1707
1708 VectorType *VecOpTy = nullptr;
1709 if (!Tys.empty()) {
1710 // The vector reduction operand is operand 0 except for fadd/fmul.
1711 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1712 unsigned VecTyIndex = 0;
1713 if (IID == Intrinsic::vector_reduce_fadd ||
1714 IID == Intrinsic::vector_reduce_fmul)
1715 VecTyIndex = 1;
1716 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1717 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1718 }
1719
1720 // Library call cost - other than size, make it expensive.
1721 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1722 unsigned ISD = 0;
1723 switch (IID) {
1724 default: {
1725 // Scalable vectors cannot be scalarized, so return Invalid.
1726 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1727 return isa<ScalableVectorType>(Ty);
1728 }))
1729 return InstructionCost::getInvalid();
1730
1731 // Assume that we need to scalarize this intrinsic.
1732 InstructionCost ScalarizationCost =
1733 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1734 unsigned ScalarCalls = 1;
1735 Type *ScalarRetTy = RetTy;
1736 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1737 if (!SkipScalarizationCost)
1738 ScalarizationCost = getScalarizationOverhead(
1739 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1740 ScalarCalls = std::max(ScalarCalls,
1741 cast<FixedVectorType>(RetVTy)->getNumElements());
1742 ScalarRetTy = RetTy->getScalarType();
1743 }
1744 SmallVector<Type *, 4> ScalarTys;
1745 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1746 Type *Ty = Tys[i];
1747 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1748 if (!SkipScalarizationCost)
1749 ScalarizationCost += getScalarizationOverhead(
1750 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1751 ScalarCalls = std::max(ScalarCalls,
1752 cast<FixedVectorType>(VTy)->getNumElements());
1753 Ty = Ty->getScalarType();
1754 }
1755 ScalarTys.push_back(Ty);
1756 }
1757 if (ScalarCalls == 1)
1758 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1759
1760 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1761 InstructionCost ScalarCost =
1762 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1763
1764 return ScalarCalls * ScalarCost + ScalarizationCost;
1765 }
1766 // Look for intrinsics that can be lowered directly or turned into a scalar
1767 // intrinsic call.
1768 case Intrinsic::sqrt:
1769 ISD = ISD::FSQRT;
1770 break;
1771 case Intrinsic::sin:
1772 ISD = ISD::FSIN;
1773 break;
1774 case Intrinsic::cos:
1775 ISD = ISD::FCOS;
1776 break;
1777 case Intrinsic::exp:
1778 ISD = ISD::FEXP;
1779 break;
1780 case Intrinsic::exp2:
1781 ISD = ISD::FEXP2;
1782 break;
1783 case Intrinsic::log:
1784 ISD = ISD::FLOG;
1785 break;
1786 case Intrinsic::log10:
1787 ISD = ISD::FLOG10;
1788 break;
1789 case Intrinsic::log2:
1790 ISD = ISD::FLOG2;
1791 break;
1792 case Intrinsic::fabs:
1793 ISD = ISD::FABS;
1794 break;
1795 case Intrinsic::canonicalize:
1796 ISD = ISD::FCANONICALIZE;
1797 break;
1798 case Intrinsic::minnum:
1799 ISD = ISD::FMINNUM;
1800 break;
1801 case Intrinsic::maxnum:
1802 ISD = ISD::FMAXNUM;
1803 break;
1804 case Intrinsic::minimum:
1805 ISD = ISD::FMINIMUM;
1806 break;
1807 case Intrinsic::maximum:
1808 ISD = ISD::FMAXIMUM;
1809 break;
1810 case Intrinsic::copysign:
1811 ISD = ISD::FCOPYSIGN;
1812 break;
1813 case Intrinsic::floor:
1814 ISD = ISD::FFLOOR;
1815 break;
1816 case Intrinsic::ceil:
1817 ISD = ISD::FCEIL;
1818 break;
1819 case Intrinsic::trunc:
1820 ISD = ISD::FTRUNC;
1821 break;
1822 case Intrinsic::nearbyint:
1823 ISD = ISD::FNEARBYINT;
1824 break;
1825 case Intrinsic::rint:
1826 ISD = ISD::FRINT;
1827 break;
1828 case Intrinsic::round:
1829 ISD = ISD::FROUND;
1830 break;
1831 case Intrinsic::roundeven:
1832 ISD = ISD::FROUNDEVEN;
1833 break;
1834 case Intrinsic::pow:
1835 ISD = ISD::FPOW;
1836 break;
1837 case Intrinsic::fma:
1838 ISD = ISD::FMA;
1839 break;
1840 case Intrinsic::fmuladd:
1841 ISD = ISD::FMA;
1842 break;
1843 case Intrinsic::experimental_constrained_fmuladd:
1844 ISD = ISD::STRICT_FMA;
1845 break;
1846 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1847 case Intrinsic::lifetime_start:
1848 case Intrinsic::lifetime_end:
1849 case Intrinsic::sideeffect:
1850 case Intrinsic::pseudoprobe:
1851 case Intrinsic::arithmetic_fence:
1852 return 0;
1853 case Intrinsic::masked_store: {
1854 Type *Ty = Tys[0];
1855 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1856 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1857 CostKind);
1858 }
1859 case Intrinsic::masked_load: {
1860 Type *Ty = RetTy;
1861 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1862 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1863 CostKind);
1864 }
1865 case Intrinsic::vector_reduce_add:
1866 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1867 std::nullopt, CostKind);
1868 case Intrinsic::vector_reduce_mul:
1869 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1870 std::nullopt, CostKind);
1871 case Intrinsic::vector_reduce_and:
1872 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1873 std::nullopt, CostKind);
1874 case Intrinsic::vector_reduce_or:
1875 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1876 std::nullopt, CostKind);
1877 case Intrinsic::vector_reduce_xor:
1878 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1879 std::nullopt, CostKind);
1880 case Intrinsic::vector_reduce_fadd:
1881 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1882 FMF, CostKind);
1883 case Intrinsic::vector_reduce_fmul:
1884 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1885 FMF, CostKind);
1886 case Intrinsic::vector_reduce_smax:
1887 case Intrinsic::vector_reduce_smin:
1888 case Intrinsic::vector_reduce_fmax:
1889 case Intrinsic::vector_reduce_fmin:
1890 return thisT()->getMinMaxReductionCost(
1891 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1892 /*IsUnsigned=*/false, CostKind);
1893 case Intrinsic::vector_reduce_umax:
1894 case Intrinsic::vector_reduce_umin:
1895 return thisT()->getMinMaxReductionCost(
1896 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1897 /*IsUnsigned=*/true, CostKind);
1898 case Intrinsic::abs: {
1899 // abs(X) = select(icmp(X,0),X,sub(0,X))
1900 Type *CondTy = RetTy->getWithNewBitWidth(1);
1901 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1902 InstructionCost Cost = 0;
1903 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1904 Pred, CostKind);
1905 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1906 Pred, CostKind);
1907 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1908 Cost += thisT()->getArithmeticInstrCost(
1909 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
1910 return Cost;
1911 }
1912 case Intrinsic::smax:
1913 case Intrinsic::smin:
1914 case Intrinsic::umax:
1915 case Intrinsic::umin: {
1916 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1917 Type *CondTy = RetTy->getWithNewBitWidth(1);
1918 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1919 CmpInst::Predicate Pred =
1920 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1921 InstructionCost Cost = 0;
1922 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1923 Pred, CostKind);
1924 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1925 Pred, CostKind);
1926 return Cost;
1927 }
1928 case Intrinsic::sadd_sat:
1929 case Intrinsic::ssub_sat: {
1930 Type *CondTy = RetTy->getWithNewBitWidth(1);
1931
1932 Type *OpTy = StructType::create({RetTy, CondTy});
1933 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1934 ? Intrinsic::sadd_with_overflow
1935 : Intrinsic::ssub_with_overflow;
1936 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1937
1938 // SatMax -> Overflow && SumDiff < 0
1939 // SatMin -> Overflow && SumDiff >= 0
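 // A rough sketch of the expansion being costed below (value names are
 // illustrative; the exact IR a target sees may differ):
 //   %p   = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
 //   %sum = extractvalue {i32, i1} %p, 0
 //   %ovf = extractvalue {i32, i1} %p, 1
 //   %neg = icmp slt i32 %sum, 0
 //   %sat = select i1 %neg, i32 2147483647, i32 -2147483648
 //   %res = select i1 %ovf, i32 %sat, i32 %sum
 // i.e. one overflow intrinsic, one icmp and two selects.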
1940 InstructionCost Cost = 0;
1941 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1942 nullptr, ScalarizationCostPassed);
1943 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1944 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1945 Pred, CostKind);
1946 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1947 CondTy, Pred, CostKind);
1948 return Cost;
1949 }
1950 case Intrinsic::uadd_sat:
1951 case Intrinsic::usub_sat: {
1952 Type *CondTy = RetTy->getWithNewBitWidth(1);
1953
1954 Type *OpTy = StructType::create({RetTy, CondTy});
1955 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1956 ? Intrinsic::uadd_with_overflow
1957 : Intrinsic::usub_with_overflow;
1958
1959 InstructionCost Cost = 0;
1960 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1961 nullptr, ScalarizationCostPassed);
1962 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1963 Cost +=
1964 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1965 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1966 return Cost;
1967 }
1968 case Intrinsic::smul_fix:
1969 case Intrinsic::umul_fix: {
1970 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1971 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1972
1973 unsigned ExtOp =
1974 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1975 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1976
1977 InstructionCost Cost = 0;
1978 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1979 Cost +=
1980 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1981 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1982 CCH, CostKind);
1983 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1984 CostKind,
1985 {TTI::OK_UniformConstantValue, TTI::OP_None},
1986 {TTI::OK_UniformConstantValue, TTI::OP_None});
1987 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1988 {TTI::OK_UniformConstantValue, TTI::OP_None},
1989 {TTI::OK_UniformConstantValue, TTI::OP_None});
1990 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1991 return Cost;
1992 }
1993 case Intrinsic::sadd_with_overflow:
1994 case Intrinsic::ssub_with_overflow: {
1995 Type *SumTy = RetTy->getContainedType(0);
1996 Type *OverflowTy = RetTy->getContainedType(1);
1997 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1998 ? BinaryOperator::Add
1999 : BinaryOperator::Sub;
2000
2001 // Add:
2002 // Overflow -> (Result < LHS) ^ (RHS < 0)
2003 // Sub:
2004 // Overflow -> (Result < LHS) ^ (RHS > 0)
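 // For example (names illustrative), the signed-add overflow check costed
 // here corresponds roughly to:
 //   %res = add i32 %lhs, %rhs
 //   %c0  = icmp slt i32 %res, %lhs
 //   %c1  = icmp slt i32 %rhs, 0
 //   %ovf = xor i1 %c0, %c1
 // i.e. one add/sub, two compares and one xor.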
2005 InstructionCost Cost = 0;
2006 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2007 Cost += 2 * thisT()->getCmpSelInstrCost(
2008 Instruction::ICmp, SumTy, OverflowTy,
2009 CmpInst::ICMP_SGT, CostKind);
2010 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2011 CostKind);
2012 return Cost;
2013 }
2014 case Intrinsic::uadd_with_overflow:
2015 case Intrinsic::usub_with_overflow: {
2016 Type *SumTy = RetTy->getContainedType(0);
2017 Type *OverflowTy = RetTy->getContainedType(1);
2018 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2019 ? BinaryOperator::Add
2020 : BinaryOperator::Sub;
2021 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2022 ? CmpInst::ICMP_ULT
2023 : CmpInst::ICMP_UGT;
2024
2025 InstructionCost Cost = 0;
2026 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2027 Cost +=
2028 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2029 Pred, CostKind);
2030 return Cost;
2031 }
2032 case Intrinsic::smul_with_overflow:
2033 case Intrinsic::umul_with_overflow: {
2034 Type *MulTy = RetTy->getContainedType(0);
2035 Type *OverflowTy = RetTy->getContainedType(1);
2036 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2037 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2038 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2039
2040 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2041 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2042
2043 InstructionCost Cost = 0;
2044 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2045 Cost +=
2046 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2047 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2048 CCH, CostKind);
2049 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2050 CostKind,
2051 {TTI::OK_UniformConstantValue, TTI::OP_None},
2052 {TTI::OK_UniformConstantValue, TTI::OP_None});
2053
2054 if (IsSigned)
2055 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2056 CostKind,
2057 {TTI::OK_UniformConstantValue, TTI::OP_None},
2058 {TTI::OK_UniformConstantValue, TTI::OP_None});
2059
2060 Cost += thisT()->getCmpSelInstrCost(
2061 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2062 return Cost;
2063 }
2064 case Intrinsic::fptosi_sat:
2065 case Intrinsic::fptoui_sat: {
2066 if (Tys.empty())
2067 break;
2068 Type *FromTy = Tys[0];
2069 bool IsSigned = IID == Intrinsic::fptosi_sat;
2070
2071 InstructionCost Cost = 0;
2072 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2073 {FromTy, FromTy});
2074 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2075 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2076 {FromTy, FromTy});
2077 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2078 Cost += thisT()->getCastInstrCost(
2079 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2080 TTI::CastContextHint::None, CostKind);
2081 if (IsSigned) {
2082 Type *CondTy = RetTy->getWithNewBitWidth(1);
2083 Cost += thisT()->getCmpSelInstrCost(
2084 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2085 Cost += thisT()->getCmpSelInstrCost(
2086 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2087 }
2088 return Cost;
2089 }
2090 case Intrinsic::ctpop:
2091 ISD = ISD::CTPOP;
2092 // In case of legalization use TCC_Expensive. This is cheaper than a
2093 // library call but still not a cheap instruction.
2094 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2095 break;
2096 case Intrinsic::ctlz:
2097 ISD = ISD::CTLZ;
2098 break;
2099 case Intrinsic::cttz:
2100 ISD = ISD::CTTZ;
2101 break;
2102 case Intrinsic::bswap:
2103 ISD = ISD::BSWAP;
2104 break;
2105 case Intrinsic::bitreverse:
2106 ISD = ISD::BITREVERSE;
2107 break;
2108 }
2109
2110 const TargetLoweringBase *TLI = getTLI();
2111 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2112
2113 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2114 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2115 TLI->isFAbsFree(LT.second)) {
2116 return 0;
2117 }
2118
2119 // The operation is legal. Assume it costs 1.
2120 // If the type is split to multiple registers, assume that there is some
2121 // overhead to this.
2122 // TODO: Once we have extract/insert subvector cost we need to use them.
2123 if (LT.first > 1)
2124 return (LT.first * 2);
2125 else
2126 return (LT.first * 1);
2127 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2128 // If the operation is custom lowered then assume
2129 // that the code is twice as expensive.
2130 return (LT.first * 2);
2131 }
2132
2133 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2134 // point mul followed by an add.
2135 if (IID == Intrinsic::fmuladd)
2136 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2137 CostKind) +
2138 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2139 CostKind);
2140 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2141 IntrinsicCostAttributes FMulAttrs(
2142 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2143 IntrinsicCostAttributes FAddAttrs(
2144 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2145 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2146 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2147 }
2148
2149 // Else, assume that we need to scalarize this intrinsic. For math builtins
2150 // this will emit a costly libcall, adding call overhead and spills. Make it
2151 // very expensive.
2152 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2153 // Scalable vectors cannot be scalarized, so return Invalid.
2154 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2155 return isa<ScalableVectorType>(Ty);
2156 }))
2157 return InstructionCost::getInvalid();
2158
2159 InstructionCost ScalarizationCost =
2160 SkipScalarizationCost
2161 ? ScalarizationCostPassed
2162 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2163 /*Extract*/ false, CostKind);
2164
2165 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2166 SmallVector<Type *, 4> ScalarTys;
2167 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2168 Type *Ty = Tys[i];
2169 if (Ty->isVectorTy())
2170 Ty = Ty->getScalarType();
2171 ScalarTys.push_back(Ty);
2172 }
2173 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2174 InstructionCost ScalarCost =
2175 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2176 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2177 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2178 if (!ICA.skipScalarizationCost())
2179 ScalarizationCost += getScalarizationOverhead(
2180 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2181 ScalarCalls = std::max(ScalarCalls,
2182 cast<FixedVectorType>(VTy)->getNumElements());
2183 }
2184 }
2185 return ScalarCalls * ScalarCost + ScalarizationCost;
2186 }
2187
2188 // This is going to be turned into a library call, make it expensive.
2189 return SingleCallCost;
2190 }
2191
2192 /// Compute a cost of the given call instruction.
2193 ///
2194 /// Compute the cost of calling function F with return type RetTy and
2195 /// argument types Tys. F might be nullptr, in this case the cost of an
2196 /// arbitrary call with the specified signature will be returned.
2197 /// This is used, for instance, when we estimate call of a vector
2198 /// counterpart of the given function.
2199 /// \param F Called function, might be nullptr.
2200 /// \param RetTy Return value types.
2201 /// \param Tys Argument types.
2202 /// \returns The cost of Call instruction.
2203 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2204 ArrayRef<Type *> Tys,
2205 TTI::TargetCostKind CostKind) {
2206 return 10;
2207 }
2208
2209 unsigned getNumberOfParts(Type *Tp) {
2210 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2211 return LT.first.isValid() ? *LT.first.getValue() : 0;
2212 }
2213
2214 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2215 const SCEV *) {
2216 return 0;
2217 }
2218
2219 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2220 /// We're assuming that reduction operations are performed in the following way:
2221 ///
2222 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2223 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2224 /// \----------------v-------------/ \----------v------------/
2225 /// n/2 elements n/2 elements
2226 /// %red1 = op <n x t> %val, <n x t> val1
2227 /// After this operation we have a vector %red1 where only the first n/2
2228 /// elements are meaningful, the second n/2 elements are undefined and can be
2229 /// dropped. All other operations are actually working with the vector of
2230 /// length n/2, not n, though the real vector length is still n.
2231 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2232 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2233 /// \----------------v-------------/ \----------v------------/
2234 /// n/4 elements 3*n/4 elements
2235 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2236 /// length n/2, the resulting vector has length n/4 etc.
2237 ///
2238 /// The cost model should take into account that the actual length of the
2239 /// vector is reduced on each iteration.
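///
/// As a rough worked example (assuming the whole vector is legal in a single
/// register, i.e. no type splitting): reducing <8 x i32> with 'add' is
/// modelled as log2(8) = 3 levels, each costed as one single-source shuffle
/// plus one vector add, followed by a single extractelement of lane 0.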
2240 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2241 TTI::TargetCostKind CostKind) {
2242 // Targets must implement a default value for the scalable case, since
2243 // we don't know how many lanes the vector has.
2244 if (isa<ScalableVectorType>(Ty))
2245 return InstructionCost::getInvalid();
2246
2247 Type *ScalarTy = Ty->getElementType();
2248 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2249 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2250 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2251 NumVecElts >= 2) {
2252 // Or reduction for i1 is represented as:
2253 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2254 // %res = cmp ne iReduxWidth %val, 0
2255 // And reduction for i1 is represented as:
2256 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2257 // %res = cmp eq iReduxWidth %val, 11111
2258 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2259 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2260 TTI::CastContextHint::None, CostKind) +
2261 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2262 CmpInst::makeCmpResultType(ValTy),
2263 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2264 }
2265 unsigned NumReduxLevels = Log2_32(NumVecElts);
2266 InstructionCost ArithCost = 0;
2267 InstructionCost ShuffleCost = 0;
2268 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2269 unsigned LongVectorCount = 0;
2270 unsigned MVTLen =
2271 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2272 while (NumVecElts > MVTLen) {
2273 NumVecElts /= 2;
2274 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2275 ShuffleCost +=
2276 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2277 CostKind, NumVecElts, SubTy);
2278 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2279 Ty = SubTy;
2280 ++LongVectorCount;
2281 }
2282
2283 NumReduxLevels -= LongVectorCount;
2284
2285 // The minimal length of the vector is limited by the real length of vector
2286 // operations performed on the current platform. That's why several final
2287 // reduction operations are performed on the vectors with the same
2288 // architecture-dependent length.
2289
2290 // By default reductions need one shuffle per reduction level.
2291 ShuffleCost +=
2292 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2293 std::nullopt, CostKind, 0, Ty);
2294 ArithCost +=
2295 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2296 return ShuffleCost + ArithCost +
2297 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2298 CostKind, 0, nullptr, nullptr);
2299 }
2300
2301 /// Try to calculate the cost of performing strict (in-order) reductions,
2302 /// which involves doing a sequence of floating point additions in lane
2303 /// order, starting with an initial value. For example, consider a scalar
2304 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2305 ///
2306 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2307 ///
2308 /// %add1 = %InitVal + %v0
2309 /// %add2 = %add1 + %v1
2310 /// %add3 = %add2 + %v2
2311 /// %add4 = %add3 + %v3
2312 ///
2313 /// As a simple estimate we can say the cost of such a reduction is 4 times
2314 /// the cost of a scalar FP addition. We can only estimate the costs for
2315 /// fixed-width vectors here because for scalable vectors we do not know the
2316 /// runtime number of operations.
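///
/// In other words, the estimate below is roughly
///   cost(extracting every lane) + NumElements * cost(scalar op)
/// for a fixed-width vector with NumElements lanes (NumElements here is just
/// shorthand for the vector's element count).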
2317 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2318 TTI::TargetCostKind CostKind) {
2319 // Targets must implement a default value for the scalable case, since
2320 // we don't know how many lanes the vector has.
2321 if (isa<ScalableVectorType>(Ty))
2322 return InstructionCost::getInvalid();
2323
2324 auto *VTy = cast<FixedVectorType>(Ty);
2325 InstructionCost ExtractCost = getScalarizationOverhead(
2326 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2327 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2328 Opcode, VTy->getElementType(), CostKind);
2329 ArithCost *= VTy->getNumElements();
2330
2331 return ExtractCost + ArithCost;
2332 }
2333
2334 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2335 std::optional<FastMathFlags> FMF,
2336 TTI::TargetCostKind CostKind) {
2337 assert(Ty && "Unknown reduction vector type");
2338 if (TTI::requiresOrderedReduction(FMF))
2339 return getOrderedReductionCost(Opcode, Ty, CostKind);
2340 return getTreeReductionCost(Opcode, Ty, CostKind);
2341 }
2342
2343 /// Try to calculate op costs for min/max reduction operations.
2344 /// \param CondTy Conditional type for the Select instruction.
2345 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2346 bool IsUnsigned,
2347 TTI::TargetCostKind CostKind) {
2348 // Targets must implement a default value for the scalable case, since
2349 // we don't know how many lanes the vector has.
2350 if (isa<ScalableVectorType>(Ty))
2351 return InstructionCost::getInvalid();
2352
2353 Type *ScalarTy = Ty->getElementType();
2354 Type *ScalarCondTy = CondTy->getElementType();
2355 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2356 unsigned NumReduxLevels = Log2_32(NumVecElts);
2357 unsigned CmpOpcode;
2358 if (Ty->isFPOrFPVectorTy()) {
2359 CmpOpcode = Instruction::FCmp;
2360 } else {
2361 assert(Ty->isIntOrIntVectorTy() &&
2362 "expecting floating point or integer type for min/max reduction");
2363 CmpOpcode = Instruction::ICmp;
2364 }
2365 InstructionCost MinMaxCost = 0;
2366 InstructionCost ShuffleCost = 0;
2367 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2368 unsigned LongVectorCount = 0;
2369 unsigned MVTLen =
2370 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2371 while (NumVecElts > MVTLen) {
2372 NumVecElts /= 2;
2373 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2374 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2375
2376 ShuffleCost +=
2377 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2378 CostKind, NumVecElts, SubTy);
2379 MinMaxCost +=
2380 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2381 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2382 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2383 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2384 Ty = SubTy;
2385 ++LongVectorCount;
2386 }
2387
2388 NumReduxLevels -= LongVectorCount;
2389
2390 // The minimal length of the vector is limited by the real length of vector
2391 // operations performed on the current platform. That's why several final
2392 // reduction operations are performed on the vectors with the same
2393 // architecture-dependent length.
2394 ShuffleCost +=
2395 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2396 std::nullopt, CostKind, 0, Ty);
2397 MinMaxCost +=
2398 NumReduxLevels *
2399 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2400 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2401 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2402 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2403 // The last min/max should be in vector registers and we counted it above.
2404 // So just need a single extractelement.
2405 return ShuffleCost + MinMaxCost +
2406 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2407 CostKind, 0, nullptr, nullptr);
2408 }
2409
2410 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2411 Type *ResTy, VectorType *Ty,
2412 std::optional<FastMathFlags> FMF,
2413 TTI::TargetCostKind CostKind) {
2414 // Without any native support, this is equivalent to the cost of
2415 // vecreduce.opcode(ext(Ty A)).
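 // For instance (types illustrative), an i64 add reduction over <4 x i32>
 // input would be costed as one sext/zext to <4 x i64> plus a 64-bit
 // vecreduce.add, rather than as a single native widening reduction.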
2416 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2417 InstructionCost RedCost =
2418 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2419 InstructionCost ExtCost = thisT()->getCastInstrCost(
2420 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2421 TTI::CastContextHint::None, CostKind);
2422
2423 return RedCost + ExtCost;
2424 }
2425
2426 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2427 VectorType *Ty,
2428 TTI::TargetCostKind CostKind) {
2429 // Without any native support, this is equivalent to the cost of
2430 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2431 // vecreduce.add(mul(A, B)).
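 // As a sketch (types illustrative), a dot-product-style reduction over
 // <4 x i32> inputs with an i64 result would be costed as two extends to
 // <4 x i64>, one <4 x i64> multiply and one i64 vecreduce.add.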
2432 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2433 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2434 Instruction::Add, ExtTy, std::nullopt, CostKind);
2435 InstructionCost ExtCost = thisT()->getCastInstrCost(
2436 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2437 TTI::CastContextHint::None, CostKind);
2438
2439 InstructionCost MulCost =
2440 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2441
2442 return RedCost + MulCost + 2 * ExtCost;
2443 }
2444
2445 InstructionCost getVectorSplitCost() { return 1; }
2446
2447 /// @}
2448};
2449
2450/// Concrete BasicTTIImpl that can be used if no further customization
2451/// is needed.
2452class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2453 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2454
2455 friend class BasicTTIImplBase<BasicTTIImpl>;
2456
2457 const TargetSubtargetInfo *ST;
2458 const TargetLoweringBase *TLI;
2459
2460 const TargetSubtargetInfo *getST() const { return ST; }
2461 const TargetLoweringBase *getTLI() const { return TLI; }
2462
2463public:
2464 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2465};
2466
2467} // end namespace llvm
2468
2469#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:75
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1179
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1108
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:177
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:79
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:406
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:682
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:383
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:538
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:848
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:711
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:716
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:396
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:618
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:850
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:702
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:324
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:400
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:694
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:715
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:411
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:487
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:964
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:422
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:359
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:371
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:644
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:686
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:365
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:301
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:767
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:721
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:519
unsigned getInliningThresholdMultiplier()
Definition: BasicTTIImpl.h:533
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:662
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:285
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:328
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:634
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:271
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:339
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:751
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:668
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:698
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:315
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:292
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:523
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:814
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:512
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:311
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI)
Definition: BasicTTIImpl.h:625
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:928
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:796
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:416
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:638
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:320
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:287
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:994
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:678
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:297
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:283
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:651
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:305
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:262
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:534
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:379
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1186
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1054
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:741
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:745
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:743
@ ICMP_EQ
equal
Definition: InstrTypes.h:739
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:728
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:480
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:416
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:306
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
unsigned getNumElements() const
Definition: DerivedTypes.h:568
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:704
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:835
The core instruction combiner logic.
Definition: InstCombiner.h:45
static InstructionCost getInvalid(CostType Val=0)
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:331
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:780
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isSelectMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isReverseMask(ArrayRef< int > Mask)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isZeroEltSplatMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isTransposeMask(ArrayRef< int > Mask)
Return true if this shuffle mask is a transpose mask.
size_type size() const
Definition: SmallPtrSet.h:93
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:533
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:356
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1473
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, or DriverKit).
Definition: Triple.h:519
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:267
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:237
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitWidth, whilst keeping the old number of lanes.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:264
bool isFPOrFPVectorTy() const
Return true if this is an FP type or a vector of FP types.
Definition: Type.h:219
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
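A sketch combining several of the Type queries listed above into a hypothetical helper; the helper itself is not part of LLVM.

#include "llvm/IR/Type.h"
using namespace llvm;

// Widen the lanes of an integer (or integer-vector) type to 32 bits.
static Type *widenLanesTo32(Type *Ty) {
  if (!Ty->isIntOrIntVectorTy())
    return Ty;
  if (Ty->getScalarSizeInBits() >= 32)
    return Ty;
  // Keeps the lane count, changes only the element width.
  return Ty->getWithNewBitWidth(32);
}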
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same element type.
Definition: DerivedTypes.h:493
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:688
Type * getElementType() const
Definition: DerivedTypes.h:422
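An illustrative construction using the VectorType helpers above; the types are chosen arbitrarily.

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Build a <8 x i16> vector type and derive its half-width sibling.
static void vectorTypeExample(LLVMContext &Ctx) {
  auto *V8i16 =
      VectorType::get(Type::getInt16Ty(Ctx), ElementCount::getFixed(8));
  auto *V4i16 = VectorType::getHalfElementsVectorType(V8i16); // <4 x i16>
  Type *Elt = V4i16->getElementType();                        // i16
  (void)Elt;
}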
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:198
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2960
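A worked example, assuming nothing beyond the declaration above; the mask values are illustrative.

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Widen a 4-lane demanded-elements mask 0b1010 to an 8-lane mask by splatting
// each source bit across two neighbouring destination lanes.
static APInt widenMaskExample() {
  APInt Narrow(4, 0b1010);
  return APIntOps::ScaleBitMask(Narrow, 8); // 0b11001100
}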
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:713
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:482
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:999
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1003
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:499
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:726
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:955
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result.
Definition: ISDOpcodes.h:735
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:968
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:492
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1396
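These ISD opcodes are mostly used in this file for legality queries against the target lowering; a minimal sketch follows, where TLI is assumed to be the TargetLoweringBase of the current subtarget and the helper name is invented.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Ask whether the target has a legal or custom lowering for FMINNUM on VT.
static bool hasNativeFMinNum(const TargetLoweringBase &TLI, EVT VT) {
  return TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT);
}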
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:522
AddressSpace
Definition: NVPTXBaseInfo.h:21
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1826
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:382
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
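Small numeric examples for the arithmetic helpers above; the values are chosen purely for illustration.

#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

static void mathExtrasExample() {
  uint64_t Chunks = divideCeil(10, 4); // 3 == ceil(10 / 4)
  bool Pow2 = isPowerOf2_32(64);       // true
  unsigned Log = Log2_32(64);          // 6 == floor(log2(64))
  (void)Chunks; (void)Pow2; (void)Log;
}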
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:184
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:615
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:299
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
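A sketch of mapping an IR type to an EVT and checking whether it is a simple MVT; the function is hypothetical.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static void evtExample(LLVMContext &Ctx) {
  Type *Ty = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  EVT VT = EVT::getEVT(Ty);                  // v4i32
  if (VT.isSimple()) {
    MVT Simple = VT.getSimpleVT();           // only valid for simple EVTs
    (void)Simple;
  }
  EVT OddVT = EVT::getIntegerVT(Ctx, 24);    // 24-bit integer, typically not a simple MVT
  (void)OddVT;
}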
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before the loop body).
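A hedged sketch of how a target hook might fill these peeling fields; the chosen values and the helper name are made up for illustration.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tunePeeling(TargetTransformInfo::PeelingPreferences &PP) {
  PP.AllowPeeling = true;           // permit peeling in general
  PP.AllowLoopNestsPeeling = false; // but not across whole loop nests
  PP.PeelProfiledIterations = true; // profile-guided peeling is fine
  PP.PeelCount = 0;                 // no forced peel count
}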
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate the loop's compare-and-branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
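Finally, a hedged sketch of a target-style hook adjusting a few of these unrolling knobs; the numbers are illustrative and not taken from any in-tree target.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;               // allow partial unrolling
  UP.Runtime = true;               // and runtime unrolling
  UP.UpperBound = true;            // may use the trip-count upper bound
  UP.PartialThreshold = 200;       // budget for partial/runtime unrolling
  UP.OptSizeThreshold = 0;         // no unrolling when optimizing for size
  UP.PartialOptSizeThreshold = 0;
}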