Bug Summary

File: llvm/include/llvm/CodeGen/BasicTTIImpl.h
Warning: line 1076, column 11
Called C++ object pointer is null
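
The trace below reaches this warning through ARMTTIImpl::getCmpSelInstrCost: CondTy can be null when a bare icmp/fcmp is being costed, the MVE branch builds a local VecCondTy instead of replacing the null argument, and the null CondTy is then forwarded to BaseT::getCmpSelInstrCost (line 984), which dereferences it in BasicTTIImpl.h at line 1076. Below is a reduced, self-contained C++ sketch of that shape; the types and bodies are simplified stand-ins for illustration only, not the real LLVM classes.

    #include <cstdio>

    struct Type {
      bool Vector;
      bool isVectorTy() const { return Vector; }
    };

    struct BaseTTI {
      // Stands in for BasicTTIImplBase::getCmpSelInstrCost, which assumes a
      // non-null CondTy and dereferences it (the line flagged by the analyzer).
      int getCmpSelInstrCost(Type * /*ValTy*/, Type *CondTy) {
        return CondTy->isVectorTy() ? 2 : 1; // null dereference if CondTy == nullptr
      }
    };

    struct ARMTTI : BaseTTI {
      // Stands in for ARMTTIImpl::getCmpSelInstrCost: the MVE compare path can
      // fall through without ever replacing a null CondTy before delegating.
      int getCmpSelInstrCost(Type *ValTy, Type *CondTy) {
        return BaseTTI::getCmpSelInstrCost(ValTy, CondTy);
      }
    };

    int main() {
      Type ValTy{true};
      ARMTTI TTI;
      // Callers costing a standalone compare can legitimately pass a null CondTy.
      std::printf("%d\n", TTI.getCmpSelInstrCost(&ValTy, nullptr)); // crashes here
    }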

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include <algorithm>
37#include <cassert>
38#include <cstdint>
39#include <utility>
40
41using namespace llvm;
42
43#define DEBUG_TYPE "armtti"
44
45static cl::opt<bool> EnableMaskedLoadStores(
46 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47 cl::desc("Enable the generation of masked loads and stores"));
48
49static cl::opt<bool> DisableLowOverheadLoops(
50 "disable-arm-loloops", cl::Hidden, cl::init(false),
51 cl::desc("Disable the generation of low-overhead loops"));
52
53static cl::opt<bool>
54 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55 cl::desc("Enable the generation of WLS loops"));
56
57extern cl::opt<TailPredication::Mode> EnableTailPredication;
58
59extern cl::opt<bool> EnableMaskedGatherScatters;
60
61extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62
63/// Convert a vector load intrinsic into a simple llvm load instruction.
64/// This is beneficial when the underlying object being addressed comes
65/// from a constant, since we get constant-folding for free.
66static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67 InstCombiner::BuilderTy &Builder) {
68 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69
70 if (!IntrAlign)
71 return nullptr;
72
73 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74 ? MemAlign
75 : IntrAlign->getLimitedValue();
76
77 if (!isPowerOf2_32(Alignment))
78 return nullptr;
79
80 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81 PointerType::get(II.getType(), 0));
82 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83}
84
85bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86 const Function *Callee) const {
87 const TargetMachine &TM = getTLI()->getTargetMachine();
88 const FeatureBitset &CallerBits =
89 TM.getSubtargetImpl(*Caller)->getFeatureBits();
90 const FeatureBitset &CalleeBits =
91 TM.getSubtargetImpl(*Callee)->getFeatureBits();
92
93 // To inline a callee, all features not in the allowed list must match exactly.
94 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95 (CalleeBits & ~InlineFeaturesAllowed);
96 // For features in the allowed list, the callee's features must be a subset of
97 // the callers'.
98 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99 (CalleeBits & InlineFeaturesAllowed);
100 return MatchExact && MatchSubset;
101}
102
103TTI::AddressingModeKind
104ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 if (ST->hasMVEIntegerOps())
107 return TTI::AMK_PostIndexed;
108
109 if (L->getHeader()->getParent()->hasOptSize())
110 return TTI::AMK_None;
111
112 if (ST->isMClass() && ST->isThumb2() &&
113 L->getNumBlocks() == 1)
114 return TTI::AMK_PreIndexed;
115
116 return TTI::AMK_None;
117}
118
119Optional<Instruction *>
120ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121 using namespace PatternMatch;
122 Intrinsic::ID IID = II.getIntrinsicID();
123 switch (IID) {
124 default:
125 break;
126 case Intrinsic::arm_neon_vld1: {
127 Align MemAlign =
128 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129 &IC.getAssumptionCache(), &IC.getDominatorTree());
130 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131 return IC.replaceInstUsesWith(II, V);
132 }
133 break;
134 }
135
136 case Intrinsic::arm_neon_vld2:
137 case Intrinsic::arm_neon_vld3:
138 case Intrinsic::arm_neon_vld4:
139 case Intrinsic::arm_neon_vld2lane:
140 case Intrinsic::arm_neon_vld3lane:
141 case Intrinsic::arm_neon_vld4lane:
142 case Intrinsic::arm_neon_vst1:
143 case Intrinsic::arm_neon_vst2:
144 case Intrinsic::arm_neon_vst3:
145 case Intrinsic::arm_neon_vst4:
146 case Intrinsic::arm_neon_vst2lane:
147 case Intrinsic::arm_neon_vst3lane:
148 case Intrinsic::arm_neon_vst4lane: {
149 Align MemAlign =
150 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151 &IC.getAssumptionCache(), &IC.getDominatorTree());
152 unsigned AlignArg = II.getNumArgOperands() - 1;
153 Value *AlignArgOp = II.getArgOperand(AlignArg);
154 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155 if (Align && *Align < MemAlign) {
156 return IC.replaceOperand(
157 II, AlignArg,
158 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159 false));
160 }
161 break;
162 }
163
164 case Intrinsic::arm_mve_pred_i2v: {
165 Value *Arg = II.getArgOperand(0);
166 Value *ArgArg;
167 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168 PatternMatch::m_Value(ArgArg))) &&
169 II.getType() == ArgArg->getType()) {
170 return IC.replaceInstUsesWith(II, ArgArg);
171 }
172 Constant *XorMask;
173 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174 PatternMatch::m_Value(ArgArg)),
175 PatternMatch::m_Constant(XorMask))) &&
176 II.getType() == ArgArg->getType()) {
177 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178 if (CI->getValue().trunc(16).isAllOnesValue()) {
179 auto TrueVector = IC.Builder.CreateVectorSplat(
180 cast<FixedVectorType>(II.getType())->getNumElements(),
181 IC.Builder.getTrue());
182 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183 }
184 }
185 }
186 KnownBits ScalarKnown(32);
187 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188 ScalarKnown, 0)) {
189 return &II;
190 }
191 break;
192 }
193 case Intrinsic::arm_mve_pred_v2i: {
194 Value *Arg = II.getArgOperand(0);
195 Value *ArgArg;
196 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197 PatternMatch::m_Value(ArgArg)))) {
198 return IC.replaceInstUsesWith(II, ArgArg);
199 }
200 if (!II.getMetadata(LLVMContext::MD_range)) {
201 Type *IntTy32 = Type::getInt32Ty(II.getContext());
202 Metadata *M[] = {
203 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206 return &II;
207 }
208 break;
209 }
210 case Intrinsic::arm_mve_vadc:
211 case Intrinsic::arm_mve_vadc_predicated: {
212 unsigned CarryOp =
213 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215 "Bad type for intrinsic!");
216
217 KnownBits CarryKnown(32);
218 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219 CarryKnown)) {
220 return &II;
221 }
222 break;
223 }
224 case Intrinsic::arm_mve_vmldava: {
225 Instruction *I = cast<Instruction>(&II);
226 if (I->hasOneUse()) {
227 auto *User = cast<Instruction>(*I->user_begin());
228 Value *OpZ;
229 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230 match(I->getOperand(3), m_Zero())) {
231 Value *OpX = I->getOperand(4);
232 Value *OpY = I->getOperand(5);
233 Type *OpTy = OpX->getType();
234
235 IC.Builder.SetInsertPoint(User);
236 Value *V =
237 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238 {I->getOperand(0), I->getOperand(1),
239 I->getOperand(2), OpZ, OpX, OpY});
240
241 IC.replaceInstUsesWith(*User, V);
242 return IC.eraseInstFromFunction(*User);
243 }
244 }
245 return None;
246 }
247 }
248 return None;
249}
250
251InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252 TTI::TargetCostKind CostKind) {
253 assert(Ty->isIntegerTy());
254
255 unsigned Bits = Ty->getPrimitiveSizeInBits();
256 if (Bits == 0 || Imm.getActiveBits() >= 64)
257 return 4;
258
259 int64_t SImmVal = Imm.getSExtValue();
260 uint64_t ZImmVal = Imm.getZExtValue();
261 if (!ST->isThumb()) {
262 if ((SImmVal >= 0 && SImmVal < 65536) ||
263 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265 return 1;
266 return ST->hasV6T2Ops() ? 2 : 3;
267 }
268 if (ST->isThumb2()) {
269 if ((SImmVal >= 0 && SImmVal < 65536) ||
270 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272 return 1;
273 return ST->hasV6T2Ops() ? 2 : 3;
274 }
275 // Thumb1, any i8 imm cost 1.
276 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277 return 1;
278 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279 return 2;
280 // Load from constantpool.
281 return 3;
282}
283
284// Constants smaller than 256 fit in the immediate field of
285// Thumb1 instructions, so we return a cost of zero, and 1 otherwise.
286InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287 const APInt &Imm, Type *Ty) {
288 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289 return 0;
290
291 return 1;
292}
293
294// Checks whether Inst is part of a min(max()) or max(min()) pattern
295// that will match to an SSAT instruction
296static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297 Value *LHS, *RHS;
298 ConstantInt *C;
299 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300
301 if (InstSPF == SPF_SMAX &&
302 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303 C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304
305 auto isSSatMin = [&](Value *MinInst) {
306 if (isa<SelectInst>(MinInst)) {
307 Value *MinLHS, *MinRHS;
308 ConstantInt *MinC;
309 SelectPatternFlavor MinSPF =
310 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311 if (MinSPF == SPF_SMIN &&
312 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313 MinC->getValue() == ((-Imm) - 1))
314 return true;
315 }
316 return false;
317 };
318
319 if (isSSatMin(Inst->getOperand(1)) ||
320 (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321 isSSatMin(*(++Inst->user_begin())))))
322 return true;
323 }
324 return false;
325}
326
327InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328 const APInt &Imm, Type *Ty,
329 TTI::TargetCostKind CostKind,
330 Instruction *Inst) {
331 // Division by a constant can be turned into multiplication, but only if we
332 // know it's constant. So it's not so much that the immediate is cheap (it's
333 // not), but that the alternative is worse.
334 // FIXME: this is probably unneeded with GlobalISel.
335 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337 Idx == 1)
338 return 0;
339
340 // Leave any gep offsets for CodeGenPrepare, which will do a better job at
341 // splitting any large offsets.
342 if (Opcode == Instruction::GetElementPtr && Idx != 0)
343 return 0;
344
345 if (Opcode == Instruction::And) {
346 // UXTB/UXTH
347 if (Imm == 255 || Imm == 65535)
348 return 0;
349 // Conversion to BIC is free, and means we can use ~Imm instead.
350 return std::min(getIntImmCost(Imm, Ty, CostKind),
351 getIntImmCost(~Imm, Ty, CostKind));
352 }
353
354 if (Opcode == Instruction::Add)
355 // Conversion to SUB is free, and means we can use -Imm instead.
356 return std::min(getIntImmCost(Imm, Ty, CostKind),
357 getIntImmCost(-Imm, Ty, CostKind));
358
359 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
360 Ty->getIntegerBitWidth() == 32) {
361 int64_t NegImm = -Imm.getSExtValue();
362 if (ST->isThumb2() && NegImm < 1<<12)
363 // icmp X, #-C -> cmn X, #C
364 return 0;
365 if (ST->isThumb() && NegImm < 1<<8)
366 // icmp X, #-C -> adds X, #C
367 return 0;
368 }
369
370 // xor a, -1 can always be folded to MVN
371 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
372 return 0;
373
374 // Ensure that negative constants of min(max()) or max(min()) patterns that
375 // match to SSAT instructions don't get hoisted.
376 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
377 Ty->getIntegerBitWidth() <= 32) {
378 if (isSSATMinMaxPattern(Inst, Imm) ||
379 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
380 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
381 return 0;
382 }
383
384 return getIntImmCost(Imm, Ty, CostKind);
385}
386
387InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
388 TTI::TargetCostKind CostKind,
389 const Instruction *I) {
390 if (CostKind == TTI::TCK_RecipThroughput &&
391 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
392 // FIXME: The vectorizer is highly sensitive to the cost of these
393 // instructions, which suggests that it may be using the costs incorrectly.
394 // But, for now, just make them free to avoid performance regressions for
395 // vector targets.
396 return 0;
397 }
398 return BaseT::getCFInstrCost(Opcode, CostKind, I);
399}
400
401InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
402 Type *Src,
403 TTI::CastContextHint CCH,
404 TTI::TargetCostKind CostKind,
405 const Instruction *I) {
406 int ISD = TLI->InstructionOpcodeToISD(Opcode);
407 assert(ISD && "Invalid opcode");
408
409 // TODO: Allow non-throughput costs that aren't binary.
410 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
411 if (CostKind != TTI::TCK_RecipThroughput)
412 return Cost == 0 ? 0 : 1;
413 return Cost;
414 };
415 auto IsLegalFPType = [this](EVT VT) {
416 EVT EltVT = VT.getScalarType();
417 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
418 (EltVT == MVT::f64 && ST->hasFP64()) ||
419 (EltVT == MVT::f16 && ST->hasFullFP16());
420 };
421
422 EVT SrcTy = TLI->getValueType(DL, Src);
423 EVT DstTy = TLI->getValueType(DL, Dst);
424
425 if (!SrcTy.isSimple() || !DstTy.isSimple())
426 return AdjustCost(
427 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
428
429 // Extending masked loads/truncating masked stores is expensive because we
430 // currently don't split them. This means that we'll likely end up
431 // loading/storing each element individually (hence the high cost).
432 if ((ST->hasMVEIntegerOps() &&
433 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
434 Opcode == Instruction::SExt)) ||
435 (ST->hasMVEFloatOps() &&
436 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
437 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
438 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
439 return 2 * DstTy.getVectorNumElements() *
440 ST->getMVEVectorCostFactor(CostKind);
441
442 // The extend of other kinds of load is free
443 if (CCH == TTI::CastContextHint::Normal ||
444 CCH == TTI::CastContextHint::Masked) {
445 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
446 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
447 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
448 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
449 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
450 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
451 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
452 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
453 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
454 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
455 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
456 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
457 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
458 };
459 if (const auto *Entry = ConvertCostTableLookup(
460 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
461 return AdjustCost(Entry->Cost);
462
463 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
464 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
465 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
466 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
467 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
468 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
469 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
470 // The following extend from a legal type to an illegal type, so we need to
471 // split the load. This introduces an extra load operation, but the
472 // extend is still "free".
473 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
474 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
475 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
476 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
477 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
478 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
479 };
480 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
481 if (const auto *Entry =
482 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
483 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
484 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
485 }
486
487 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
488 // FPExtends are similar but also require the VCVT instructions.
489 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
490 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
491 };
492 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
493 if (const auto *Entry =
494 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
495 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
496 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
497 }
498
499 // The truncate of a store is free. This is the mirror of extends above.
500 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
501 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
502 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
503 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
504 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
505 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
506 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
507 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
508 };
509 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
510 if (const auto *Entry =
511 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
512 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
513 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
514 }
515
516 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
517 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
518 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
519 };
520 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
521 if (const auto *Entry =
522 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
523 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
524 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
525 }
526 }
527
528 // NEON vector operations that can extend their inputs.
529 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
530 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
531 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
532 // vaddl
533 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
534 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
535 // vsubl
536 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
537 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
538 // vmull
539 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
540 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
541 // vshll
542 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
543 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
544 };
545
546 auto *User = cast<Instruction>(*I->user_begin());
547 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
548 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
549 DstTy.getSimpleVT(),
550 SrcTy.getSimpleVT())) {
551 return AdjustCost(Entry->Cost);
552 }
553 }
554
555 // Single to/from double precision conversions.
556 if (Src->isVectorTy() && ST->hasNEON() &&
557 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
558 DstTy.getScalarType() == MVT::f32) ||
559 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
560 DstTy.getScalarType() == MVT::f64))) {
561 static const CostTblEntry NEONFltDblTbl[] = {
562 // Vector fptrunc/fpext conversions.
563 {ISD::FP_ROUND, MVT::v2f64, 2},
564 {ISD::FP_EXTEND, MVT::v2f32, 2},
565 {ISD::FP_EXTEND, MVT::v4f32, 4}};
566
567 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
568 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
569 return AdjustCost(LT.first * Entry->Cost);
570 }
571
572 // Some arithmetic, load and store operations have specific instructions
573 // to cast up/down their types automatically at no extra cost.
574 // TODO: Get these tables to know at least what the related operations are.
575 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
576 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
577 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
578 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
579 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
580 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
581 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
582
583 // The number of vmovl instructions for the extension.
584 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
585 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
586 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
587 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
588 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
589 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
590 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
591 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
592 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
594 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
595 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
596 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
597 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
598 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
599 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
600 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
601 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
602
603 // Operations that we legalize using splitting.
604 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
605 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
606
607 // Vector float <-> i32 conversions.
608 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
609 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
610
611 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
612 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
613 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
614 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
615 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
616 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
617 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
618 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
619 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
620 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
621 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
622 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
623 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
624 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
625 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
626 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
627 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
628 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
629 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
630 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
631
632 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
633 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
634 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
635 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
636 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
637 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
638
639 // Vector double <-> i32 conversions.
640 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
641 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
642
643 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
644 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
645 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
646 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
647 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
648 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
649
650 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
651 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
652 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
653 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
654 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
655 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
656 };
657
658 if (SrcTy.isVector() && ST->hasNEON()) {
659 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
660 DstTy.getSimpleVT(),
661 SrcTy.getSimpleVT()))
662 return AdjustCost(Entry->Cost);
663 }
664
665 // Scalar float to integer conversions.
666 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
667 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
668 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
669 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
670 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
671 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
672 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
673 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
674 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
675 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
676 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
677 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
678 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
679 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
680 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
681 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
682 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
683 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
684 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
685 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
686 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
687 };
688 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
689 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
690 DstTy.getSimpleVT(),
691 SrcTy.getSimpleVT()))
692 return AdjustCost(Entry->Cost);
693 }
694
695 // Scalar integer to float conversions.
696 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
697 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
698 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
699 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
700 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
701 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
702 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
703 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
704 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
705 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
706 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
707 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
708 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
709 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
710 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
711 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
712 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
713 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
715 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
716 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
717 };
718
719 if (SrcTy.isInteger() && ST->hasNEON()) {
720 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
721 ISD, DstTy.getSimpleVT(),
722 SrcTy.getSimpleVT()))
723 return AdjustCost(Entry->Cost);
724 }
725
726 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
727 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
728 // are linearised so take more.
729 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
730 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
731 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
732 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
733 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
734 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
735 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
736 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
737 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
738 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
739 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
740 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
741 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
742 };
743
744 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
745 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
746 ISD, DstTy.getSimpleVT(),
747 SrcTy.getSimpleVT()))
748 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
749 }
750
751 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
752 // As a general rule, fp converts that were not matched above are scalarized
753 // and cost 1 vcvt for each lane, so long as the instruction is available.
754 // If not, it will become a series of function calls.
755 const InstructionCost CallCost =
756 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
757 int Lanes = 1;
758 if (SrcTy.isFixedLengthVector())
759 Lanes = SrcTy.getVectorNumElements();
760
761 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
762 return Lanes;
763 else
764 return Lanes * CallCost;
765 }
766
767 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
768 SrcTy.isFixedLengthVector()) {
769 // Treat a truncate with larger than legal source (128bits for MVE) as
770 // expensive, 2 instructions per lane.
771 if ((SrcTy.getScalarType() == MVT::i8 ||
772 SrcTy.getScalarType() == MVT::i16 ||
773 SrcTy.getScalarType() == MVT::i32) &&
774 SrcTy.getSizeInBits() > 128 &&
775 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
776 return SrcTy.getVectorNumElements() * 2;
777 }
778
779 // Scalar integer conversion costs.
780 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
781 // i16 -> i64 requires two dependent operations.
782 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
783
784 // Truncates on i64 are assumed to be free.
785 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
786 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
787 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
788 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
789 };
790
791 if (SrcTy.isInteger()) {
792 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
793 DstTy.getSimpleVT(),
794 SrcTy.getSimpleVT()))
795 return AdjustCost(Entry->Cost);
796 }
797
798 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
799 ? ST->getMVEVectorCostFactor(CostKind)
800 : 1;
801 return AdjustCost(
802 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
803}
804
805InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
806 unsigned Index) {
807 // Penalize inserting into a D-subregister. We end up with a three times
808 // lower estimated throughput on Swift.
809 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
810 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
811 return 3;
812
813 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
814 Opcode == Instruction::ExtractElement)) {
815 // Cross-class copies are expensive on many microarchitectures,
816 // so assume they are expensive by default.
817 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
818 return 3;
819
820 // Even if it's not a cross class copy, this likely leads to mixing
821 // of NEON and VFP code and should be therefore penalized.
822 if (ValTy->isVectorTy() &&
823 ValTy->getScalarSizeInBits() <= 32)
824 return std::max<InstructionCost>(
825 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
826 }
827
828 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
829 Opcode == Instruction::ExtractElement)) {
830 // Integer cross-lane moves are more expensive than float, which can
831 // sometimes just be vmovs. Integers involve being passed to GPR registers,
832 // causing more of a delay.
833 std::pair<InstructionCost, MVT> LT =
834 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
835 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
836 }
837
838 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
839}
840
841InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
842 Type *CondTy,
843 CmpInst::Predicate VecPred,
844 TTI::TargetCostKind CostKind,
845 const Instruction *I) {
846 int ISD = TLI->InstructionOpcodeToISD(Opcode);
847
848 // Thumb scalar code size cost for select.
849 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
40
Assuming 'CostKind' is not equal to TCK_CodeSize
850 ST->isThumb() && !ValTy->isVectorTy()) {
851 // Assume expensive structs.
852 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
853 return TTI::TCC_Expensive;
854
855 // Select costs can vary because they:
856 // - may require one or more conditional mov (including an IT),
857 // - can't operate directly on immediates,
858 // - require live flags, which we can't copy around easily.
859 InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
860
861 // Possible IT instruction for Thumb2, or more for Thumb1.
862 ++Cost;
863
864 // i1 values may need rematerialising by using mov immediates and/or
865 // flag setting instructions.
866 if (ValTy->isIntegerTy(1))
867 ++Cost;
868
869 return Cost;
870 }
871
872 // If this is a vector min/max/abs, use the cost of that intrinsic directly
873 // instead. Hopefully when min/max intrinsics are more prevalent this code
874 // will not be needed.
875 const Instruction *Sel = I;
876 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
40.1
'Opcode' is equal to ICmp
40.2
'Sel' is null
877 Sel->hasOneUse())
878 Sel = cast<Instruction>(Sel->user_back());
879 if (Sel && ValTy->isVectorTy() &&
40.3
'Sel' is null
880 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
881 const Value *LHS, *RHS;
882 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
883 unsigned IID = 0;
884 switch (SPF) {
885 case SPF_ABS:
886 IID = Intrinsic::abs;
887 break;
888 case SPF_SMIN:
889 IID = Intrinsic::smin;
890 break;
891 case SPF_SMAX:
892 IID = Intrinsic::smax;
893 break;
894 case SPF_UMIN:
895 IID = Intrinsic::umin;
896 break;
897 case SPF_UMAX:
898 IID = Intrinsic::umax;
899 break;
900 case SPF_FMINNUM:
901 IID = Intrinsic::minnum;
902 break;
903 case SPF_FMAXNUM:
904 IID = Intrinsic::maxnum;
905 break;
906 default:
907 break;
908 }
909 if (IID) {
910 // The ICmp is free, the select gets the cost of the min/max/etc
911 if (Sel != I)
912 return 0;
913 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
914 return getIntrinsicInstrCost(CostAttrs, CostKind);
915 }
916 }
917
918 // On NEON a vector select gets lowered to vbsl.
919 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
41
Assuming the condition is false
920 // Lowering of some vector selects is currently far from perfect.
921 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
922 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
923 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
924 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
925 };
926
927 EVT SelCondTy = TLI->getValueType(DL, CondTy);
928 EVT SelValTy = TLI->getValueType(DL, ValTy);
929 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
930 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
931 SelCondTy.getSimpleVT(),
932 SelValTy.getSimpleVT()))
933 return Entry->Cost;
934 }
935
936 std::pair<InstructionCost, MVT> LT =
937 TLI->getTypeLegalizationCost(DL, ValTy);
938 return LT.first;
939 }
940
941 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
42
Assuming the condition is true
43
Calling 'Type::isVectorTy'
47
Returning from 'Type::isVectorTy'
50
Taking true branch
942 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
47.1
'Opcode' is equal to ICmp
943 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
48
'ValTy' is a 'FixedVectorType'
49
Assuming the condition is true
944 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
51
'ValTy' is a 'FixedVectorType'
945 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
52
Assuming pointer value is null
53
Assuming null pointer is passed into cast
946 if (!VecCondTy)
53.1
'VecCondTy' is null
54
Taking true branch
947 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
55
The object is a 'FixedVectorType'
948
949 // If we don't have mve.fp any fp operations will need to be scalarized.
950 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
55.1
'Opcode' is not equal to FCmp
951 // One scalarization insert, one scalarization extract and the cost of the
952 // fcmps.
953 return BaseT::getScalarizationOverhead(VecValTy, false, true) +
954 BaseT::getScalarizationOverhead(VecCondTy, true, false) +
955 VecValTy->getNumElements() *
956 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
957 VecCondTy->getScalarType(), VecPred, CostKind,
958 I);
959 }
960
961 std::pair<InstructionCost, MVT> LT =
962 TLI->getTypeLegalizationCost(DL, ValTy);
963 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
964 // There are two types - the input that specifies the type of the compare
965 // and the output vXi1 type. Because we don't know how the output will be
966 // split, we may need an expensive shuffle to get two in sync. This has the
967 // effect of making larger than legal compares (v8i32 for example)
968 // expensive.
969 if (LT.second.getVectorNumElements() > 2) {
56
Assuming the condition is false
57
Taking false branch
970 if (LT.first > 1)
971 return LT.first * BaseCost +
972 BaseT::getScalarizationOverhead(VecCondTy, true, false);
973 return BaseCost;
974 }
975 }
976
977 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
978 // for "multiple beats" potentially needed by MVE instructions.
979 int BaseCost = 1;
980 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
58
Taking false branch
981 BaseCost = ST->getMVEVectorCostFactor(CostKind);
982
983 return BaseCost *
984 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
59
Passing null pointer value via 3rd parameter 'CondTy'
60
Calling 'BasicTTIImplBase::getCmpSelInstrCost'
985}
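
One way the delegation at line 984 could avoid handing a null pointer to the base implementation is to synthesise the compare result type first, as line 947 already does for VecCondTy. The snippet below is only a sketch of such a guard inside ARMTTIImpl::getCmpSelInstrCost, written under that assumption; it is not the actual upstream patch.

      // Hypothetical guard before the final call in getCmpSelInstrCost;
      // CmpInst::makeCmpResultType is the same helper used at line 947 above.
      if (!CondTy && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp))
        CondTy = CmpInst::makeCmpResultType(ValTy);
      return BaseCost *
             BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);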
986
987InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
988 ScalarEvolution *SE,
989 const SCEV *Ptr) {
990 // Address computations in vectorized code with non-consecutive addresses will
991 // likely result in more instructions compared to scalar code where the
992 // computation can more often be merged into the index mode. The resulting
993 // extra micro-ops can significantly decrease throughput.
994 unsigned NumVectorInstToHideOverhead = 10;
995 int MaxMergeDistance = 64;
996
997 if (ST->hasNEON()) {
998 if (Ty->isVectorTy() && SE &&
999 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1000 return NumVectorInstToHideOverhead;
1001
1002 // In many cases the address computation is not merged into the instruction
1003 // addressing mode.
1004 return 1;
1005 }
1006 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1007}
1008
1009bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1010 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1011 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1012 // optimized, else LSR may block tail-predication.
1013 switch (II->getIntrinsicID()) {
1014 case Intrinsic::arm_mve_vctp8:
1015 case Intrinsic::arm_mve_vctp16:
1016 case Intrinsic::arm_mve_vctp32:
1017 case Intrinsic::arm_mve_vctp64:
1018 return true;
1019 default:
1020 break;
1021 }
1022 }
1023 return false;
1024}
1025
1026bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1027 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1028 return false;
1029
1030 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1031 // Don't support v2i1 yet.
1032 if (VecTy->getNumElements() == 2)
1033 return false;
1034
1035 // We don't support extending fp types.
1036 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1037 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1038 return false;
1039 }
1040
1041 unsigned EltWidth = DataTy->getScalarSizeInBits();
1042 return (EltWidth == 32 && Alignment >= 4) ||
1043 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1044}
1045
1046bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1047 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1048 return false;
1049
1050 // This method is called in 2 places:
1051 // - from the vectorizer with a scalar type, in which case we need to get
1052 // this as good as we can with the limited info we have (and rely on the cost
1053 // model for the rest).
1054 // - from the masked intrinsic lowering pass with the actual vector type.
1055 // For MVE, we have a custom lowering pass that will already have custom
1056 // legalised any gathers that we can to MVE intrinsics, and want to expand all
1057 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1058 // are here, we know we want to expand.
1059 if (isa<VectorType>(Ty))
1060 return false;
1061
1062 unsigned EltWidth = Ty->getScalarSizeInBits();
1063 return ((EltWidth == 32 && Alignment >= 4) ||
1064 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1065}
1066
1067/// Given a memcpy/memset/memmove instruction, return the number of memory
1068/// operations performed, by querying findOptimalMemOpLowering. Returns -1 if a
1069/// call is used.
1070int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1071 MemOp MOp;
1072 unsigned DstAddrSpace = ~0u;
1073 unsigned SrcAddrSpace = ~0u;
1074 const Function *F = I->getParent()->getParent();
1075
1076 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1077 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1078 // If 'size' is not a constant, a library call will be generated.
1079 if (!C)
1080 return -1;
1081
1082 const unsigned Size = C->getValue().getZExtValue();
1083 const Align DstAlign = *MC->getDestAlign();
1084 const Align SrcAlign = *MC->getSourceAlign();
1085
1086 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1087 /*IsVolatile*/ false);
1088 DstAddrSpace = MC->getDestAddressSpace();
1089 SrcAddrSpace = MC->getSourceAddressSpace();
1090 }
1091 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1092 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1093 // If 'size' is not a constant, a library call will be generated.
1094 if (!C)
1095 return -1;
1096
1097 const unsigned Size = C->getValue().getZExtValue();
1098 const Align DstAlign = *MS->getDestAlign();
1099
1100 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1101 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1102 DstAddrSpace = MS->getDestAddressSpace();
1103 }
1104 else
1105 llvm_unreachable("Expected a memcpy/move or memset!");
1106
1107 unsigned Limit, Factor = 2;
1108 switch(I->getIntrinsicID()) {
1109 case Intrinsic::memcpy:
1110 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1111 break;
1112 case Intrinsic::memmove:
1113 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1114 break;
1115 case Intrinsic::memset:
1116 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1117 Factor = 1;
1118 break;
1119 default:
1120 llvm_unreachable("Expected a memcpy/move or memset!");
1121 }
1122
1123 // MemOps will be populated with a list of data types that need to be
1124 // loaded and stored. That's why we multiply the number of elements by 2 to
1125 // get the cost for this memcpy.
1126 std::vector<EVT> MemOps;
1127 if (getTLI()->findOptimalMemOpLowering(
1128 MemOps, Limit, MOp, DstAddrSpace,
1129 SrcAddrSpace, F->getAttributes()))
1130 return MemOps.size() * Factor;
1131
1132 // If we can't find an optimal memop lowering, return the default cost
1133 return -1;
1134}
1135
1136InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1137 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1138
1139 // To model the cost of a library call, we assume 1 for the call, and
1140 // 3 for the argument setup.
1141 if (NumOps == -1)
1142 return 4;
1143 return NumOps;
1144}
1145
1146InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1147 VectorType *Tp, ArrayRef<int> Mask,
1148 int Index, VectorType *SubTp) {
1149 Kind = improveShuffleKindFromMask(Kind, Mask);
1150 if (ST->hasNEON()) {
1151 if (Kind == TTI::SK_Broadcast) {
1152 static const CostTblEntry NEONDupTbl[] = {
1153 // VDUP handles these cases.
1154 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1155 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1156 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1157 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1158 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1159 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1160
1161 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1162 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1163 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1164 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1165
1166 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1167 if (const auto *Entry =
1168 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1169 return LT.first * Entry->Cost;
1170 }
1171 if (Kind == TTI::SK_Reverse) {
1172 static const CostTblEntry NEONShuffleTbl[] = {
1173 // Reverse shuffles cost one instruction if we are shuffling within a
1174 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1175 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1176 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1177 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1178 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1179 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1180 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1181
1182 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1183 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1184 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1185 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1186
1187 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1188 if (const auto *Entry =
1189 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1190 return LT.first * Entry->Cost;
1191 }
1192 if (Kind == TTI::SK_Select) {
1193 static const CostTblEntry NEONSelShuffleTbl[] = {
1194 // Select shuffle cost table for ARM. Cost is the number of
1195 // instructions
1196 // required to create the shuffled vector.
1197
1198 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1199 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1200 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1201 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1202
1203 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1204 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1205 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1206
1207 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1208
1209 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1210
1211 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1212 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1213 ISD::VECTOR_SHUFFLE, LT.second))
1214 return LT.first * Entry->Cost;
1215 }
1216 }
1217 if (ST->hasMVEIntegerOps()) {
1218 if (Kind == TTI::SK_Broadcast) {
1219 static const CostTblEntry MVEDupTbl[] = {
1220 // VDUP handles these cases.
1221 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1222 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1223 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1224 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1225 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1226
1227 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1228 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1229 LT.second))
1230 return LT.first * Entry->Cost *
1231 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1232 }
1233
1234 if (!Mask.empty()) {
1235 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1236 if (Mask.size() <= LT.second.getVectorNumElements() &&
1237 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1238 isVREVMask(Mask, LT.second, 64)))
1239 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1240 }
1241 }
1242
1243 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1244 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1245 : 1;
1246 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1247}
1248
1249InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1250 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1251 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1252 TTI::OperandValueProperties Opd1PropInfo,
1253 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1254 const Instruction *CxtI) {
1255 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1256 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1257 // Make operations on i1 relatively expensive as this often involves
1258 // combining predicates. AND and XOR should be easier to handle with IT
1259 // blocks.
1260 switch (ISDOpcode) {
1261 default:
1262 break;
1263 case ISD::AND:
1264 case ISD::XOR:
1265 return 2;
1266 case ISD::OR:
1267 return 3;
1268 }
1269 }
1270
1271 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1272
1273 if (ST->hasNEON()) {
1274 const unsigned FunctionCallDivCost = 20;
1275 const unsigned ReciprocalDivCost = 10;
1276 static const CostTblEntry CostTbl[] = {
1277 // Division.
1278 // These costs are somewhat random. Choose a cost of 20 to indicate that
1279 // vectorizing division (added function call) is going to be very expensive.
1280 // Double registers types.
1281 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1282 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1283 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1284 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1285 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1286 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1287 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1288 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1289 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1290 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1291 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1292 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1293 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1294 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1295 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1296 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1297 // Quad register types.
1298 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1299 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1300 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1301 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1302 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1303 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1304 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1305 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1306 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1307 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1308 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1309 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1310 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1311 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1312 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1313 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1314 // Multiplication.
1315 };
1316
1317 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1318 return LT.first * Entry->Cost;
1319
1320 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1321 Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1322
1323 // This is somewhat of a hack. The problem that we are facing is that SROA
1324 // creates a sequence of shift, and, or instructions to construct values.
1325 // These sequences are recognized by the ISel and have zero-cost. Not so for
1326 // the vectorized code. Because we have support for v2i64 but not i64 those
1327 // sequences look particularly beneficial to vectorize.
1328 // To work around this we increase the cost of v2i64 operations to make them
1329 // seem less beneficial.
1330 if (LT.second == MVT::v2i64 &&
1331 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1332 Cost += 4;
1333
1334 return Cost;
1335 }
1336
1337 // If this operation is a shift on arm/thumb2, it might well be folded into
1338 // the following instruction, hence having a cost of 0.
1339 auto LooksLikeAFreeShift = [&]() {
1340 if (ST->isThumb1Only() || Ty->isVectorTy())
1341 return false;
1342
1343 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1344 return false;
1345 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1346 return false;
1347
1348 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1349 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1350 case Instruction::Add:
1351 case Instruction::Sub:
1352 case Instruction::And:
1353 case Instruction::Xor:
1354 case Instruction::Or:
1355 case Instruction::ICmp:
1356 return true;
1357 default:
1358 return false;
1359 }
1360 };
1361 if (LooksLikeAFreeShift())
1362 return 0;
1363
1364 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1365 // for "multiple beats" potentially needed by MVE instructions.
1366 int BaseCost = 1;
1367 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1368 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1369
1370 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1371 // without treating floats as more expensive than scalars or increasing the
1372 // costs for custom operations. The result is also multiplied by the
1373 // MVEVectorCostFactor where appropriate.
1374 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1375 return LT.first * BaseCost;
1376
1377 // Else this is expand, assume that we need to scalarize this op.
1378 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1379 unsigned Num = VTy->getNumElements();
1380 InstructionCost Cost =
1381 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1382 // Return the cost of multiple scalar invocations plus the cost of
1383 // inserting and extracting the values.
1384 SmallVector<Type *> Tys(Args.size(), Ty);
1385 return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1386 }
1387
1388 return BaseCost;
1389}
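
As a rough standalone illustration of the NEON division pricing above (assumed values only, not the LLVM API), a vectorized sdiv is charged one call-like cost per lane, scaled by the type-legalization factor:

    #include <cstdio>

    int main() {
      const unsigned FunctionCallDivCost = 20; // per-lane "this becomes a call" penalty
      unsigned LTFirst = 1;                    // assumed type-legalization factor
      unsigned Lanes = 4;                      // e.g. a v4i32 sdiv -> 4 scalar divisions
      unsigned Cost = LTFirst * Lanes * FunctionCallDivCost;
      std::printf("modelled v4i32 sdiv cost: %u\n", Cost); // 80, strongly discourages vectorizing
      return 0;
    }
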
1390
1391InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1392 MaybeAlign Alignment,
1393 unsigned AddressSpace,
1394 TTI::TargetCostKind CostKind,
1395 const Instruction *I) {
1396 // TODO: Handle other cost kinds.
1397 if (CostKind != TTI::TCK_RecipThroughput)
1398 return 1;
1399
1400 // Type legalization can't handle structs
1401 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1402 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1403 CostKind);
1404
1405 if (ST->hasNEON() && Src->isVectorTy() &&
1406 (Alignment && *Alignment != Align(16)) &&
1407 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1408 // Unaligned loads/stores are extremely inefficient.
1409 // We need 4 uops for vst1/vld1 vs 1 uop for vldr/vstr.
1410 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1411 return LT.first * 4;
1412 }
1413
1414 // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1415 // Same for stores.
1416 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1417 ((Opcode == Instruction::Load && I->hasOneUse() &&
1418 isa<FPExtInst>(*I->user_begin())) ||
1419 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1420 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1421 Type *DstTy =
1422 Opcode == Instruction::Load
1423 ? (*I->user_begin())->getType()
1424 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1425 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1426 DstTy->getScalarType()->isFloatTy())
1427 return ST->getMVEVectorCostFactor(CostKind);
1428 }
1429
1430 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1431 ? ST->getMVEVectorCostFactor(CostKind)
1432 : 1;
1433 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1434 CostKind, I);
1435}
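
A minimal sketch of the branching above, with all flags and factors as assumed inputs rather than the real TTI interface: unaligned NEON vectors of double get a flat charge of 4 per legalized vector, everything else is the base cost scaled by the MVE factor for vector types:

    #include <cstdio>

    // Sketch of the decision above; all flags and factors are assumed values.
    unsigned modelMemoryOpCost(bool HasNEON, bool IsVector, bool VecOfDouble,
                               bool Aligned16, unsigned LTFirst, bool HasMVE,
                               unsigned MVEFactor, unsigned BaseCost) {
      // vld1/vst1 on unaligned vectors of f64 take ~4 uops vs 1 for vldr/vstr.
      if (HasNEON && IsVector && VecOfDouble && !Aligned16)
        return LTFirst * 4;
      // Otherwise scale the generic cost by the MVE cost factor for vectors.
      return (HasMVE && IsVector ? MVEFactor : 1) * BaseCost;
    }

    int main() {
      std::printf("%u %u\n",
                  modelMemoryOpCost(true, true, true, false, 1, false, 1, 1),  // 4
                  modelMemoryOpCost(false, true, false, true, 1, true, 2, 1)); // 2
      return 0;
    }
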
1436
1437InstructionCost
1438ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1439 unsigned AddressSpace,
1440 TTI::TargetCostKind CostKind) {
1441 if (ST->hasMVEIntegerOps()) {
1442 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1443 return ST->getMVEVectorCostFactor(CostKind);
1444 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1445 return ST->getMVEVectorCostFactor(CostKind);
1446 }
1447 if (!isa<FixedVectorType>(Src))
1448 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1449 CostKind);
1450 // Scalar cost, which is currently very high due to the inefficiency of the
1451 // generated code.
1452 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1453}
1454
1455InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1456 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1457 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1458 bool UseMaskForCond, bool UseMaskForGaps) {
1459 assert(Factor >= 2 && "Invalid interleave factor")(static_cast<void> (0));
1460 assert(isa<VectorType>(VecTy) && "Expect a vector type")(static_cast<void> (0));
1461
1462 // vldN/vstN don't support vector types with i64/f64 elements.
1463 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1464
1465 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1466 !UseMaskForCond && !UseMaskForGaps) {
1467 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1468 auto *SubVecTy =
1469 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1470
1471 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1472 // Accesses having vector types that are a multiple of 128 bits can be
1473 // matched to more than one vldN/vstN instruction.
1474 int BaseCost =
1475 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1476 if (NumElts % Factor == 0 &&
1477 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1478 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1479
1480 // Some smaller than legal interleaved patterns are cheap as we can make
1481 // use of the vmovn or vrev patterns to interleave a standard load. This is
1482 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1483 // promoted differently). The cost of 2 here is then a load and vrev or
1484 // vmovn.
1485 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1486 VecTy->isIntOrIntVectorTy() &&
1487 DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1488 return 2 * BaseCost;
1489 }
1490
1491 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1492 Alignment, AddressSpace, CostKind,
1493 UseMaskForCond, UseMaskForGaps);
1494}
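
A worked instance of the vldN/vstN formula above, under assumed values: a Factor-2 group over a 128-bit vector splits into a legal 64-bit sub-vector, so the returned cost is Factor * BaseCost * NumInterleavedAccesses:

    #include <cstdio>

    int main() {
      unsigned Factor = 2;                 // de-interleaving load lowered to vld2
      unsigned BaseCost = 1;               // or getMVEVectorCostFactor(...) on MVE
      unsigned NumInterleavedAccesses = 1; // one legal 64-bit v4i16 access per lane set
      std::printf("interleaved cost: %u\n",
                  Factor * BaseCost * NumInterleavedAccesses); // 2
      return 0;
    }
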
1495
1496InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1497 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1498 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1499 using namespace PatternMatch;
1500 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1501 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1502 Alignment, CostKind, I);
1503
1504 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!")(static_cast<void> (0));
1505 auto *VTy = cast<FixedVectorType>(DataTy);
1506
1507 // TODO: Splitting, once we do that.
1508
1509 unsigned NumElems = VTy->getNumElements();
1510 unsigned EltSize = VTy->getScalarSizeInBits();
1511 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1512
1513 // For now, it is assumed that for the MVE gather instructions the loads are
1514 // all effectively serialised. This means the cost is the scalar cost
1515 // multiplied by the number of elements being loaded. This is possibly very
1516 // conservative, but even so we still end up vectorising loops because the
1517 // cost per iteration for many loops is lower than for scalar loops.
1518 InstructionCost VectorCost =
1519 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1520 // The scalarization cost should be a lot higher. We use the number of vector
1521 // elements plus the scalarization overhead.
1522 InstructionCost ScalarCost =
1523 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1524 BaseT::getScalarizationOverhead(VTy, false, true);
1525
1526 if (EltSize < 8 || Alignment < EltSize / 8)
1527 return ScalarCost;
1528
1529 unsigned ExtSize = EltSize;
1530 // Check whether there's a single user that asks for an extended type
1531 if (I != nullptr) {
1532 // Depending on the caller of this function, a gather instruction will
1533 // either have opcode Instruction::Load or be a call to the masked_gather
1534 // intrinsic.
1535 if ((I->getOpcode() == Instruction::Load ||
1536 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1537 I->hasOneUse()) {
1538 const User *Us = *I->users().begin();
1539 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1540 // only allow valid type combinations
1541 unsigned TypeSize =
1542 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1543 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1544 (TypeSize == 16 && EltSize == 8)) &&
1545 TypeSize * NumElems == 128) {
1546 ExtSize = TypeSize;
1547 }
1548 }
1549 }
1550 // Check whether the input data needs to be truncated
1551 TruncInst *T;
1552 if ((I->getOpcode() == Instruction::Store ||
1553 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1554 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1555 // Only allow valid type combinations
1556 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1557 if (((EltSize == 16 && TypeSize == 32) ||
1558 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1559 TypeSize * NumElems == 128)
1560 ExtSize = TypeSize;
1561 }
1562 }
1563
1564 if (ExtSize * NumElems != 128 || NumElems < 4)
1565 return ScalarCost;
1566
1567 // Any (aligned) i32 gather will not need to be scalarised.
1568 if (ExtSize == 32)
1569 return VectorCost;
1570 // For smaller types, we need to ensure that the gep's inputs are correctly
1571 // extended from a small enough value. Other sizes (including i64) are
1572 // scalarized for now.
1573 if (ExtSize != 8 && ExtSize != 16)
1574 return ScalarCost;
1575
1576 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1577 Ptr = BC->getOperand(0);
1578 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1579 if (GEP->getNumOperands() != 2)
1580 return ScalarCost;
1581 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1582 // Scale needs to be correct (which is only relevant for i16s).
1583 if (Scale != 1 && Scale * 8 != ExtSize)
1584 return ScalarCost;
1585 // And we need to zext (not sext) the indexes from a small enough type.
1586 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1587 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1588 return VectorCost;
1589 }
1590 return ScalarCost;
1591 }
1592 return ScalarCost;
1593}
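
The two candidate costs above reduce to simple products; a standalone sketch with assumed values for the legalization factor, MVE cost factor and scalarization overhead:

    #include <cstdio>

    int main() {
      unsigned NumElems = 4, LTFirst = 1;
      unsigned MVEFactor = 2;        // assumed getMVEVectorCostFactor(...) result
      unsigned ScalarizationOvh = 8; // assumed insert/extract overhead
      unsigned VectorCost = NumElems * LTFirst * MVEFactor;        // serialized lanes: 8
      unsigned ScalarCost = NumElems * LTFirst + ScalarizationOvh; // 12
      std::printf("vector=%u scalar=%u\n", VectorCost, ScalarCost);
      return 0;
    }
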
1594
1595InstructionCost
1596ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1597 Optional<FastMathFlags> FMF,
1598 TTI::TargetCostKind CostKind) {
1599 if (TTI::requiresOrderedReduction(FMF))
1600 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1601
1602 EVT ValVT = TLI->getValueType(DL, ValTy);
1603 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1604 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1605 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1606
1607 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1608
1609 static const CostTblEntry CostTblAdd[]{
1610 {ISD::ADD, MVT::v16i8, 1},
1611 {ISD::ADD, MVT::v8i16, 1},
1612 {ISD::ADD, MVT::v4i32, 1},
1613 };
1614 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1615 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1616
1617 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1618}
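
A sketch of the condition under which the VADDV table above applies (128-bit integer ADD reductions only); the lane/element encoding here is an illustrative stand-in, not the MVT types used by the real code:

    #include <cstdio>

    // Sketch of the "hits the MVE ADD-reduction table" condition above.
    bool hitsMVEAddReductionTable(bool IsAdd, unsigned Lanes, unsigned EltBits) {
      // The table only lists the 128-bit v16i8, v8i16 and v4i32 ADD reductions.
      return IsAdd && Lanes * EltBits == 128 &&
             (EltBits == 8 || EltBits == 16 || EltBits == 32);
    }

    int main() {
      std::printf("%d %d\n", hitsMVEAddReductionTable(true, 4, 32),   // 1
                             hitsMVEAddReductionTable(true, 2, 64));  // 0
      return 0;
    }
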
1619
1620InstructionCost
1621ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1622 Type *ResTy, VectorType *ValTy,
1623 TTI::TargetCostKind CostKind) {
1624 EVT ValVT = TLI->getValueType(DL, ValTy);
1625 EVT ResVT = TLI->getValueType(DL, ResTy);
1626
1627 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1628 std::pair<InstructionCost, MVT> LT =
1629 TLI->getTypeLegalizationCost(DL, ValTy);
1630
1631 // The legal cases are:
1632 // VADDV u/s 8/16/32
1633 // VMLAV u/s 8/16/32
1634 // VADDLV u/s 32
1635 // VMLALV u/s 16/32
1636 // Codegen currently cannot always handle larger-than-legal vectors very
1637 // well, especially for predicated reductions where the mask needs to be
1638 // split, so restrict to 128-bit or smaller input types.
1639 unsigned RevVTSize = ResVT.getSizeInBits();
1640 if (ValVT.getSizeInBits() <= 128 &&
1641 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1642 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1643 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1644 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1645 }
1646
1647 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1648 CostKind);
1649}
1650
1651InstructionCost
1652ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1653 TTI::TargetCostKind CostKind) {
1654 switch (ICA.getID()) {
1655 case Intrinsic::get_active_lane_mask:
1656 // Currently we make a somewhat optimistic assumption that
1657 // active_lane_masks are always free. In reality one may be freely folded
1658 // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1659 // of add/icmp code. We may need to improve this in the future, but being
1660 // able to detect if it is free or not involves looking at a lot of other
1661 // code. We currently assume that the vectorizer inserted these, and knew
1662 // what it was doing in adding one.
1663 if (ST->hasMVEIntegerOps())
1664 return 0;
1665 break;
1666 case Intrinsic::sadd_sat:
1667 case Intrinsic::ssub_sat:
1668 case Intrinsic::uadd_sat:
1669 case Intrinsic::usub_sat: {
1670 if (!ST->hasMVEIntegerOps())
1671 break;
1672 Type *VT = ICA.getReturnType();
1673
1674 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1675 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1676 LT.second == MVT::v16i8) {
1677 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1678 // need to extend the type, as it uses shr(qadd(shl, shl)).
1679 unsigned Instrs =
1680 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1681 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1682 }
1683 break;
1684 }
1685 case Intrinsic::abs:
1686 case Intrinsic::smin:
1687 case Intrinsic::smax:
1688 case Intrinsic::umin:
1689 case Intrinsic::umax: {
1690 if (!ST->hasMVEIntegerOps())
1691 break;
1692 Type *VT = ICA.getReturnType();
1693
1694 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1695 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1696 LT.second == MVT::v16i8)
1697 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1698 break;
1699 }
1700 case Intrinsic::minnum:
1701 case Intrinsic::maxnum: {
1702 if (!ST->hasMVEFloatOps())
1703 break;
1704 Type *VT = ICA.getReturnType();
1705 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1706 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1707 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1708 break;
1709 }
1710 }
1711
1712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1. 'Default' branch taken. Execution continues on line 1712
2. Calling 'BasicTTIImplBase::getIntrinsicInstrCost'
1713}
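
For the saturating add/sub case above, the cost is one vqadd when the legalized element size matches the original, and four instructions (shl, shl, qadd, shr) otherwise; a small sketch with assumed factors:

    #include <cstdio>

    int main() {
      unsigned LTFirst = 1, MVEFactor = 2;   // assumed values
      bool NeedsExtend = true;               // e.g. a type promoted during legalization
      unsigned Instrs = NeedsExtend ? 4 : 1; // shl, shl, qadd, shr vs a single vqadd
      std::printf("sadd.sat cost: %u\n", LTFirst * MVEFactor * Instrs); // 8
      return 0;
    }
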
1714
1715bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1716 if (!F->isIntrinsic())
1717 return BaseT::isLoweredToCall(F);
1718
1719 // Assume all Arm-specific intrinsics map to an instruction.
1720 if (F->getName().startswith("llvm.arm"))
1721 return false;
1722
1723 switch (F->getIntrinsicID()) {
1724 default: break;
1725 case Intrinsic::powi:
1726 case Intrinsic::sin:
1727 case Intrinsic::cos:
1728 case Intrinsic::pow:
1729 case Intrinsic::log:
1730 case Intrinsic::log10:
1731 case Intrinsic::log2:
1732 case Intrinsic::exp:
1733 case Intrinsic::exp2:
1734 return true;
1735 case Intrinsic::sqrt:
1736 case Intrinsic::fabs:
1737 case Intrinsic::copysign:
1738 case Intrinsic::floor:
1739 case Intrinsic::ceil:
1740 case Intrinsic::trunc:
1741 case Intrinsic::rint:
1742 case Intrinsic::nearbyint:
1743 case Intrinsic::round:
1744 case Intrinsic::canonicalize:
1745 case Intrinsic::lround:
1746 case Intrinsic::llround:
1747 case Intrinsic::lrint:
1748 case Intrinsic::llrint:
1749 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1750 return true;
1751 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1752 return true;
1753 // Some operations can be handled by vector instructions, and we assume
1754 // unsupported vectors will be expanded into supported scalar ones.
1755 // TODO Handle scalar operations properly.
1756 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1757 case Intrinsic::masked_store:
1758 case Intrinsic::masked_load:
1759 case Intrinsic::masked_gather:
1760 case Intrinsic::masked_scatter:
1761 return !ST->hasMVEIntegerOps();
1762 case Intrinsic::sadd_with_overflow:
1763 case Intrinsic::uadd_with_overflow:
1764 case Intrinsic::ssub_with_overflow:
1765 case Intrinsic::usub_with_overflow:
1766 case Intrinsic::sadd_sat:
1767 case Intrinsic::uadd_sat:
1768 case Intrinsic::ssub_sat:
1769 case Intrinsic::usub_sat:
1770 return false;
1771 }
1772
1773 return BaseT::isLoweredToCall(F);
1774}
1775
1776bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1777 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1778 EVT VT = TLI->getValueType(DL, I.getType(), true);
1779 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1780 return true;
1781
1782 // Check if an intrinsic will be lowered to a call and assume that any
1783 // other CallInst will generate a bl.
1784 if (auto *Call = dyn_cast<CallInst>(&I)) {
1785 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1786 switch(II->getIntrinsicID()) {
1787 case Intrinsic::memcpy:
1788 case Intrinsic::memset:
1789 case Intrinsic::memmove:
1790 return getNumMemOps(II) == -1;
1791 default:
1792 if (const Function *F = Call->getCalledFunction())
1793 return isLoweredToCall(F);
1794 }
1795 }
1796 return true;
1797 }
1798
1799 // FPv5 provides conversions between integer, double-precision,
1800 // single-precision, and half-precision formats.
1801 switch (I.getOpcode()) {
1802 default:
1803 break;
1804 case Instruction::FPToSI:
1805 case Instruction::FPToUI:
1806 case Instruction::SIToFP:
1807 case Instruction::UIToFP:
1808 case Instruction::FPTrunc:
1809 case Instruction::FPExt:
1810 return !ST->hasFPARMv8Base();
1811 }
1812
1813 // FIXME: Unfortunately the approach of checking the Operation Action does
1814 // not catch all cases of Legalization that use library calls. Our
1815 // Legalization step categorizes some transformations into library calls as
1816 // Custom, Expand or even Legal when doing type legalization. So for now
1817 // we have to special-case, for instance, the SDIV of 64-bit integers and the
1818 // use of floating-point emulation.
1819 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1820 switch (ISD) {
1821 default:
1822 break;
1823 case ISD::SDIV:
1824 case ISD::UDIV:
1825 case ISD::SREM:
1826 case ISD::UREM:
1827 case ISD::SDIVREM:
1828 case ISD::UDIVREM:
1829 return true;
1830 }
1831 }
1832
1833 // Assume all other non-float operations are supported.
1834 if (!VT.isFloatingPoint())
1835 return false;
1836
1837 // We'll need a library call to handle most floats when using soft-float.
1838 if (TLI->useSoftFloat()) {
1839 switch (I.getOpcode()) {
1840 default:
1841 return true;
1842 case Instruction::Alloca:
1843 case Instruction::Load:
1844 case Instruction::Store:
1845 case Instruction::Select:
1846 case Instruction::PHI:
1847 return false;
1848 }
1849 }
1850
1851 // We'll need a libcall to perform double precision operations on a single
1852 // precision only FPU.
1853 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1854 return true;
1855
1856 // Likewise for half precision arithmetic.
1857 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1858 return true;
1859
1860 return false;
1861}
1862
1863bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1864 AssumptionCache &AC,
1865 TargetLibraryInfo *LibInfo,
1866 HardwareLoopInfo &HWLoopInfo) {
1867 // Low-overhead branches are only supported in the 'low-overhead branch'
1868 // extension of v8.1-m.
1869 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1870 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n")do { } while (false);
1871 return false;
1872 }
1873
1874 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1875 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n")do { } while (false);
1876 return false;
1877 }
1878
1879 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1880 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1881 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n")do { } while (false);
1882 return false;
1883 }
1884
1885 const SCEV *TripCountSCEV =
1886 SE.getAddExpr(BackedgeTakenCount,
1887 SE.getOne(BackedgeTakenCount->getType()));
1888
1889 // We need to store the trip count in LR, a 32-bit register.
1890 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1891 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n")do { } while (false);
1892 return false;
1893 }
1894
1895 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1896 // point in generating a hardware loop if that's going to happen.
1897
1898 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1899 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1900 switch (Call->getIntrinsicID()) {
1901 default:
1902 break;
1903 case Intrinsic::start_loop_iterations:
1904 case Intrinsic::test_start_loop_iterations:
1905 case Intrinsic::loop_decrement:
1906 case Intrinsic::loop_decrement_reg:
1907 return true;
1908 }
1909 }
1910 return false;
1911 };
1912
1913 // Scan the instructions to see if there are any that we know will turn into
1914 // a call, or if this loop is already a low-overhead loop or will become a
1915 // tail-predicated loop.
1916 bool IsTailPredLoop = false;
1917 auto ScanLoop = [&](Loop *L) {
1918 for (auto *BB : L->getBlocks()) {
1919 for (auto &I : *BB) {
1920 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1921 isa<InlineAsm>(I)) {
1922 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n")do { } while (false);
1923 return false;
1924 }
1925 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1926 IsTailPredLoop |=
1927 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1928 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1929 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1930 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1931 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1932 }
1933 }
1934 return true;
1935 };
1936
1937 // Visit inner loops.
1938 for (auto Inner : *L)
1939 if (!ScanLoop(Inner))
1940 return false;
1941
1942 if (!ScanLoop(L))
1943 return false;
1944
1945 // TODO: Check whether the trip count calculation is expensive. If L is the
1946 // inner loop but we know it has a low trip count, calculating that trip
1947 // count (in the parent loop) may be detrimental.
1948
1949 LLVMContext &C = L->getHeader()->getContext();
1950 HWLoopInfo.CounterInReg = true;
1951 HWLoopInfo.IsNestingLegal = false;
1952 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1953 HWLoopInfo.CountType = Type::getInt32Ty(C);
1954 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1955 return true;
1956}
1957
1958static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1959 // We don't allow icmps, and because we only look at single-block loops,
1960 // we simply count the icmps, i.e. there should only be 1 for the backedge.
1961 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1962 return false;
1963 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
1964 // not currently canonical, but soon will be. Code without them uses icmp, and
1965 // so is not tail-predicated as per the condition above. In order to get the
1966 // same performance we treat min and max the same as an icmp for tailpred
1967 // purposes for the moment (we often rely on non-tailpred and higher VFs to
1968 // pick more optimal instructions like VQDMULH. They need to be recognized
1969 // directly by the vectorizer).
1970 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1971 if ((II->getIntrinsicID() == Intrinsic::smin ||
1972 II->getIntrinsicID() == Intrinsic::smax ||
1973 II->getIntrinsicID() == Intrinsic::umin ||
1974 II->getIntrinsicID() == Intrinsic::umax) &&
1975 ++ICmpCount > 1)
1976 return false;
1977
1978 if (isa<FCmpInst>(&I))
1979 return false;
1980
1981 // We could allow extending/narrowing FP loads/stores, but codegen is
1982 // too inefficient so reject this for now.
1983 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1984 return false;
1985
1986 // Extends have to be extending-loads
1987 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1988 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1989 return false;
1990
1991 // Truncs have to be narrowing-stores
1992 if (isa<TruncInst>(&I) )
1993 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1994 return false;
1995
1996 return true;
1997}
1998
1999// To set up a tail-predicated loop, we need to know the total number of
2000// elements processed by that loop. Thus, we need to determine the element
2001// size and:
2002// 1) it should be uniform for all operations in the vector loop, so we
2003// e.g. don't want any widening/narrowing operations.
2004// 2) it should be smaller than i64s because we don't have vector operations
2005// that work on i64s.
2006// 3) we don't want elements to be reversed or shuffled, to make sure the
2007// tail-predication masks/predicates the right lanes.
2008//
2009static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2010 const DataLayout &DL,
2011 const LoopAccessInfo *LAI) {
2012 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n")do { } while (false);
2013
2014 // If there are live-out values, it is probably a reduction. We can predicate
2015 // most reduction operations freely under MVE using a combination of
2016 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2017 // floating point and integer reductions, but don't check for operators
2018 // specifically here. If the value ends up not being a reduction (and so the
2019 // vectorizer cannot tailfold the loop), we should fall back to standard
2020 // vectorization automatically.
2021 SmallVector< Instruction *, 8 > LiveOuts;
2022 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2023 bool ReductionsDisabled =
2024 EnableTailPredication == TailPredication::EnabledNoReductions ||
2025 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2026
2027 for (auto *I : LiveOuts) {
2028 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2029 !I->getType()->isHalfTy()) {
2030 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "do { } while (false)
2031 "live-out value\n")do { } while (false);
2032 return false;
2033 }
2034 if (ReductionsDisabled) {
2035 LLVM_DEBUG(dbgs() << "Reductions not enabled\n")do { } while (false);
2036 return false;
2037 }
2038 }
2039
2040 // Next, check that all instructions can be tail-predicated.
2041 PredicatedScalarEvolution PSE = LAI->getPSE();
2042 SmallVector<Instruction *, 16> LoadStores;
2043 int ICmpCount = 0;
2044
2045 for (BasicBlock *BB : L->blocks()) {
2046 for (Instruction &I : BB->instructionsWithoutDebug()) {
2047 if (isa<PHINode>(&I))
2048 continue;
2049 if (!canTailPredicateInstruction(I, ICmpCount)) {
2050 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump())do { } while (false);
2051 return false;
2052 }
2053
2054 Type *T = I.getType();
2055 if (T->isPointerTy())
2056 T = T->getPointerElementType();
2057
2058 if (T->getScalarSizeInBits() > 32) {
2059 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump())do { } while (false);
2060 return false;
2061 }
2062 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2063 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2064 int64_t NextStride = getPtrStride(PSE, Ptr, L);
2065 if (NextStride == 1) {
2066 // TODO: for now only allow consecutive strides of 1. We could support
2067 // other strides as long as they are uniform, but let's keep it simple
2068 // for now.
2069 continue;
2070 } else if (NextStride == -1 ||
2071 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2072 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2073 LLVM_DEBUG(dbgs()do { } while (false)
2074 << "Consecutive strides of 2 found, vld2/vstr2 can't "do { } while (false)
2075 "be tail-predicated\n.")do { } while (false);
2076 return false;
2077 // TODO: don't tail predicate if there is a reversed load?
2078 } else if (EnableMaskedGatherScatters) {
2079 // Gather/scatters do allow loading from arbitrary strides, at
2080 // least if they are loop invariant.
2081 // TODO: Loop variant strides should in theory work, too, but
2082 // this requires further testing.
2083 const SCEV *PtrScev =
2084 replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2085 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2086 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2087 if (PSE.getSE()->isLoopInvariant(Step, L))
2088 continue;
2089 }
2090 }
2091 LLVM_DEBUG(dbgs() << "Bad stride found, can't "do { } while (false)
2092 "tail-predicate\n.")do { } while (false);
2093 return false;
2094 }
2095 }
2096 }
2097
2098 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n")do { } while (false);
2099 return true;
2100}
2101
2102bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2103 ScalarEvolution &SE,
2104 AssumptionCache &AC,
2105 TargetLibraryInfo *TLI,
2106 DominatorTree *DT,
2107 const LoopAccessInfo *LAI) {
2108 if (!EnableTailPredication) {
2109 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n")do { } while (false);
2110 return false;
2111 }
2112
2113 // Creating a predicated vector loop is the first step for generating a
2114 // tail-predicated hardware loop, for which we need the MVE masked
2115 // load/stores instructions:
2116 if (!ST->hasMVEIntegerOps())
2117 return false;
2118
2119 // For now, restrict this to single block loops.
2120 if (L->getNumBlocks() > 1) {
2121 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "do { } while (false)
2122 "loop.\n")do { } while (false);
2123 return false;
2124 }
2125
2126 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected")(static_cast<void> (0));
2127
2128 HardwareLoopInfo HWLoopInfo(L);
2129 if (!HWLoopInfo.canAnalyze(*LI)) {
2130 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2131 "analyzable.\n")do { } while (false);
2132 return false;
2133 }
2134
2135 // This checks if we have the low-overhead branch architecture
2136 // extension, and if we will create a hardware-loop:
2137 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2138 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2139 "profitable.\n")do { } while (false);
2140 return false;
2141 }
2142
2143 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2144 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2145 "a candidate.\n")do { } while (false);
2146 return false;
2147 }
2148
2149 return canTailPredicateLoop(L, LI, SE, DL, LAI);
2150}
2151
2152bool ARMTTIImpl::emitGetActiveLaneMask() const {
2153 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2154 return false;
2155
2156 // Intrinsic @llvm.get.active.lane.mask is supported.
2157 // It is used in the MVETailPredication pass, which requires the number of
2158 // elements processed by this vector loop to set up the tail-predicated
2159 // loop.
2160 return true;
2161}
2162void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2163 TTI::UnrollingPreferences &UP,
2164 OptimizationRemarkEmitter *ORE) {
2165 // Enable upper-bound unrolling universally, not dependent upon the conditions
2166 // below.
2167 UP.UpperBound = true;
2168
2169 // Only currently enable these preferences for M-Class cores.
2170 if (!ST->isMClass())
2171 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2172
2173 // Disable loop unrolling for Oz and Os.
2174 UP.OptSizeThreshold = 0;
2175 UP.PartialOptSizeThreshold = 0;
2176 if (L->getHeader()->getParent()->hasOptSize())
2177 return;
2178
2179 SmallVector<BasicBlock*, 4> ExitingBlocks;
2180 L->getExitingBlocks(ExitingBlocks);
2181 LLVM_DEBUG(dbgs() << "Loop has:\n"do { } while (false)
2182 << "Blocks: " << L->getNumBlocks() << "\n"do { } while (false)
2183 << "Exit blocks: " << ExitingBlocks.size() << "\n")do { } while (false);
2184
2185 // Only allow one exit other than the latch. This acts as an early exit
2186 // as it mirrors the profitability calculation of the runtime unroller.
2187 if (ExitingBlocks.size() > 2)
2188 return;
2189
2190 // Limit the CFG of the loop body for targets with a branch predictor.
2191 // Allowing 4 blocks permits if-then-else diamonds in the body.
2192 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2193 return;
2194
2195 // Don't unroll vectorized loops, including the remainder loop
2196 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2197 return;
2198
2199 // Scan the loop: don't unroll loops with calls as this could prevent
2200 // inlining.
2201 InstructionCost Cost = 0;
2202 for (auto *BB : L->getBlocks()) {
2203 for (auto &I : *BB) {
2204 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2205 // scalar code.
2206 if (I.getType()->isVectorTy())
2207 return;
2208
2209 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2210 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2211 if (!isLoweredToCall(F))
2212 continue;
2213 }
2214 return;
2215 }
2216
2217 SmallVector<const Value*, 4> Operands(I.operand_values());
2218 Cost +=
2219 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2220 }
2221 }
2222
2223 // On v6m cores, there are very few registers available. We can easily end up
2224 // spilling and reloading more registers in an unrolled loop. Look at the
2225 // number of LCSSA phis as a rough measure of how many registers will need to
2226 // be live out of the loop, reducing the default unroll count if more than 1
2227 // value is needed. In the long run, all of this should be learnt by a
2228 // machine.
2229 unsigned UnrollCount = 4;
2230 if (ST->isThumb1Only()) {
2231 unsigned ExitingValues = 0;
2232 SmallVector<BasicBlock *, 4> ExitBlocks;
2233 L->getExitBlocks(ExitBlocks);
2234 for (auto *Exit : ExitBlocks) {
2235 // Count the number of LCSSA phis. Exclude values coming from GEPs as
2236 // only the last is expected to be needed for address operands.
2237 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2238 return PH.getNumOperands() != 1 ||
2239 !isa<GetElementPtrInst>(PH.getOperand(0));
2240 });
2241 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2242 }
2243 if (ExitingValues)
2244 UnrollCount /= ExitingValues;
2245 if (UnrollCount <= 1)
2246 return;
2247 }
2248
2249 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n")do { } while (false);
2250 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n")do { } while (false);
2251
2252 UP.Partial = true;
2253 UP.Runtime = true;
2254 UP.UnrollRemainder = true;
2255 UP.DefaultUnrollRuntimeCount = UnrollCount;
2256 UP.UnrollAndJam = true;
2257 UP.UnrollAndJamInnerLoopThreshold = 60;
2258
2259 // Force-unrolling small loops can be very useful because of the
2260 // branch-taken cost of the backedge.
2261 if (Cost < 12)
2262 UP.Force = true;
2263}
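
A standalone sketch of the Thumb1 live-out adjustment above, with an assumed number of non-GEP LCSSA phis: the default runtime unroll count of 4 is divided by that number, and unrolling is abandoned once the count reaches 1:

    #include <cstdio>

    int main() {
      unsigned UnrollCount = 4;   // default runtime unroll count
      unsigned ExitingValues = 2; // assumed: two non-GEP LCSSA phis live out
      if (ExitingValues)
        UnrollCount /= ExitingValues;
      if (UnrollCount <= 1)
        std::puts("too many live-outs, don't unroll");
      else
        std::printf("runtime unroll count: %u\n", UnrollCount); // 2
      return 0;
    }
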
2264
2265void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2266 TTI::PeelingPreferences &PP) {
2267 BaseT::getPeelingPreferences(L, SE, PP);
2268}
2269
2270bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2271 TTI::ReductionFlags Flags) const {
2272 if (!ST->hasMVEIntegerOps())
2273 return false;
2274
2275 unsigned ScalarBits = Ty->getScalarSizeInBits();
2276 switch (Opcode) {
2277 case Instruction::Add:
2278 return ScalarBits <= 64;
2279 default:
2280 return false;
2281 }
2282}
2283
2284bool ARMTTIImpl::preferPredicatedReductionSelect(
2285 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2286 if (!ST->hasMVEIntegerOps())
2287 return false;
2288 return true;
2289}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/TargetTransformInfo.h"
27#include "llvm/Analysis/TargetTransformInfoImpl.h"
28#include "llvm/CodeGen/ISDOpcodes.h"
29#include "llvm/CodeGen/TargetLowering.h"
30#include "llvm/CodeGen/TargetSubtargetInfo.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DataLayout.h"
36#include "llvm/IR/DerivedTypes.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/IR/Type.h"
43#include "llvm/IR/Value.h"
44#include "llvm/Support/Casting.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/ErrorHandling.h"
47#include "llvm/Support/MachineValueType.h"
48#include "llvm/Support/MathExtras.h"
49#include "llvm/Target/TargetMachine.h"
50#include <algorithm>
51#include <cassert>
52#include <cstdint>
53#include <limits>
54#include <utility>
55
56namespace llvm {
57
58class Function;
59class GlobalValue;
60class LLVMContext;
61class ScalarEvolution;
62class SCEV;
63class TargetMachine;
64
65extern cl::opt<unsigned> PartialUnrollingThreshold;
66
67/// Base class which can be used to help build a TTI implementation.
68///
69/// This class provides as much implementation of the TTI interface as is
70/// possible using the target independent parts of the code generator.
71///
72/// In order to subclass it, your class must implement a getST() method to
73/// return the subtarget, and a getTLI() method to return the target lowering.
74/// We need these methods implemented in the derived class so that this class
75/// doesn't have to duplicate storage for them.
76template <typename T>
77class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78private:
79 using BaseT = TargetTransformInfoImplCRTPBase<T>;
80 using TTI = TargetTransformInfo;
81
82 /// Helper function to access this as a T.
83 T *thisT() { return static_cast<T *>(this); }
84
85 /// Estimate a cost of Broadcast as an extract and sequence of insert
86 /// operations.
87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88 InstructionCost Cost = 0;
89 // Broadcast cost is equal to the cost of extracting the zero'th element
90 // plus the cost of inserting it into every element of the result vector.
91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
92
93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
95 }
96 return Cost;
97 }
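
With assumed per-element insert/extract costs of 1, the broadcast estimate above is simply one extract plus one insert per result lane:

    #include <cstdio>

    int main() {
      unsigned N = 8;                           // e.g. an 8-lane fixed vector
      unsigned ExtractCost = 1, InsertCost = 1; // assumed per-element costs
      std::printf("broadcast overhead: %u\n", ExtractCost + N * InsertCost); // 9
      return 0;
    }
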
98
99 /// Estimate a cost of shuffle as a sequence of extract and insert
100 /// operations.
101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102 InstructionCost Cost = 0;
103 // Shuffle cost is equal to the cost of extracting elements from its arguments
104 // plus the cost of inserting them into the result vector.
105
106 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
107 // index 0 of the first vector, index 1 of the second vector, index 2 of the
108 // first vector and finally index 3 of the second vector, and insert them at
109 // indices <0,1,2,3> of the result vector.
110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
113 }
114 return Cost;
115 }
116
117 /// Estimate a cost of subvector extraction as a sequence of extract and
118 /// insert operations.
119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120 FixedVectorType *SubVTy) {
121 assert(VTy && SubVTy &&(static_cast<void> (0))
122 "Can only extract subvectors from vectors")(static_cast<void> (0));
123 int NumSubElts = SubVTy->getNumElements();
124 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
125 (Index + NumSubElts) <=(static_cast<void> (0))
126 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
127 "SK_ExtractSubvector index out of range")(static_cast<void> (0));
128
129 InstructionCost Cost = 0;
130 // Subvector extraction cost is equal to the cost of extracting elements from
131 // the source type plus the cost of inserting them into the result vector
132 // type.
133 for (int i = 0; i != NumSubElts; ++i) {
134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
135 i + Index);
136 Cost +=
137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
138 }
139 return Cost;
140 }
141
142 /// Estimate a cost of subvector insertion as a sequence of extract and
143 /// insert operations.
144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145 FixedVectorType *SubVTy) {
146 assert(VTy && SubVTy &&(static_cast<void> (0))
147 "Can only insert subvectors into vectors")(static_cast<void> (0));
148 int NumSubElts = SubVTy->getNumElements();
149 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
150 (Index + NumSubElts) <=(static_cast<void> (0))
151 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
152 "SK_InsertSubvector index out of range")(static_cast<void> (0));
153
154 InstructionCost Cost = 0;
155 // Subvector insertion cost is equal to the cost of extracting elements from
156 // the source type plus the cost of inserting them into the result vector
157 // type.
158 for (int i = 0; i != NumSubElts; ++i) {
159 Cost +=
160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
162 i + Index);
163 }
164 return Cost;
165 }
166
167 /// Local query method delegates up to T which *must* implement this!
168 const TargetSubtargetInfo *getST() const {
169 return static_cast<const T *>(this)->getST();
170 }
171
172 /// Local query method delegates up to T which *must* implement this!
173 const TargetLoweringBase *getTLI() const {
174 return static_cast<const T *>(this)->getTLI();
175 }
176
177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
178 switch (M) {
179 case TTI::MIM_Unindexed:
180 return ISD::UNINDEXED;
181 case TTI::MIM_PreInc:
182 return ISD::PRE_INC;
183 case TTI::MIM_PreDec:
184 return ISD::PRE_DEC;
185 case TTI::MIM_PostInc:
186 return ISD::POST_INC;
187 case TTI::MIM_PostDec:
188 return ISD::POST_DEC;
189 }
190 llvm_unreachable("Unexpected MemIndexedMode")__builtin_unreachable();
191 }
192
193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
194 Align Alignment,
195 bool VariableMask,
196 bool IsGatherScatter,
197 TTI::TargetCostKind CostKind) {
198 auto *VT = cast<FixedVectorType>(DataTy);
199 // Assume the target does not have support for gather/scatter operations
200 // and provide a rough estimate.
201 //
202 // First, compute the cost of the individual memory operations.
203 InstructionCost AddrExtractCost =
204 IsGatherScatter
205 ? getVectorInstrCost(Instruction::ExtractElement,
206 FixedVectorType::get(
207 PointerType::get(VT->getElementType(), 0),
208 VT->getNumElements()),
209 -1)
210 : 0;
211 InstructionCost LoadCost =
212 VT->getNumElements() *
213 (AddrExtractCost +
214 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
215
216 // Next, compute the cost of packing the result in a vector.
217 InstructionCost PackingCost = getScalarizationOverhead(
218 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
219
220 InstructionCost ConditionalCost = 0;
221 if (VariableMask) {
222 // Compute the cost of conditionally executing the memory operations with
223 // variable masks. This includes extracting the individual conditions, the
224 // branches, and the PHIs to combine the results.
225 // NOTE: Estimating the cost of conditionally executing the memory
226 // operations accurately is quite difficult and the current solution
227 // provides a very rough estimate only.
228 ConditionalCost =
229 VT->getNumElements() *
230 (getVectorInstrCost(
231 Instruction::ExtractElement,
232 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
233 VT->getNumElements()),
234 -1) +
235 getCFInstrCost(Instruction::Br, CostKind) +
236 getCFInstrCost(Instruction::PHI, CostKind));
237 }
238
239 return LoadCost + PackingCost + ConditionalCost;
240 }
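
The estimate above is the sum of per-lane memory operations, result packing, and (for variable masks) per-lane control flow; a sketch with assumed per-component costs:

    #include <cstdio>

    int main() {
      unsigned Lanes = 4;
      unsigned AddrExtract = 1, ScalarMemOp = 1; // assumed per-lane costs
      unsigned Packing = 4;                      // assumed insert/extract overhead
      unsigned CondExtract = 1, Br = 1, Phi = 1; // assumed per-lane mask handling
      unsigned LoadCost = Lanes * (AddrExtract + ScalarMemOp);    // 8
      unsigned ConditionalCost = Lanes * (CondExtract + Br + Phi); // 12
      std::printf("total: %u\n", LoadCost + Packing + ConditionalCost); // 24
      return 0;
    }
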
241
242protected:
243 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
244 : BaseT(DL) {}
245 virtual ~BasicTTIImplBase() = default;
246
247 using TargetTransformInfoImplBase::DL;
248
249public:
250 /// \name Scalar TTI Implementations
251 /// @{
252 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
253 unsigned AddressSpace, Align Alignment,
254 bool *Fast) const {
255 EVT E = EVT::getIntegerVT(Context, BitWidth);
256 return getTLI()->allowsMisalignedMemoryAccesses(
257 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
258 }
259
260 bool hasBranchDivergence() { return false; }
261
262 bool useGPUDivergenceAnalysis() { return false; }
263
264 bool isSourceOfDivergence(const Value *V) { return false; }
265
266 bool isAlwaysUniform(const Value *V) { return false; }
267
268 unsigned getFlatAddressSpace() {
269 // Return an invalid address space.
270 return -1;
271 }
272
273 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
274 Intrinsic::ID IID) const {
275 return false;
276 }
277
278 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
279 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
280 }
281
282 unsigned getAssumedAddrSpace(const Value *V) const {
283 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
284 }
285
286 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
287 Value *NewV) const {
288 return nullptr;
289 }
290
291 bool isLegalAddImmediate(int64_t imm) {
292 return getTLI()->isLegalAddImmediate(imm);
293 }
294
295 bool isLegalICmpImmediate(int64_t imm) {
296 return getTLI()->isLegalICmpImmediate(imm);
297 }
298
299 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
300 bool HasBaseReg, int64_t Scale,
301 unsigned AddrSpace, Instruction *I = nullptr) {
302 TargetLoweringBase::AddrMode AM;
303 AM.BaseGV = BaseGV;
304 AM.BaseOffs = BaseOffset;
305 AM.HasBaseReg = HasBaseReg;
306 AM.Scale = Scale;
307 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
308 }
309
310 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
311 const DataLayout &DL) const {
312 EVT VT = getTLI()->getValueType(DL, Ty);
313 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
314 }
315
316 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
317 const DataLayout &DL) const {
318 EVT VT = getTLI()->getValueType(DL, Ty);
319 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
320 }
321
322 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
323 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
324 }
325
326 bool isNumRegsMajorCostOfLSR() {
327 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
328 }
329
330 bool isProfitableLSRChainElement(Instruction *I) {
331 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
332 }
333
334 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
335 int64_t BaseOffset, bool HasBaseReg,
336 int64_t Scale, unsigned AddrSpace) {
337 TargetLoweringBase::AddrMode AM;
338 AM.BaseGV = BaseGV;
339 AM.BaseOffs = BaseOffset;
340 AM.HasBaseReg = HasBaseReg;
341 AM.Scale = Scale;
342 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
343 }
344
345 bool isTruncateFree(Type *Ty1, Type *Ty2) {
346 return getTLI()->isTruncateFree(Ty1, Ty2);
347 }
348
349 bool isProfitableToHoist(Instruction *I) {
350 return getTLI()->isProfitableToHoist(I);
351 }
352
353 bool useAA() const { return getST()->useAA(); }
354
355 bool isTypeLegal(Type *Ty) {
356 EVT VT = getTLI()->getValueType(DL, Ty);
357 return getTLI()->isTypeLegal(VT);
358 }
359
360 InstructionCost getRegUsageForType(Type *Ty) {
361 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
362 assert(Val >= 0 && "Negative cost!")(static_cast<void> (0));
363 return Val;
364 }
365
366 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
367 ArrayRef<const Value *> Operands) {
368 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
369 }
370
371 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
372 unsigned &JumpTableSize,
373 ProfileSummaryInfo *PSI,
374 BlockFrequencyInfo *BFI) {
375 /// Try to find the estimated number of clusters. Note that the number of
376 /// clusters identified in this function could be different from the actual
377 /// numbers found in lowering. This function ignores switches that are
378 /// lowered with a mix of jump table / bit test / BTree. This function was
379 /// initially intended to be used when estimating the cost of a switch in the
380 /// inline cost heuristic, but it's a generic cost model to be used in other
381 /// places (e.g., in loop unrolling).
382 unsigned N = SI.getNumCases();
383 const TargetLoweringBase *TLI = getTLI();
384 const DataLayout &DL = this->getDataLayout();
385
386 JumpTableSize = 0;
387 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
388
389 // Early exit if both a jump table and bit test are not allowed.
390 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
391 return N;
392
393 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
394 APInt MinCaseVal = MaxCaseVal;
395 for (auto CI : SI.cases()) {
396 const APInt &CaseVal = CI.getCaseValue()->getValue();
397 if (CaseVal.sgt(MaxCaseVal))
398 MaxCaseVal = CaseVal;
399 if (CaseVal.slt(MinCaseVal))
400 MinCaseVal = CaseVal;
401 }
402
403 // Check if suitable for a bit test
404 if (N <= DL.getIndexSizeInBits(0u)) {
405 SmallPtrSet<const BasicBlock *, 4> Dests;
406 for (auto I : SI.cases())
407 Dests.insert(I.getCaseSuccessor());
408
409 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
410 DL))
411 return 1;
412 }
413
414 // Check if suitable for a jump table.
415 if (IsJTAllowed) {
416 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
417 return N;
418 uint64_t Range =
419 (MaxCaseVal - MinCaseVal)
420 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
421 // Check whether a range of clusters is dense enough for a jump table
422 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
423 JumpTableSize = Range;
424 return 1;
425 }
426 }
427 return N;
428 }
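
The jump-table branch above collapses all cases into a single cluster when the case-value range is dense enough; the density test itself is delegated to TLI->isSuitableForJumpTable, so the threshold in this sketch is purely an assumption for illustration:

    #include <cstdio>

    int main() {
      unsigned NumCases = 10;
      long long MinCase = 0, MaxCase = 15; // assumed case values
      unsigned long long Range = (unsigned long long)(MaxCase - MinCase) + 1; // 16
      // Assumed density threshold purely for illustration: at least 40% occupancy.
      bool SuitableForJumpTable = NumCases * 10 >= Range * 4;
      std::printf("clusters: %u\n", SuitableForJumpTable ? 1u : NumCases); // 1
      return 0;
    }
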
429
430 bool shouldBuildLookupTables() {
431 const TargetLoweringBase *TLI = getTLI();
432 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
433 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
434 }
435
436 bool shouldBuildRelLookupTables() const {
437 const TargetMachine &TM = getTLI()->getTargetMachine();
438 // If non-PIC mode, do not generate a relative lookup table.
439 if (!TM.isPositionIndependent())
440 return false;
441
442 /// Relative lookup table entries consist of 32-bit offsets.
443 /// Do not generate relative lookup tables for large code models
444 /// in 64-bit architectures where 32-bit offsets might not be enough.
445 if (TM.getCodeModel() == CodeModel::Medium ||
446 TM.getCodeModel() == CodeModel::Large)
447 return false;
448
449 Triple TargetTriple = TM.getTargetTriple();
450 if (!TargetTriple.isArch64Bit())
451 return false;
452
453 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
454 // there.
455 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
456 return false;
457
458 return true;
459 }
460
461 bool haveFastSqrt(Type *Ty) {
462 const TargetLoweringBase *TLI = getTLI();
463 EVT VT = TLI->getValueType(DL, Ty);
464 return TLI->isTypeLegal(VT) &&
465 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
466 }
467
468 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
469 return true;
470 }
471
472 InstructionCost getFPOpCost(Type *Ty) {
473 // Check whether FADD is available, as a proxy for floating-point in
474 // general.
475 const TargetLoweringBase *TLI = getTLI();
476 EVT VT = TLI->getValueType(DL, Ty);
477 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
478 return TargetTransformInfo::TCC_Basic;
479 return TargetTransformInfo::TCC_Expensive;
480 }
481
482 unsigned getInliningThresholdMultiplier() { return 1; }
483 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
484
485 int getInlinerVectorBonusPercent() { return 150; }
486
487 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
488 TTI::UnrollingPreferences &UP,
489 OptimizationRemarkEmitter *ORE) {
490 // This unrolling functionality is target independent, but to provide some
491 // motivation for its intended use, for x86:
492
493 // According to the Intel 64 and IA-32 Architectures Optimization Reference
494 // Manual, Intel Core models and later have a loop stream detector (and
495 // associated uop queue) that can benefit from partial unrolling.
496 // The relevant requirements are:
497 // - The loop must have no more than 4 (8 for Nehalem and later) branches
498 // taken, and none of them may be calls.
499 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
500
501 // According to the Software Optimization Guide for AMD Family 15h
502 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
503 // and loop buffer which can benefit from partial unrolling.
504 // The relevant requirements are:
505 // - The loop must have fewer than 16 branches
506 // - The loop must have less than 40 uops in all executed loop branches
507
508 // The number of taken branches in a loop is hard to estimate here, and
509 // benchmarking has revealed that it is better not to be conservative when
510 // estimating the branch count. As a result, we'll ignore the branch limits
511 // until someone finds a case where it matters in practice.
512
513 unsigned MaxOps;
514 const TargetSubtargetInfo *ST = getST();
515 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
516 MaxOps = PartialUnrollingThreshold;
517 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
518 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
519 else
520 return;
521
522 // Scan the loop: don't unroll loops with calls.
523 for (BasicBlock *BB : L->blocks()) {
524 for (Instruction &I : *BB) {
525 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
526 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
527 if (!thisT()->isLoweredToCall(F))
528 continue;
529 }
530
531 if (ORE) {
532 ORE->emit([&]() {
533 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
534 L->getHeader())
535 << "advising against unrolling the loop because it "
536 "contains a "
537 << ore::NV("Call", &I);
538 });
539 }
540 return;
541 }
542 }
543 }
544
545 // Enable runtime and partial unrolling up to the specified size.
546 // Enable using trip count upper bound to unroll loops.
547 UP.Partial = UP.Runtime = UP.UpperBound = true;
548 UP.PartialThreshold = MaxOps;
549
550 // Avoid unrolling when optimizing for size.
551 UP.OptSizeThreshold = 0;
552 UP.PartialOptSizeThreshold = 0;
553
554 // Set the number of instructions optimized away when a "back edge"
555 // becomes a "fall through" to its default value of 2.
556 UP.BEInsns = 2;
557 }
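As a standalone illustration of the threshold selection above, here is a minimal sketch with stand-in names (not the LLVM API): HasThresholdOverride/ThresholdOverride model the PartialUnrollingThreshold option, LoopMicroOpBufferSize models the scheduler field, and UnrollPrefs models TTI::UnrollingPreferences. The scan of the loop body for calls is omitted here.

struct UnrollPrefs {
  bool Partial = false, Runtime = false, UpperBound = false;
  unsigned PartialThreshold = 0, BEInsns = 0;
};

// Mirrors the priority used above: an explicit option override wins,
// otherwise fall back to the subtarget's loop micro-op buffer size,
// otherwise leave the preferences untouched.
void tunePartialUnrolling(bool HasThresholdOverride, unsigned ThresholdOverride,
                          unsigned LoopMicroOpBufferSize, UnrollPrefs &UP) {
  unsigned MaxOps;
  if (HasThresholdOverride)
    MaxOps = ThresholdOverride;
  else if (LoopMicroOpBufferSize > 0)
    MaxOps = LoopMicroOpBufferSize;
  else
    return; // No information available: keep the defaults.
  UP.Partial = UP.Runtime = UP.UpperBound = true;
  UP.PartialThreshold = MaxOps;
  UP.BEInsns = 2;
}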
558
559 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
560 TTI::PeelingPreferences &PP) {
561 PP.PeelCount = 0;
562 PP.AllowPeeling = true;
563 PP.AllowLoopNestsPeeling = false;
564 PP.PeelProfiledIterations = true;
565 }
566
567 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
568 AssumptionCache &AC,
569 TargetLibraryInfo *LibInfo,
570 HardwareLoopInfo &HWLoopInfo) {
571 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
572 }
573
574 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
575 AssumptionCache &AC, TargetLibraryInfo *TLI,
576 DominatorTree *DT,
577 const LoopAccessInfo *LAI) {
578 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
579 }
580
581 bool emitGetActiveLaneMask() {
582 return BaseT::emitGetActiveLaneMask();
583 }
584
585 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
586 IntrinsicInst &II) {
587 return BaseT::instCombineIntrinsic(IC, II);
588 }
589
590 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
591 IntrinsicInst &II,
592 APInt DemandedMask,
593 KnownBits &Known,
594 bool &KnownBitsComputed) {
595 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
596 KnownBitsComputed);
597 }
598
599 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
600 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
601 APInt &UndefElts2, APInt &UndefElts3,
602 std::function<void(Instruction *, unsigned, APInt, APInt &)>
603 SimplifyAndSetOp) {
604 return BaseT::simplifyDemandedVectorEltsIntrinsic(
605 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
606 SimplifyAndSetOp);
607 }
608
609 InstructionCost getInstructionLatency(const Instruction *I) {
610 if (isa<LoadInst>(I))
611 return getST()->getSchedModel().DefaultLoadLatency;
612
613 return BaseT::getInstructionLatency(I);
614 }
615
616 virtual Optional<unsigned>
617 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
618 return Optional<unsigned>(
619 getST()->getCacheSize(static_cast<unsigned>(Level)));
620 }
621
622 virtual Optional<unsigned>
623 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
624 Optional<unsigned> TargetResult =
625 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
626
627 if (TargetResult)
628 return TargetResult;
629
630 return BaseT::getCacheAssociativity(Level);
631 }
632
633 virtual unsigned getCacheLineSize() const {
634 return getST()->getCacheLineSize();
635 }
636
637 virtual unsigned getPrefetchDistance() const {
638 return getST()->getPrefetchDistance();
639 }
640
641 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
642 unsigned NumStridedMemAccesses,
643 unsigned NumPrefetches,
644 bool HasCall) const {
645 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
646 NumPrefetches, HasCall);
647 }
648
649 virtual unsigned getMaxPrefetchIterationsAhead() const {
650 return getST()->getMaxPrefetchIterationsAhead();
651 }
652
653 virtual bool enableWritePrefetching() const {
654 return getST()->enableWritePrefetching();
655 }
656
657 /// @}
658
659 /// \name Vector TTI Implementations
660 /// @{
661
662 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
663 return TypeSize::getFixed(32);
664 }
665
666 Optional<unsigned> getMaxVScale() const { return None; }
667
668 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
669 /// are set if the demanded result elements need to be inserted and/or
670 /// extracted from vectors.
671 InstructionCost getScalarizationOverhead(VectorType *InTy,
672 const APInt &DemandedElts,
673 bool Insert, bool Extract) {
674 /// FIXME: a bitfield is not a reasonable abstraction for talking about
675 /// which elements are needed from a scalable vector
676 auto *Ty = cast<FixedVectorType>(InTy);
677
678 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&(static_cast<void> (0))
679 "Vector size mismatch")(static_cast<void> (0));
680
681 InstructionCost Cost = 0;
682
683 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
684 if (!DemandedElts[i])
685 continue;
686 if (Insert)
687 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
688 if (Extract)
689 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
690 }
691
692 return Cost;
693 }
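A minimal standalone sketch of the per-element accounting above, using a plain bitmask and a hypothetical flat cost of 1 per insert/extract in place of the target's getVectorInstrCost:

#include <cstdint>

// For a 4-element vector with DemandedElts = 0b1010 (elements 1 and 3 demanded),
// Insert == Extract == true yields 2 inserts + 2 extracts = 4.
unsigned scalarizationOverhead(uint64_t DemandedElts, unsigned NumElts,
                               bool Insert, bool Extract) {
  unsigned Cost = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    if (!((DemandedElts >> i) & 1))
      continue;
    if (Insert)
      Cost += 1; // stand-in for getVectorInstrCost(InsertElement, Ty, i)
    if (Extract)
      Cost += 1; // stand-in for getVectorInstrCost(ExtractElement, Ty, i)
  }
  return Cost;
}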
694
695 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
696 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
697 bool Extract) {
698 auto *Ty = cast<FixedVectorType>(InTy);
699
700 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
701 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
702 }
703
704 /// Estimate the overhead of scalarizing an instruction's unique
705 /// non-constant operands. The (potentially vector) types to use for each
706 /// argument are passed via Tys.
707 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
708 ArrayRef<Type *> Tys) {
709 assert(Args.size() == Tys.size() && "Expected matching Args and Tys")(static_cast<void> (0));
710
711 InstructionCost Cost = 0;
712 SmallPtrSet<const Value*, 4> UniqueOperands;
713 for (int I = 0, E = Args.size(); I != E; I++) {
714 // Disregard things like metadata arguments.
715 const Value *A = Args[I];
716 Type *Ty = Tys[I];
717 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
718 !Ty->isPtrOrPtrVectorTy())
719 continue;
720
721 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
722 if (auto *VecTy = dyn_cast<VectorType>(Ty))
723 Cost += getScalarizationOverhead(VecTy, false, true);
724 }
725 }
726
727 return Cost;
728 }
729
730 /// Estimate the overhead of scalarizing the inputs and outputs of an
731 /// instruction, with return type RetTy and arguments Args of type Tys. If
732 /// Args are unknown (empty), then the cost associated with one argument is
733 /// added as a heuristic.
734 InstructionCost getScalarizationOverhead(VectorType *RetTy,
735 ArrayRef<const Value *> Args,
736 ArrayRef<Type *> Tys) {
737 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
738 if (!Args.empty())
739 Cost += getOperandsScalarizationOverhead(Args, Tys);
740 else
741 // When no information on arguments is provided, we add the cost
742 // associated with one argument as a heuristic.
743 Cost += getScalarizationOverhead(RetTy, false, true);
744
745 return Cost;
746 }
747
748 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
749
750 InstructionCost getArithmeticInstrCost(
751 unsigned Opcode, Type *Ty,
752 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
753 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
754 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
755 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
756 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
757 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
758 const Instruction *CxtI = nullptr) {
759 // Check if any of the operands are vector operands.
760 const TargetLoweringBase *TLI = getTLI();
761 int ISD = TLI->InstructionOpcodeToISD(Opcode);
762 assert(ISD && "Invalid opcode")(static_cast<void> (0));
763
764 // TODO: Handle more cost kinds.
765 if (CostKind != TTI::TCK_RecipThroughput)
766 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
767 Opd1Info, Opd2Info,
768 Opd1PropInfo, Opd2PropInfo,
769 Args, CxtI);
770
771 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
772
773 bool IsFloat = Ty->isFPOrFPVectorTy();
774 // Assume that floating point arithmetic operations cost twice as much as
775 // integer operations.
776 InstructionCost OpCost = (IsFloat ? 2 : 1);
777
778 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
779 // The operation is legal. Assume it costs 1.
780 // TODO: Once we have extract/insert subvector cost we need to use them.
781 return LT.first * OpCost;
782 }
783
784 if (!TLI->isOperationExpand(ISD, LT.second)) {
785 // If the operation is custom lowered, then assume that the code is twice
786 // as expensive.
787 return LT.first * 2 * OpCost;
788 }
789
790 // An 'Expand' of URem and SRem is special because it may default
791 // to expanding the operation into a sequence of sub-operations
792 // i.e. X % Y -> X-(X/Y)*Y.
793 if (ISD == ISD::UREM || ISD == ISD::SREM) {
794 bool IsSigned = ISD == ISD::SREM;
795 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
796 LT.second) ||
797 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
798 LT.second)) {
799 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
800 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
801 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
802 Opd2PropInfo);
803 InstructionCost MulCost =
804 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
805 InstructionCost SubCost =
806 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
807 return DivCost + MulCost + SubCost;
808 }
809 }
810
811 // We cannot scalarize scalable vectors, so return Invalid.
812 if (isa<ScalableVectorType>(Ty))
813 return InstructionCost::getInvalid();
814
815 // Else, assume that we need to scalarize this op.
816 // TODO: If one of the types gets legalized by splitting, handle this
817 // similarly to what getCastInstrCost() does.
818 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
819 InstructionCost Cost = thisT()->getArithmeticInstrCost(
820 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
821 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
822 // Return the cost of multiple scalar invocations plus the cost of
823 // inserting and extracting the values.
824 SmallVector<Type *> Tys(Args.size(), Ty);
825 return getScalarizationOverhead(VTy, Args, Tys) +
826 VTy->getNumElements() * Cost;
827 }
828
829 // We don't know anything about this scalar instruction.
830 return OpCost;
831 }
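The cost tiers above can be summarized in a standalone sketch with hypothetical per-operation costs (the real code queries TLI for legality and recurses through thisT(); the names below are stand-ins):

enum class LowerAction { Legal, Custom, Expand };

// Simplified mirror of the tiers above. LTFirst stands in for the
// type-legalization split factor, OpCost for the 1 (integer) vs 2 (FP) weight.
unsigned arithmeticCost(LowerAction A, unsigned LTFirst, unsigned OpCost,
                        bool IsRem, unsigned DivCost, unsigned MulCost,
                        unsigned SubCost) {
  if (A == LowerAction::Legal)
    return LTFirst * OpCost;        // legal: one op per legalized part
  if (A == LowerAction::Custom)
    return LTFirst * 2 * OpCost;    // custom lowering: assume twice as costly
  if (IsRem)                        // urem/srem expand as X - (X / Y) * Y
    return DivCost + MulCost + SubCost;
  return OpCost;                    // unknown scalar op: fall back
}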
832
833 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
834 ArrayRef<int> Mask) const {
835 int Limit = Mask.size() * 2;
836 if (Mask.empty() ||
837 // Extra check required by isSingleSourceMaskImpl function (called by
838 // ShuffleVectorInst::isSingleSourceMask).
839 any_of(Mask, [Limit](int I) { return I >= Limit; }))
840 return Kind;
841 switch (Kind) {
842 case TTI::SK_PermuteSingleSrc:
843 if (ShuffleVectorInst::isReverseMask(Mask))
844 return TTI::SK_Reverse;
845 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
846 return TTI::SK_Broadcast;
847 break;
848 case TTI::SK_PermuteTwoSrc:
849 if (ShuffleVectorInst::isSelectMask(Mask))
850 return TTI::SK_Select;
851 if (ShuffleVectorInst::isTransposeMask(Mask))
852 return TTI::SK_Transpose;
853 break;
854 case TTI::SK_Select:
855 case TTI::SK_Reverse:
856 case TTI::SK_Broadcast:
857 case TTI::SK_Transpose:
858 case TTI::SK_InsertSubvector:
859 case TTI::SK_ExtractSubvector:
860 case TTI::SK_Splice:
861 break;
862 }
863 return Kind;
864 }
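For intuition, the mask shapes the refinement above keys on can be checked with simplified predicates over plain index vectors (the real code uses the ShuffleVectorInst helpers, which also accept -1 "undef" lanes):

#include <vector>

// On a single source, <3,2,1,0> is a reverse and <0,0,0,0> is a broadcast of lane 0.
bool isReverseMaskSimple(const std::vector<int> &Mask) {
  int E = static_cast<int>(Mask.size());
  for (int I = 0; I != E; ++I)
    if (Mask[I] != E - 1 - I)
      return false;
  return true;
}

bool isZeroEltSplatMaskSimple(const std::vector<int> &Mask) {
  for (int Idx : Mask)
    if (Idx != 0)
      return false;
  return true;
}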
865
866 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
867 ArrayRef<int> Mask, int Index,
868 VectorType *SubTp) {
869
870 switch (improveShuffleKindFromMask(Kind, Mask)) {
871 case TTI::SK_Broadcast:
872 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
873 case TTI::SK_Select:
874 case TTI::SK_Splice:
875 case TTI::SK_Reverse:
876 case TTI::SK_Transpose:
877 case TTI::SK_PermuteSingleSrc:
878 case TTI::SK_PermuteTwoSrc:
879 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
880 case TTI::SK_ExtractSubvector:
881 return getExtractSubvectorOverhead(Tp, Index,
882 cast<FixedVectorType>(SubTp));
883 case TTI::SK_InsertSubvector:
884 return getInsertSubvectorOverhead(Tp, Index,
885 cast<FixedVectorType>(SubTp));
886 }
887 llvm_unreachable("Unknown TTI::ShuffleKind")__builtin_unreachable();
888 }
889
890 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
891 TTI::CastContextHint CCH,
892 TTI::TargetCostKind CostKind,
893 const Instruction *I = nullptr) {
894 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
895 return 0;
896
897 const TargetLoweringBase *TLI = getTLI();
898 int ISD = TLI->InstructionOpcodeToISD(Opcode);
899 assert(ISD && "Invalid opcode")(static_cast<void> (0));
900 std::pair<InstructionCost, MVT> SrcLT =
901 TLI->getTypeLegalizationCost(DL, Src);
902 std::pair<InstructionCost, MVT> DstLT =
903 TLI->getTypeLegalizationCost(DL, Dst);
904
905 TypeSize SrcSize = SrcLT.second.getSizeInBits();
906 TypeSize DstSize = DstLT.second.getSizeInBits();
907 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
908 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
909
910 switch (Opcode) {
911 default:
912 break;
913 case Instruction::Trunc:
914 // Check for NOOP conversions.
915 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
916 return 0;
917 LLVM_FALLTHROUGH[[gnu::fallthrough]];
918 case Instruction::BitCast:
919 // Bitcasts between types that are legalized to the same type are free,
920 // and an int to/from ptr cast of the same size is also assumed to be free.
921 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
922 SrcSize == DstSize)
923 return 0;
924 break;
925 case Instruction::FPExt:
926 if (I && getTLI()->isExtFree(I))
927 return 0;
928 break;
929 case Instruction::ZExt:
930 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
931 return 0;
932 LLVM_FALLTHROUGH[[gnu::fallthrough]];
933 case Instruction::SExt:
934 if (I && getTLI()->isExtFree(I))
935 return 0;
936
937 // If this is a zext/sext of a load, return 0 if the corresponding
938 // extending load exists on the target and the result type is legal.
939 if (CCH == TTI::CastContextHint::Normal) {
940 EVT ExtVT = EVT::getEVT(Dst);
941 EVT LoadVT = EVT::getEVT(Src);
942 unsigned LType =
943 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
944 if (DstLT.first == SrcLT.first &&
945 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
946 return 0;
947 }
948 break;
949 case Instruction::AddrSpaceCast:
950 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
951 Dst->getPointerAddressSpace()))
952 return 0;
953 break;
954 }
955
956 auto *SrcVTy = dyn_cast<VectorType>(Src);
957 auto *DstVTy = dyn_cast<VectorType>(Dst);
958
959 // If the cast is marked as legal (or promote) then assume low cost.
960 if (SrcLT.first == DstLT.first &&
961 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
962 return SrcLT.first;
963
964 // Handle scalar conversions.
965 if (!SrcVTy && !DstVTy) {
966 // Just check the op cost. If the operation is legal then assume it costs
967 // 1.
968 if (!TLI->isOperationExpand(ISD, DstLT.second))
969 return 1;
970
971 // Assume that illegal scalar instructions are expensive.
972 return 4;
973 }
974
975 // Check vector-to-vector casts.
976 if (DstVTy && SrcVTy) {
977 // If the cast is between same-sized registers, then the check is simple.
978 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
979
980 // Assume that Zext is done using AND.
981 if (Opcode == Instruction::ZExt)
982 return SrcLT.first;
983
984 // Assume that sext is done using SHL and SRA.
985 if (Opcode == Instruction::SExt)
986 return SrcLT.first * 2;
987
988 // Just check the op cost. If the operation is legal then assume
989 // it costs 1 and multiply by the type-legalization
990 // overhead.
991 if (!TLI->isOperationExpand(ISD, DstLT.second))
992 return SrcLT.first * 1;
993 }
994
995 // If we are legalizing by splitting, query the concrete TTI for the cost
996 // of casting the original vector twice. We also need to factor in the
997 // cost of the split itself. Count that as 1, to be consistent with
998 // TLI->getTypeLegalizationCost().
999 bool SplitSrc =
1000 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1001 TargetLowering::TypeSplitVector;
1002 bool SplitDst =
1003 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1004 TargetLowering::TypeSplitVector;
1005 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1006 DstVTy->getElementCount().isVector()) {
1007 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1008 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1009 T *TTI = static_cast<T *>(this);
1010 // If both types need to be split then the split is free.
1011 InstructionCost SplitCost =
1012 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1013 return SplitCost +
1014 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1015 CostKind, I));
1016 }
1017
1018 // The scalarization cost is Invalid; we can't assume any number of elements.
1019 if (isa<ScalableVectorType>(DstVTy))
1020 return InstructionCost::getInvalid();
1021
1022 // In other cases where the source or destination are illegal, assume
1023 // the operation will get scalarized.
1024 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1025 InstructionCost Cost = thisT()->getCastInstrCost(
1026 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1027
1028 // Return the cost of multiple scalar invocations plus the cost of
1029 // inserting and extracting the values.
1030 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1031 }
1032
1033 // We already handled vector-to-vector and scalar-to-scalar conversions.
1034 // This is where we handle bitcasts between vectors and scalars. We
1035 // need to assume that the conversion is scalarized in one way or
1036 // another.
1037 if (Opcode == Instruction::BitCast) {
1038 // Illegal bitcasts are done by storing and loading from a stack slot.
1039 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1040 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1041 }
1042
1043 llvm_unreachable("Unhandled cast")__builtin_unreachable();
1044 }
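When legalization splits one or both vector types, the branch above charges two casts on the half-width types plus at most one split. A compact restatement of that arithmetic, with hypothetical component costs as inputs:

// If both sides are split, the split itself is counted as free; otherwise one
// getVectorSplitCost() is added on top of the two half-width casts.
unsigned splitCastCost(bool SplitSrc, bool SplitDst, unsigned VectorSplitCost,
                       unsigned HalfWidthCastCost) {
  unsigned SplitCost = (SplitSrc && SplitDst) ? 0 : VectorSplitCost;
  return SplitCost + 2 * HalfWidthCastCost;
}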
1045
1046 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1047 VectorType *VecTy, unsigned Index) {
1048 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1049 Index) +
1050 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1051 TTI::CastContextHint::None,
1052 TTI::TCK_RecipThroughput);
1053 }
1054
1055 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1056 const Instruction *I = nullptr) {
1057 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1058 }
1059
1060 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1061 CmpInst::Predicate VecPred,
1062 TTI::TargetCostKind CostKind,
1063 const Instruction *I = nullptr) {
1064 const TargetLoweringBase *TLI = getTLI();
1065 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1066 assert(ISD && "Invalid opcode")(static_cast<void> (0));
1067
1068 // TODO: Handle other cost kinds.
1069 if (CostKind != TTI::TCK_RecipThroughput)
61
Assuming 'CostKind' is equal to TCK_RecipThroughput
62
Taking false branch
1070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1071 I);
1072
1073 // Selects on vectors are actually vector selects.
1074 if (ISD == ISD::SELECT) {
63
Assuming 'ISD' is equal to SELECT
64
Taking true branch
1075 assert(CondTy && "CondTy must exist")(static_cast<void> (0));
1076 if (CondTy->isVectorTy())
65
Called C++ object pointer is null
1077 ISD = ISD::VSELECT;
1078 }
1079 std::pair<InstructionCost, MVT> LT =
1080 TLI->getTypeLegalizationCost(DL, ValTy);
1081
1082 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1083 !TLI->isOperationExpand(ISD, LT.second)) {
1084 // The operation is legal. Assume it costs 1. Multiply
1085 // by the type-legalization overhead.
1086 return LT.first * 1;
1087 }
1088
1089 // Otherwise, assume that the cast is scalarized.
1090 // TODO: If one of the types gets legalized by splitting, handle this
1091 // similarly to what getCastInstrCost() does.
1092 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1093 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1094 if (CondTy)
1095 CondTy = CondTy->getScalarType();
1096 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1097 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1098
1099 // Return the cost of multiple scalar invocations plus the cost of
1100 // inserting and extracting the values.
1101 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1102 }
1103
1104 // Unknown scalar opcode.
1105 return 1;
1106 }
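Line 1076 above is the dereference the analyzer reports: with NDEBUG the assert on line 1075 expands to a no-op (as shown), so a caller that passes a null CondTy reaches CondTy->isVectorTy() unchecked. One possible defensive shape for the select handling, shown only as a sketch and not as the upstream fix, would be:

// Guard the dereference instead of relying on the compiled-out assert.
if (ISD == ISD::SELECT && CondTy && CondTy->isVectorTy())
  ISD = ISD::VSELECT;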
1107
1108 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1109 unsigned Index) {
1110 std::pair<InstructionCost, MVT> LT =
1111 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1112
1113 return LT.first;
1114 }
1115
1116 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1117 MaybeAlign Alignment, unsigned AddressSpace,
1118 TTI::TargetCostKind CostKind,
1119 const Instruction *I = nullptr) {
1120 assert(!Src->isVoidTy() && "Invalid type")(static_cast<void> (0));
1121 // Assume types, such as structs, are expensive.
1122 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1123 return 4;
1124 std::pair<InstructionCost, MVT> LT =
1125 getTLI()->getTypeLegalizationCost(DL, Src);
1126
1127 // Assuming that all loads of legal types cost 1.
1128 InstructionCost Cost = LT.first;
1129 if (CostKind != TTI::TCK_RecipThroughput)
1130 return Cost;
1131
1132 if (Src->isVectorTy() &&
1133 // In practice it's not currently possible to have a change in lane
1134 // length for extending loads or truncating stores so both types should
1135 // have the same scalable property.
1136 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1137 LT.second.getSizeInBits())) {
1138 // This is a vector load that legalizes to a larger type than the vector
1139 // itself. Unless the corresponding extending load or truncating store is
1140 // legal, this will be scalarized.
1141 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1142 EVT MemVT = getTLI()->getValueType(DL, Src);
1143 if (Opcode == Instruction::Store)
1144 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1145 else
1146 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1147
1148 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1149 // This is a vector load/store for some illegal type that is scalarized.
1150 // We must account for the cost of building or decomposing the vector.
1151 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1152 Opcode != Instruction::Store,
1153 Opcode == Instruction::Store);
1154 }
1155 }
1156
1157 return Cost;
1158 }
1159
1160 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1161 Align Alignment, unsigned AddressSpace,
1162 TTI::TargetCostKind CostKind) {
1163 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1164 CostKind);
1165 }
1166
1167 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1168 const Value *Ptr, bool VariableMask,
1169 Align Alignment,
1170 TTI::TargetCostKind CostKind,
1171 const Instruction *I = nullptr) {
1172 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1173 true, CostKind);
1174 }
1175
1176 InstructionCost getInterleavedMemoryOpCost(
1177 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1178 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1179 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1180 auto *VT = cast<FixedVectorType>(VecTy);
1181
1182 unsigned NumElts = VT->getNumElements();
1183 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")(static_cast<void> (0));
1184
1185 unsigned NumSubElts = NumElts / Factor;
1186 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1187
1188 // Firstly, the cost of load/store operation.
1189 InstructionCost Cost;
1190 if (UseMaskForCond || UseMaskForGaps)
1191 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1192 AddressSpace, CostKind);
1193 else
1194 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1195 CostKind);
1196
1197 // Legalize the vector type, and get the legalized and unlegalized type
1198 // sizes.
1199 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1200 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1201 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1202
1203 // Scale the cost of the memory operation by the fraction of legalized
1204 // instructions that will actually be used. We shouldn't account for the
1205 // cost of dead instructions since they will be removed.
1206 //
1207 // E.g., An interleaved load of factor 8:
1208 // %vec = load <16 x i64>, <16 x i64>* %ptr
1209 // %v0 = shufflevector %vec, undef, <0, 8>
1210 //
1211 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1212 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1213 // type). The other loads are unused.
1214 //
1215 // TODO: Note that legalization can turn masked loads/stores into unmasked
1216 // (legalized) loads/stores. This can be reflected in the cost.
1217 if (VecTySize > VecTyLTSize) {
1218 // The number of loads of a legal type it will take to represent a load
1219 // of the unlegalized vector type.
1220 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1221
1222 // The number of elements of the unlegalized type that correspond to a
1223 // single legal instruction.
1224 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1225
1226 // Determine which legal instructions will be used.
1227 BitVector UsedInsts(NumLegalInsts, false);
1228 for (unsigned Index : Indices)
1229 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1230 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1231
1232 // Scale the cost of the load by the fraction of legal instructions that
1233 // will be used.
1234 Cost *= UsedInsts.count() / NumLegalInsts;
1235 }
1236
1237 // Then add the cost of the interleave operation.
1238 assert(Indices.size() <= Factor &&(static_cast<void> (0))
1239 "Interleaved memory op has too many members")(static_cast<void> (0));
1240 if (Opcode == Instruction::Load) {
1241 // The interleave cost is similar to extracting the sub vectors' elements
1242 // from the wide vector and inserting them into the sub vectors.
1243 //
1244 // E.g. An interleaved load of factor 2 (with one member of index 0):
1245 // %vec = load <8 x i32>, <8 x i32>* %ptr
1246 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1247 // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1248 // <8 x i32> vector and inserting them into a <4 x i32> vector.
1249 for (unsigned Index : Indices) {
1250 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1251
1252 // Extract elements from loaded vector for each sub vector.
1253 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1254 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1255 Index + Elm * Factor);
1256 }
1257
1258 InstructionCost InsSubCost = 0;
1259 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1260 InsSubCost +=
1261 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);
1262
1263 Cost += Indices.size() * InsSubCost;
1264 } else {
1265 // The interleave cost is that of extracting elements from the sub vectors
1266 // and inserting them into the wide vector.
1267 //
1268 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1269 // (using VF=4):
1270 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1271 // %gaps.mask = <true, true, false, true, true, false,
1272 // true, true, false, true, true, false>
1273 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1274 // i32 Align, <12 x i1> %gaps.mask
1275 // The cost is estimated as extracting all elements (of actual members,
1276 // excluding gaps) from both <4 x i32> vectors and inserting them into the
1277 // <12 x i32> vector.
1278 InstructionCost ExtSubCost = 0;
1279 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1280 ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
1281 SubVT, Elm);
1282 Cost += ExtSubCost * Indices.size();
1283
1284 for (unsigned Index : Indices) {
1285 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1286
1287 // Insert elements from loaded vector for each sub vector.
1288 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1289 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
1290 Index + Elm * Factor);
1291 }
1292 }
1293
1294 if (!UseMaskForCond)
1295 return Cost;
1296
1297 Type *I8Type = Type::getInt8Ty(VT->getContext());
1298 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1299 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1300
1301 // The Mask shuffling cost is that of extracting all the elements of the Mask
1302 // and inserting each of them Factor times into the wide vector:
1303 //
1304 // E.g. an interleaved group with factor 3:
1305 // %mask = icmp ult <8 x i32> %vec1, %vec2
1306 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1307 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1308 // The cost is estimated as extract all mask elements from the <8xi1> mask
1309 // vector and insert them factor times into the <24xi1> shuffled mask
1310 // vector.
1311 for (unsigned i = 0; i < NumSubElts; i++)
1312 Cost +=
1313 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1314
1315 for (unsigned i = 0; i < NumElts; i++)
1316 Cost +=
1317 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1318
1319 // The Gaps mask is invariant and created outside the loop, therefore the
1320 // cost of creating it is not accounted for here. However if we have both
1321 // a MaskForGaps and some other mask that guards the execution of the
1322 // memory access, we need to account for the cost of And-ing the two masks
1323 // inside the loop.
1324 if (UseMaskForGaps)
1325 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1326 CostKind);
1327
1328 return Cost;
1329 }
1330
1331 /// Get intrinsic cost based on arguments.
1332 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1333 TTI::TargetCostKind CostKind) {
1334 // Check for generically free intrinsics.
1335 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
3
Calling 'InstructionCost::operator=='
8
Returning from 'InstructionCost::operator=='
9
Taking false branch
1336 return 0;
1337
1338 // Assume that target intrinsics are cheap.
1339 Intrinsic::ID IID = ICA.getID();
1340 if (Function::isTargetIntrinsic(IID))
10
Assuming the condition is false
11
Taking false branch
1341 return TargetTransformInfo::TCC_Basic;
1342
1343 if (ICA.isTypeBasedOnly())
12
Calling 'IntrinsicCostAttributes::isTypeBasedOnly'
18
Returning from 'IntrinsicCostAttributes::isTypeBasedOnly'
19
Taking false branch
1344 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1345
1346 Type *RetTy = ICA.getReturnType();
1347
1348 ElementCount RetVF =
1349 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
20
'?' condition is false
1350 : ElementCount::getFixed(1));
1351 const IntrinsicInst *I = ICA.getInst();
1352 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1353 FastMathFlags FMF = ICA.getFlags();
1354 switch (IID) {
21
Control jumps to 'case fshl:' at line 1445
1355 default:
1356 break;
1357
1358 case Intrinsic::cttz:
1359 // FIXME: If necessary, this should go in target-specific overrides.
1360 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1361 return TargetTransformInfo::TCC_Basic;
1362 break;
1363
1364 case Intrinsic::ctlz:
1365 // FIXME: If necessary, this should go in target-specific overrides.
1366 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1367 return TargetTransformInfo::TCC_Basic;
1368 break;
1369
1370 case Intrinsic::memcpy:
1371 return thisT()->getMemcpyCost(ICA.getInst());
1372
1373 case Intrinsic::masked_scatter: {
1374 const Value *Mask = Args[3];
1375 bool VarMask = !isa<Constant>(Mask);
1376 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1377 return thisT()->getGatherScatterOpCost(Instruction::Store,
1378 ICA.getArgTypes()[0], Args[1],
1379 VarMask, Alignment, CostKind, I);
1380 }
1381 case Intrinsic::masked_gather: {
1382 const Value *Mask = Args[2];
1383 bool VarMask = !isa<Constant>(Mask);
1384 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1385 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1386 VarMask, Alignment, CostKind, I);
1387 }
1388 case Intrinsic::experimental_stepvector: {
1389 if (isa<ScalableVectorType>(RetTy))
1390 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1391 // The cost of materialising a constant integer vector.
1392 return TargetTransformInfo::TCC_Basic;
1393 }
1394 case Intrinsic::experimental_vector_extract: {
1395 // FIXME: Handle case where a scalable vector is extracted from a scalable
1396 // vector
1397 if (isa<ScalableVectorType>(RetTy))
1398 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1399 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1400 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1401 cast<VectorType>(Args[0]->getType()), None,
1402 Index, cast<VectorType>(RetTy));
1403 }
1404 case Intrinsic::experimental_vector_insert: {
1405 // FIXME: Handle case where a scalable vector is inserted into a scalable
1406 // vector
1407 if (isa<ScalableVectorType>(Args[1]->getType()))
1408 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1409 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1410 return thisT()->getShuffleCost(
1411 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1412 Index, cast<VectorType>(Args[1]->getType()));
1413 }
1414 case Intrinsic::experimental_vector_reverse: {
1415 return thisT()->getShuffleCost(TTI::SK_Reverse,
1416 cast<VectorType>(Args[0]->getType()), None,
1417 0, cast<VectorType>(RetTy));
1418 }
1419 case Intrinsic::experimental_vector_splice: {
1420 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1421 return thisT()->getShuffleCost(TTI::SK_Splice,
1422 cast<VectorType>(Args[0]->getType()), None,
1423 Index, cast<VectorType>(RetTy));
1424 }
1425 case Intrinsic::vector_reduce_add:
1426 case Intrinsic::vector_reduce_mul:
1427 case Intrinsic::vector_reduce_and:
1428 case Intrinsic::vector_reduce_or:
1429 case Intrinsic::vector_reduce_xor:
1430 case Intrinsic::vector_reduce_smax:
1431 case Intrinsic::vector_reduce_smin:
1432 case Intrinsic::vector_reduce_fmax:
1433 case Intrinsic::vector_reduce_fmin:
1434 case Intrinsic::vector_reduce_umax:
1435 case Intrinsic::vector_reduce_umin: {
1436 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1437 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1438 }
1439 case Intrinsic::vector_reduce_fadd:
1440 case Intrinsic::vector_reduce_fmul: {
1441 IntrinsicCostAttributes Attrs(
1442 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1443 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1444 }
1445 case Intrinsic::fshl:
1446 case Intrinsic::fshr: {
1447 if (isa<ScalableVectorType>(RetTy))
22
Assuming 'RetTy' is not a 'ScalableVectorType'
23
Taking false branch
1448 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1449 const Value *X = Args[0];
1450 const Value *Y = Args[1];
1451 const Value *Z = Args[2];
1452 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1453 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1454 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1455 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1456 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1457 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
24
'?' condition is false
1458 : TTI::OP_None;
1459 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1460 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1461 InstructionCost Cost = 0;
1462 Cost +=
1463 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1464 Cost +=
1465 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1466 Cost += thisT()->getArithmeticInstrCost(
1467 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1468 Cost += thisT()->getArithmeticInstrCost(
1469 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1470 // Non-constant shift amounts require a modulo.
1471 if (OpKindZ != TTI::OK_UniformConstantValue &&
25
Assuming 'OpKindZ' is equal to OK_UniformConstantValue
1472 OpKindZ != TTI::OK_NonUniformConstantValue)
1473 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1474 CostKind, OpKindZ, OpKindBW,
1475 OpPropsZ, OpPropsBW);
1476 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1477 if (X != Y) {
26
Assuming 'X' is not equal to 'Y'
27
Taking true branch
1478 Type *CondTy = RetTy->getWithNewBitWidth(1);
28
Calling 'Type::getWithNewBitWidth'
36
Returning from 'Type::getWithNewBitWidth'
37
'CondTy' initialized here
1479 Cost +=
1480 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
38
Passing 'CondTy' via 3rd parameter 'CondTy'
39
Calling 'ARMTTIImpl::getCmpSelInstrCost'
1481 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1482 Cost +=
1483 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1484 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1485 }
1486 return Cost;
1487 }
1488 }
1489
1490 // Assume that we need to scalarize this intrinsic.
1491 // Compute the scalarization overhead based on Args for a vector
1492 // intrinsic.
1493 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1494 if (RetVF.isVector() && !RetVF.isScalable()) {
1495 ScalarizationCost = 0;
1496 if (!RetTy->isVoidTy())
1497 ScalarizationCost +=
1498 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1499 ScalarizationCost +=
1500 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1501 }
1502
1503 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1504 ScalarizationCost);
1505 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1506 }
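The fshl/fshr case above prices the expansion written in its comment term by term. A standalone restatement of that summation, with each per-opcode cost passed in as a hypothetical number rather than queried from the target:

// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
unsigned funnelShiftCost(unsigned OrCost, unsigned SubCost, unsigned ShlCost,
                         unsigned LShrCost, unsigned URemCost,
                         unsigned CmpCost, unsigned SelCost,
                         bool ConstantShiftAmount, bool IsRotate) {
  unsigned Cost = OrCost + SubCost + ShlCost + LShrCost;
  if (!ConstantShiftAmount) // a non-constant Z needs the explicit modulo
    Cost += URemCost;
  if (!IsRotate)            // X != Y: add shift-by-zero compare/select handling
    Cost += CmpCost + SelCost;
  return Cost;
}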
1507
1508 /// Get intrinsic cost based on argument types.
1509 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1510 /// cost of scalarizing the arguments and the return value will be computed
1511 /// based on types.
1512 InstructionCost
1513 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1514 TTI::TargetCostKind CostKind) {
1515 Intrinsic::ID IID = ICA.getID();
1516 Type *RetTy = ICA.getReturnType();
1517 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1518 FastMathFlags FMF = ICA.getFlags();
1519 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1520 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1521
1522 VectorType *VecOpTy = nullptr;
1523 if (!Tys.empty()) {
1524 // The vector reduction operand is operand 0 except for fadd/fmul.
1525 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1526 unsigned VecTyIndex = 0;
1527 if (IID == Intrinsic::vector_reduce_fadd ||
1528 IID == Intrinsic::vector_reduce_fmul)
1529 VecTyIndex = 1;
1530 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")(static_cast<void> (0));
1531 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1532 }
1533
1534 // Library call cost - other than size, make it expensive.
1535 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1536 SmallVector<unsigned, 2> ISDs;
1537 switch (IID) {
1538 default: {
1539 // Scalable vectors cannot be scalarized, so return Invalid.
1540 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1541 return isa<ScalableVectorType>(Ty);
1542 }))
1543 return InstructionCost::getInvalid();
1544
1545 // Assume that we need to scalarize this intrinsic.
1546 InstructionCost ScalarizationCost =
1547 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1548 unsigned ScalarCalls = 1;
1549 Type *ScalarRetTy = RetTy;
1550 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1551 if (!SkipScalarizationCost)
1552 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1553 ScalarCalls = std::max(ScalarCalls,
1554 cast<FixedVectorType>(RetVTy)->getNumElements());
1555 ScalarRetTy = RetTy->getScalarType();
1556 }
1557 SmallVector<Type *, 4> ScalarTys;
1558 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1559 Type *Ty = Tys[i];
1560 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1561 if (!SkipScalarizationCost)
1562 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1563 ScalarCalls = std::max(ScalarCalls,
1564 cast<FixedVectorType>(VTy)->getNumElements());
1565 Ty = Ty->getScalarType();
1566 }
1567 ScalarTys.push_back(Ty);
1568 }
1569 if (ScalarCalls == 1)
1570 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1571
1572 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1573 InstructionCost ScalarCost =
1574 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1575
1576 return ScalarCalls * ScalarCost + ScalarizationCost;
1577 }
1578 // Look for intrinsics that can be lowered directly or turned into a scalar
1579 // intrinsic call.
1580 case Intrinsic::sqrt:
1581 ISDs.push_back(ISD::FSQRT);
1582 break;
1583 case Intrinsic::sin:
1584 ISDs.push_back(ISD::FSIN);
1585 break;
1586 case Intrinsic::cos:
1587 ISDs.push_back(ISD::FCOS);
1588 break;
1589 case Intrinsic::exp:
1590 ISDs.push_back(ISD::FEXP);
1591 break;
1592 case Intrinsic::exp2:
1593 ISDs.push_back(ISD::FEXP2);
1594 break;
1595 case Intrinsic::log:
1596 ISDs.push_back(ISD::FLOG);
1597 break;
1598 case Intrinsic::log10:
1599 ISDs.push_back(ISD::FLOG10);
1600 break;
1601 case Intrinsic::log2:
1602 ISDs.push_back(ISD::FLOG2);
1603 break;
1604 case Intrinsic::fabs:
1605 ISDs.push_back(ISD::FABS);
1606 break;
1607 case Intrinsic::canonicalize:
1608 ISDs.push_back(ISD::FCANONICALIZE);
1609 break;
1610 case Intrinsic::minnum:
1611 ISDs.push_back(ISD::FMINNUM);
1612 break;
1613 case Intrinsic::maxnum:
1614 ISDs.push_back(ISD::FMAXNUM);
1615 break;
1616 case Intrinsic::minimum:
1617 ISDs.push_back(ISD::FMINIMUM);
1618 break;
1619 case Intrinsic::maximum:
1620 ISDs.push_back(ISD::FMAXIMUM);
1621 break;
1622 case Intrinsic::copysign:
1623 ISDs.push_back(ISD::FCOPYSIGN);
1624 break;
1625 case Intrinsic::floor:
1626 ISDs.push_back(ISD::FFLOOR);
1627 break;
1628 case Intrinsic::ceil:
1629 ISDs.push_back(ISD::FCEIL);
1630 break;
1631 case Intrinsic::trunc:
1632 ISDs.push_back(ISD::FTRUNC);
1633 break;
1634 case Intrinsic::nearbyint:
1635 ISDs.push_back(ISD::FNEARBYINT);
1636 break;
1637 case Intrinsic::rint:
1638 ISDs.push_back(ISD::FRINT);
1639 break;
1640 case Intrinsic::round:
1641 ISDs.push_back(ISD::FROUND);
1642 break;
1643 case Intrinsic::roundeven:
1644 ISDs.push_back(ISD::FROUNDEVEN);
1645 break;
1646 case Intrinsic::pow:
1647 ISDs.push_back(ISD::FPOW);
1648 break;
1649 case Intrinsic::fma:
1650 ISDs.push_back(ISD::FMA);
1651 break;
1652 case Intrinsic::fmuladd:
1653 ISDs.push_back(ISD::FMA);
1654 break;
1655 case Intrinsic::experimental_constrained_fmuladd:
1656 ISDs.push_back(ISD::STRICT_FMA);
1657 break;
1658 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1659 case Intrinsic::lifetime_start:
1660 case Intrinsic::lifetime_end:
1661 case Intrinsic::sideeffect:
1662 case Intrinsic::pseudoprobe:
1663 case Intrinsic::arithmetic_fence:
1664 return 0;
1665 case Intrinsic::masked_store: {
1666 Type *Ty = Tys[0];
1667 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1668 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1669 CostKind);
1670 }
1671 case Intrinsic::masked_load: {
1672 Type *Ty = RetTy;
1673 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1674 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1675 CostKind);
1676 }
1677 case Intrinsic::vector_reduce_add:
1678 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1679 None, CostKind);
1680 case Intrinsic::vector_reduce_mul:
1681 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1682 None, CostKind);
1683 case Intrinsic::vector_reduce_and:
1684 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1685 None, CostKind);
1686 case Intrinsic::vector_reduce_or:
1687 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1688 CostKind);
1689 case Intrinsic::vector_reduce_xor:
1690 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1691 None, CostKind);
1692 case Intrinsic::vector_reduce_fadd:
1693 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1694 FMF, CostKind);
1695 case Intrinsic::vector_reduce_fmul:
1696 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1697 FMF, CostKind);
1698 case Intrinsic::vector_reduce_smax:
1699 case Intrinsic::vector_reduce_smin:
1700 case Intrinsic::vector_reduce_fmax:
1701 case Intrinsic::vector_reduce_fmin:
1702 return thisT()->getMinMaxReductionCost(
1703 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1704 /*IsUnsigned=*/false, CostKind);
1705 case Intrinsic::vector_reduce_umax:
1706 case Intrinsic::vector_reduce_umin:
1707 return thisT()->getMinMaxReductionCost(
1708 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1709 /*IsUnsigned=*/true, CostKind);
1710 case Intrinsic::abs:
1711 case Intrinsic::smax:
1712 case Intrinsic::smin:
1713 case Intrinsic::umax:
1714 case Intrinsic::umin: {
1715 // abs(X) = select(icmp(X,0),X,sub(0,X))
1716 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1717 Type *CondTy = RetTy->getWithNewBitWidth(1);
1718 InstructionCost Cost = 0;
1719 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1720 Cost +=
1721 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1722 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1723 Cost +=
1724 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1725 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1726 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1727 if (IID == Intrinsic::abs)
1728 Cost += thisT()->getArithmeticInstrCost(
1729 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1730 return Cost;
1731 }
1732 case Intrinsic::sadd_sat:
1733 case Intrinsic::ssub_sat: {
1734 Type *CondTy = RetTy->getWithNewBitWidth(1);
1735
1736 Type *OpTy = StructType::create({RetTy, CondTy});
1737 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1738 ? Intrinsic::sadd_with_overflow
1739 : Intrinsic::ssub_with_overflow;
1740
1741 // SatMax -> Overflow && SumDiff < 0
1742 // SatMin -> Overflow && SumDiff >= 0
1743 InstructionCost Cost = 0;
1744 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1745 nullptr, ScalarizationCostPassed);
1746 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1747 Cost +=
1748 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1749 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1750 Cost += 2 * thisT()->getCmpSelInstrCost(
1751 BinaryOperator::Select, RetTy, CondTy,
1752 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1753 return Cost;
1754 }
1755 case Intrinsic::uadd_sat:
1756 case Intrinsic::usub_sat: {
1757 Type *CondTy = RetTy->getWithNewBitWidth(1);
1758
1759 Type *OpTy = StructType::create({RetTy, CondTy});
1760 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1761 ? Intrinsic::uadd_with_overflow
1762 : Intrinsic::usub_with_overflow;
1763
1764 InstructionCost Cost = 0;
1765 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1766 nullptr, ScalarizationCostPassed);
1767 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1768 Cost +=
1769 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1770 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1771 return Cost;
1772 }
1773 case Intrinsic::smul_fix:
1774 case Intrinsic::umul_fix: {
1775 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1776 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1777
1778 unsigned ExtOp =
1779 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1780 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1781
1782 InstructionCost Cost = 0;
1783 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1784 Cost +=
1785 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1786 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1787 CCH, CostKind);
1788 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1789 CostKind, TTI::OK_AnyValue,
1790 TTI::OK_UniformConstantValue);
1791 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1792 TTI::OK_AnyValue,
1793 TTI::OK_UniformConstantValue);
1794 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1795 return Cost;
1796 }
1797 case Intrinsic::sadd_with_overflow:
1798 case Intrinsic::ssub_with_overflow: {
1799 Type *SumTy = RetTy->getContainedType(0);
1800 Type *OverflowTy = RetTy->getContainedType(1);
1801 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1802 ? BinaryOperator::Add
1803 : BinaryOperator::Sub;
1804
1805 // LHSSign -> LHS >= 0
1806 // RHSSign -> RHS >= 0
1807 // SumSign -> Sum >= 0
1808 //
1809 // Add:
1810 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1811 // Sub:
1812 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1813 InstructionCost Cost = 0;
1814 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1815 Cost += 3 * thisT()->getCmpSelInstrCost(
1816 Instruction::ICmp, SumTy, OverflowTy,
1817 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1818 Cost += 2 * thisT()->getCmpSelInstrCost(
1819 Instruction::Select, OverflowTy, OverflowTy,
1820 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1821 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1822 CostKind);
1823 return Cost;
1824 }
1825 case Intrinsic::uadd_with_overflow:
1826 case Intrinsic::usub_with_overflow: {
1827 Type *SumTy = RetTy->getContainedType(0);
1828 Type *OverflowTy = RetTy->getContainedType(1);
1829 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1830 ? BinaryOperator::Add
1831 : BinaryOperator::Sub;
1832
1833 InstructionCost Cost = 0;
1834 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1835 Cost +=
1836 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1837 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1838 return Cost;
1839 }
1840 case Intrinsic::smul_with_overflow:
1841 case Intrinsic::umul_with_overflow: {
1842 Type *MulTy = RetTy->getContainedType(0);
1843 Type *OverflowTy = RetTy->getContainedType(1);
1844 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1845 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1846
1847 unsigned ExtOp =
1848 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1849 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1850
1851 InstructionCost Cost = 0;
1852 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1853 Cost +=
1854 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1855 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1856 CCH, CostKind);
1857 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1858 CostKind, TTI::OK_AnyValue,
1859 TTI::OK_UniformConstantValue);
1860
1861 if (IID == Intrinsic::smul_with_overflow)
1862 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1863 CostKind, TTI::OK_AnyValue,
1864 TTI::OK_UniformConstantValue);
1865
1866 Cost +=
1867 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1868 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1869 return Cost;
1870 }
1871 case Intrinsic::ctpop:
1872 ISDs.push_back(ISD::CTPOP);
1873 // In case of legalization use TCC_Expensive. This is cheaper than a
1874 // library call but still not a cheap instruction.
1875 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1876 break;
1877 case Intrinsic::ctlz:
1878 ISDs.push_back(ISD::CTLZ);
1879 break;
1880 case Intrinsic::cttz:
1881 ISDs.push_back(ISD::CTTZ);
1882 break;
1883 case Intrinsic::bswap:
1884 ISDs.push_back(ISD::BSWAP);
1885 break;
1886 case Intrinsic::bitreverse:
1887 ISDs.push_back(ISD::BITREVERSE);
1888 break;
1889 }
1890
1891 const TargetLoweringBase *TLI = getTLI();
1892 std::pair<InstructionCost, MVT> LT =
1893 TLI->getTypeLegalizationCost(DL, RetTy);
1894
1895 SmallVector<InstructionCost, 2> LegalCost;
1896 SmallVector<InstructionCost, 2> CustomCost;
1897 for (unsigned ISD : ISDs) {
1898 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1899 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1900 TLI->isFAbsFree(LT.second)) {
1901 return 0;
1902 }
1903
1904 // The operation is legal. Assume it costs 1.
1905 // If the type is split to multiple registers, assume that there is some
1906 // overhead to this.
1907 // TODO: Once we have extract/insert subvector cost we need to use them.
1908 if (LT.first > 1)
1909 LegalCost.push_back(LT.first * 2);
1910 else
1911 LegalCost.push_back(LT.first * 1);
1912 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1913 // If the operation is custom lowered then assume
1914 // that the code is twice as expensive.
1915 CustomCost.push_back(LT.first * 2);
1916 }
1917 }
1918
1919 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1920 if (MinLegalCostI != LegalCost.end())
1921 return *MinLegalCostI;
1922
1923 auto MinCustomCostI =
1924 std::min_element(CustomCost.begin(), CustomCost.end());
1925 if (MinCustomCostI != CustomCost.end())
1926 return *MinCustomCostI;
1927
1928 // If we can't lower fmuladd into an FMA, estimate the cost as a floating
1929 // point mul followed by an add.
1930 if (IID == Intrinsic::fmuladd)
1931 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1932 CostKind) +
1933 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1934 CostKind);
1935 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1936 IntrinsicCostAttributes FMulAttrs(
1937 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1938 IntrinsicCostAttributes FAddAttrs(
1939 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1940 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1941 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1942 }
1943
1944 // Else, assume that we need to scalarize this intrinsic. For math builtins
1945 // this will emit a costly libcall, adding call overhead and spills. Make it
1946 // very expensive.
1947 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1948 // Scalable vectors cannot be scalarized, so return Invalid.
1949 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1950 return isa<ScalableVectorType>(Ty);
1951 }))
1952 return InstructionCost::getInvalid();
1953
1954 InstructionCost ScalarizationCost =
1955 SkipScalarizationCost ? ScalarizationCostPassed
1956 : getScalarizationOverhead(RetVTy, true, false);
1957
1958 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1959 SmallVector<Type *, 4> ScalarTys;
1960 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1961 Type *Ty = Tys[i];
1962 if (Ty->isVectorTy())
1963 Ty = Ty->getScalarType();
1964 ScalarTys.push_back(Ty);
1965 }
1966 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1967 InstructionCost ScalarCost =
1968 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1969 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1970 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1971 if (!ICA.skipScalarizationCost())
1972 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1973 ScalarCalls = std::max(ScalarCalls,
1974 cast<FixedVectorType>(VTy)->getNumElements());
1975 }
1976 }
1977 return ScalarCalls * ScalarCost + ScalarizationCost;
1978 }
1979
1980 // This is going to be turned into a library call, so make it expensive.
1981 return SingleCallCost;
1982 }
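Several of the saturating and with-overflow cases above follow the same pattern: price the overflow intrinsic, then the compare/select fix-up. A standalone sketch of the signed-saturation case, with the component costs as hypothetical inputs:

// sadd.sat/ssub.sat above: one with-overflow intrinsic, one icmp, two selects
// (SatMax -> Overflow && SumDiff < 0, SatMin -> Overflow && SumDiff >= 0).
unsigned signedSaturationCost(unsigned OverflowIntrinsicCost,
                              unsigned ICmpCost, unsigned SelectCost) {
  return OverflowIntrinsicCost + ICmpCost + 2 * SelectCost;
}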
1983
1984 /// Compute a cost of the given call instruction.
1985 ///
1986 /// Compute the cost of calling function F with return type RetTy and
1987 /// argument types Tys. F might be nullptr; in this case the cost of an
1988 /// arbitrary call with the specified signature will be returned.
1989 /// This is used, for instance, when we estimate call of a vector
1990 /// counterpart of the given function.
1991 /// \param F Called function, might be nullptr.
1992 /// \param RetTy Return value types.
1993 /// \param Tys Argument types.
1994 /// \returns The cost of Call instruction.
1995 InstructionCost
1996 getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1997 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1998 return 10;
1999 }
2000
2001 unsigned getNumberOfParts(Type *Tp) {
2002 std::pair<InstructionCost, MVT> LT =
2003 getTLI()->getTypeLegalizationCost(DL, Tp);
2004 return *LT.first.getValue();
2005 }
2006
2007 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2008 const SCEV *) {
2009 return 0;
2010 }
2011
2012 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2013 /// We're assuming that reduction operations are performed in the following way:
2014 ///
2015 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2016 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2017 /// \----------------v-------------/ \----------v------------/
2018 /// n/2 elements n/2 elements
2019 /// %red1 = op <n x t> %val, <n x t> %val1
2020 /// After this operation we have a vector %red1 where only the first n/2
2021 /// elements are meaningful, the second n/2 elements are undefined and can be
2022 /// dropped. All other operations are actually working with the vector of
2023 /// length n/2, not n, though the real vector length is still n.
2024 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2025 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2026 /// \----------------v-------------/ \----------v------------/
2027 /// n/4 elements 3*n/4 elements
2028 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2029 /// length n/2, the resulting vector has length n/4 etc.
2030 ///
2031 /// The cost model should take into account that the actual length of the
2032 /// vector is reduced on each iteration.
2033 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2034 TTI::TargetCostKind CostKind) {
2035 Type *ScalarTy = Ty->getElementType();
2036 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2037 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2038 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2039 NumVecElts >= 2) {
2040 // Or reduction for i1 is represented as:
2041 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2042 // %res = cmp ne iReduxWidth %val, 0
2043 // And reduction for i1 is represented as:
2044 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2045 // %res = cmp eq iReduxWidth %val, 11111
2046 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2047 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2048 TTI::CastContextHint::None, CostKind) +
2049 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2050 CmpInst::makeCmpResultType(ValTy),
2051 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2052 }
2053 unsigned NumReduxLevels = Log2_32(NumVecElts);
2054 InstructionCost ArithCost = 0;
2055 InstructionCost ShuffleCost = 0;
2056 std::pair<InstructionCost, MVT> LT =
2057 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2058 unsigned LongVectorCount = 0;
2059 unsigned MVTLen =
2060 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2061 while (NumVecElts > MVTLen) {
2062 NumVecElts /= 2;
2063 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2064 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2065 NumVecElts, SubTy);
2066 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2067 Ty = SubTy;
2068 ++LongVectorCount;
2069 }
2070
2071 NumReduxLevels -= LongVectorCount;
2072
2073 // The minimal length of the vector is limited by the real length of vector
2074 // operations performed on the current platform. That's why several final
2075 // reduction operations are performed on the vectors with the same
2076 // architecture-dependent length.
2077
2078 // By default reductions need one shuffle per reduction level.
2079 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2080 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2081 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2082 return ShuffleCost + ArithCost +
2083 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2084 }
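The halving scheme described in the doc comment of getTreeReductionCost boils down to log2(n) levels, each paying one shuffle and one vector op, plus a final extract of lane 0 (the loop at the top only accounts for splitting vectors wider than a legal register). A minimal standalone sketch under unit-cost assumptions, not LLVM code:

#include <cstdint>

// Unit-cost sketch: one shuffle plus one arithmetic op per halving step, then a
// final extract of lane 0. NumElts is assumed to be a power of two.
int64_t treeReductionCost(unsigned NumElts, int64_t ShuffleCost = 1,
                          int64_t OpCost = 1, int64_t ExtractCost = 1) {
  int64_t Cost = 0;
  while (NumElts > 1) {
    Cost += ShuffleCost + OpCost; // shuffle the high half down, then combine
    NumElts /= 2;
  }
  return Cost + ExtractCost; // read the scalar result out of lane 0
}

// e.g. reducing <8 x i32> costs 3 * (1 + 1) + 1 = 7 under these assumptions.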
2085
2086 /// Try to calculate the cost of performing strict (in-order) reductions,
2087 /// which involves doing a sequence of floating point additions in lane
2088 /// order, starting with an initial value. For example, consider a scalar
2089 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2090 ///
2091 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2092 ///
2093 /// %add1 = %InitVal + %v0
2094 /// %add2 = %add1 + %v1
2095 /// %add3 = %add2 + %v2
2096 /// %add4 = %add3 + %v3
2097 ///
2098 /// As a simple estimate we can say the cost of such a reduction is 4 times
2099 /// the cost of a scalar FP addition. We can only estimate the costs for
2100 /// fixed-width vectors here because for scalable vectors we do not know the
2101 /// runtime number of operations.
2102 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2103 TTI::TargetCostKind CostKind) {
2104 // Targets must implement a default value for the scalable case, since
2105 // we don't know how many lanes the vector has.
2106 if (isa<ScalableVectorType>(Ty))
2107 return InstructionCost::getInvalid();
2108
2109 auto *VTy = cast<FixedVectorType>(Ty);
2110 InstructionCost ExtractCost =
2111 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2112 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2113 Opcode, VTy->getElementType(), CostKind);
2114 ArithCost *= VTy->getNumElements();
2115
2116 return ExtractCost + ArithCost;
2117 }
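Under the assumptions stated in the doc comment, the ordered (strict) reduction cost is simply one extract plus one scalar add per lane. A hedged standalone sketch with illustrative names and a unit extract cost:

#include <cstdint>

// Every lane is extracted and folded into the running sum in order, so the cost
// is the per-lane extract overhead plus NumElts scalar additions.
int64_t orderedReductionCost(unsigned NumElts, int64_t ScalarAddCost,
                             int64_t ExtractPerLane = 1) {
  return int64_t(NumElts) * ExtractPerLane + int64_t(NumElts) * ScalarAddCost;
}

// e.g. a <4 x float> fadd reduction with a scalar fadd cost of 2: 4 * 1 + 4 * 2 = 12.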
2118
2119 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2120 Optional<FastMathFlags> FMF,
2121 TTI::TargetCostKind CostKind) {
2122 if (TTI::requiresOrderedReduction(FMF))
2123 return getOrderedReductionCost(Opcode, Ty, CostKind);
2124 return getTreeReductionCost(Opcode, Ty, CostKind);
2125 }
2126
2127 /// Try to calculate op costs for min/max reduction operations.
2128 /// \param CondTy Conditional type for the Select instruction.
2129 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2130 bool IsUnsigned,
2131 TTI::TargetCostKind CostKind) {
2132 Type *ScalarTy = Ty->getElementType();
2133 Type *ScalarCondTy = CondTy->getElementType();
2134 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2135 unsigned NumReduxLevels = Log2_32(NumVecElts);
2136 unsigned CmpOpcode;
2137 if (Ty->isFPOrFPVectorTy()) {
2138 CmpOpcode = Instruction::FCmp;
2139 } else {
2140 assert(Ty->isIntOrIntVectorTy() &&(static_cast<void> (0))
2141 "expecting floating point or integer type for min/max reduction")(static_cast<void> (0));
2142 CmpOpcode = Instruction::ICmp;
2143 }
2144 InstructionCost MinMaxCost = 0;
2145 InstructionCost ShuffleCost = 0;
2146 std::pair<InstructionCost, MVT> LT =
2147 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2148 unsigned LongVectorCount = 0;
2149 unsigned MVTLen =
2150 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2151 while (NumVecElts > MVTLen) {
2152 NumVecElts /= 2;
2153 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2154 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2155
2156 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2157 NumVecElts, SubTy);
2158 MinMaxCost +=
2159 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2160 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2161 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2162 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2163 Ty = SubTy;
2164 ++LongVectorCount;
2165 }
2166
2167 NumReduxLevels -= LongVectorCount;
2168
2169 // The minimal length of the vector is limited by the real length of vector
2170 // operations performed on the current platform. That's why several final
2171 // reduction operations are performed on the vectors with the same
2172 // architecture-dependent length.
2173 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2174 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2175 MinMaxCost +=
2176 NumReduxLevels *
2177 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2178 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2179 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2180 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2181 // The last min/max should be in vector registers and we counted it above.
2182 // So just need a single extractelement.
2183 return ShuffleCost + MinMaxCost +
2184 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2185 }
2186
2187 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2188 Type *ResTy, VectorType *Ty,
2189 TTI::TargetCostKind CostKind) {
2190 // Without any native support, this is equivalent to the cost of
2191 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2192 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2193 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2194 Instruction::Add, ExtTy, None, CostKind);
2195 InstructionCost MulCost = 0;
2196 InstructionCost ExtCost = thisT()->getCastInstrCost(
2197 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2198 TTI::CastContextHint::None, CostKind);
2199 if (IsMLA) {
2200 MulCost =
2201 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2202 ExtCost *= 2;
2203 }
2204
2205 return RedCost + MulCost + ExtCost;
2206 }
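A hedged sketch of the decomposition used above: without native support, the pattern is priced as its components, i.e. the extend(s), an optional multiply for the MLA form, and an ordinary add reduction over the extended type. Names below are illustrative, not LLVM API:

#include <cstdint>

// Without native support: vecreduce.add(ext) or, for the MLA form,
// vecreduce.add(mul(ext, ext)), priced as the sum of the parts.
int64_t extendedAddReductionCost(bool IsMLA, int64_t ExtCost, int64_t MulCost,
                                 int64_t AddReductionCost) {
  int64_t Cost = AddReductionCost;
  Cost += IsMLA ? 2 * ExtCost + MulCost // two extends feeding a multiply
                : ExtCost;              // a single extend
  return Cost;
}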
2207
2208 InstructionCost getVectorSplitCost() { return 1; }
2209
2210 /// @}
2211};
2212
2213/// Concrete BasicTTIImpl that can be used if no further customization
2214/// is needed.
2215class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2216 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2217
2218 friend class BasicTTIImplBase<BasicTTIImpl>;
2219
2220 const TargetSubtargetInfo *ST;
2221 const TargetLoweringBase *TLI;
2222
2223 const TargetSubtargetInfo *getST() const { return ST; }
2224 const TargetLoweringBase *getTLI() const { return TLI; }
2225
2226public:
2227 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2228};
2229
2230} // end namespace llvm
2231
2232#endif // LLVM_CODEGEN_BASICTTIIMPL_H

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/InstructionCost.h

1//===- InstructionCost.h ----------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file defines an InstructionCost class that is used when calculating
10/// the cost of an instruction, or a group of instructions. In addition to a
11/// numeric value representing the cost the class also contains a state that
12/// can be used to encode particular properties, such as a cost being invalid.
13/// Operations on InstructionCost implement saturation arithmetic, so that
14/// accumulating costs on large cost-values don't overflow.
15///
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_SUPPORT_INSTRUCTIONCOST_H
19#define LLVM_SUPPORT_INSTRUCTIONCOST_H
20
21#include "llvm/ADT/Optional.h"
22#include "llvm/Support/MathExtras.h"
23#include <limits>
24
25namespace llvm {
26
27class raw_ostream;
28
29class InstructionCost {
30public:
31 using CostType = int64_t;
32
33 /// CostState describes the state of a cost.
34 enum CostState {
35 Valid, /// < The cost value represents a valid cost, even when the
36 /// cost-value is large.
37 Invalid /// < Invalid indicates there is no way to represent the cost as a
38 /// numeric value. This state exists to represent a possible issue,
39 /// e.g. if the cost-model knows the operation cannot be expanded
40 /// into a valid code-sequence by the code-generator. While some
41 /// passes may assert that the calculated cost must be valid, it is
42 /// up to individual passes how to interpret an Invalid cost. For
43 /// example, a transformation pass could choose not to perform a
44 /// transformation if the resulting cost would end up Invalid.
45 /// Because some passes may assert a cost is Valid, it is not
46 /// recommended to use Invalid costs to model 'Unknown'.
47 /// Note that Invalid is semantically different from a (very) high,
48 /// but valid cost, which intentionally indicates no issue, but
49 /// rather a strong preference not to select a certain operation.
50 };
51
52private:
53 CostType Value = 0;
54 CostState State = Valid;
55
56 void propagateState(const InstructionCost &RHS) {
57 if (RHS.State == Invalid)
58 State = Invalid;
59 }
60
61 static CostType getMaxValue() { return std::numeric_limits<CostType>::max(); }
62 static CostType getMinValue() { return std::numeric_limits<CostType>::min(); }
63
64public:
65 // A default constructed InstructionCost is a valid zero cost
66 InstructionCost() = default;
67
68 InstructionCost(CostState) = delete;
69 InstructionCost(CostType Val) : Value(Val), State(Valid) {}
70
71 static InstructionCost getMax() { return getMaxValue(); }
72 static InstructionCost getMin() { return getMinValue(); }
73 static InstructionCost getInvalid(CostType Val = 0) {
74 InstructionCost Tmp(Val);
75 Tmp.setInvalid();
76 return Tmp;
77 }
78
79 bool isValid() const { return State == Valid; }
80 void setValid() { State = Valid; }
81 void setInvalid() { State = Invalid; }
82 CostState getState() const { return State; }
83
84 /// This function is intended to be used as sparingly as possible, since the
85 /// class provides the full range of operator support required for arithmetic
86 /// and comparisons.
87 Optional<CostType> getValue() const {
88 if (isValid())
89 return Value;
90 return None;
91 }
92
93 /// For all of the arithmetic operators provided here any invalid state is
94 /// perpetuated and cannot be removed. Once a cost becomes invalid it stays
95 /// invalid, and it also inherits any invalid state from the RHS.
96 /// Arithmetic work on the actual values is implemented with saturation,
97 /// to avoid overflow when using more extreme cost values.
98
99 InstructionCost &operator+=(const InstructionCost &RHS) {
100 propagateState(RHS);
101
102 // Saturating addition.
103 InstructionCost::CostType Result;
104 if (AddOverflow(Value, RHS.Value, Result))
105 Result = RHS.Value > 0 ? getMaxValue() : getMinValue();
106
107 Value = Result;
108 return *this;
109 }
110
111 InstructionCost &operator+=(const CostType RHS) {
112 InstructionCost RHS2(RHS);
113 *this += RHS2;
114 return *this;
115 }
116
117 InstructionCost &operator-=(const InstructionCost &RHS) {
118 propagateState(RHS);
119
120 // Saturating subtract.
121 InstructionCost::CostType Result;
122 if (SubOverflow(Value, RHS.Value, Result))
123 Result = RHS.Value > 0 ? getMinValue() : getMaxValue();
124 Value = Result;
125 return *this;
126 }
127
128 InstructionCost &operator-=(const CostType RHS) {
129 InstructionCost RHS2(RHS);
130 *this -= RHS2;
131 return *this;
132 }
133
134 InstructionCost &operator*=(const InstructionCost &RHS) {
135 propagateState(RHS);
136
137 // Saturating multiply.
138 InstructionCost::CostType Result;
139 if (MulOverflow(Value, RHS.Value, Result)) {
140 if ((Value > 0 && RHS.Value > 0) || (Value < 0 && RHS.Value < 0))
141 Result = getMaxValue();
142 else
143 Result = getMinValue();
144 }
145
146 Value = Result;
147 return *this;
148 }
149
150 InstructionCost &operator*=(const CostType RHS) {
151 InstructionCost RHS2(RHS);
152 *this *= RHS2;
153 return *this;
154 }
155
156 InstructionCost &operator/=(const InstructionCost &RHS) {
157 propagateState(RHS);
158 Value /= RHS.Value;
159 return *this;
160 }
161
162 InstructionCost &operator/=(const CostType RHS) {
163 InstructionCost RHS2(RHS);
164 *this /= RHS2;
165 return *this;
166 }
167
168 InstructionCost &operator++() {
169 *this += 1;
170 return *this;
171 }
172
173 InstructionCost operator++(int) {
174 InstructionCost Copy = *this;
175 ++*this;
176 return Copy;
177 }
178
179 InstructionCost &operator--() {
180 *this -= 1;
181 return *this;
182 }
183
184 InstructionCost operator--(int) {
185 InstructionCost Copy = *this;
186 --*this;
187 return Copy;
188 }
189
190 /// For the comparison operators we have chosen to use lexicographical
191 /// ordering where valid costs are always considered to be less than invalid
192 /// costs. This avoids having to add asserts to the comparison operators that
193 /// the states are valid and users can test for validity of the cost
194 /// explicitly.
195 bool operator<(const InstructionCost &RHS) const {
196 if (State != RHS.State)
197 return State < RHS.State;
198 return Value < RHS.Value;
199 }
200
201 // Implement in terms of operator< to ensure that the two comparisons stay in
202 // sync
203 bool operator==(const InstructionCost &RHS) const {
204 return !(*this < RHS) && !(RHS < *this);
5
Returning zero, which participates in a condition later
205 }
206
207 bool operator!=(const InstructionCost &RHS) const { return !(*this == RHS); }
208
209 bool operator==(const CostType RHS) const {
210 InstructionCost RHS2(RHS);
211 return *this == RHS2;
4
Calling 'InstructionCost::operator=='
6
Returning from 'InstructionCost::operator=='
7
Returning zero, which participates in a condition later
212 }
213
214 bool operator!=(const CostType RHS) const { return !(*this == RHS); }
215
216 bool operator>(const InstructionCost &RHS) const { return RHS < *this; }
217
218 bool operator<=(const InstructionCost &RHS) const { return !(RHS < *this); }
219
220 bool operator>=(const InstructionCost &RHS) const { return !(*this < RHS); }
221
222 bool operator<(const CostType RHS) const {
223 InstructionCost RHS2(RHS);
224 return *this < RHS2;
225 }
226
227 bool operator>(const CostType RHS) const {
228 InstructionCost RHS2(RHS);
229 return *this > RHS2;
230 }
231
232 bool operator<=(const CostType RHS) const {
233 InstructionCost RHS2(RHS);
234 return *this <= RHS2;
235 }
236
237 bool operator>=(const CostType RHS) const {
238 InstructionCost RHS2(RHS);
239 return *this >= RHS2;
240 }
241
242 void print(raw_ostream &OS) const;
243
244 template <class Function>
245 auto map(const Function &F) const -> InstructionCost {
246 if (isValid())
247 return F(*getValue());
248 return getInvalid();
249 }
250};
251
252inline InstructionCost operator+(const InstructionCost &LHS,
253 const InstructionCost &RHS) {
254 InstructionCost LHS2(LHS);
255 LHS2 += RHS;
256 return LHS2;
257}
258
259inline InstructionCost operator-(const InstructionCost &LHS,
260 const InstructionCost &RHS) {
261 InstructionCost LHS2(LHS);
262 LHS2 -= RHS;
263 return LHS2;
264}
265
266inline InstructionCost operator*(const InstructionCost &LHS,
267 const InstructionCost &RHS) {
268 InstructionCost LHS2(LHS);
269 LHS2 *= RHS;
270 return LHS2;
271}
272
273inline InstructionCost operator/(const InstructionCost &LHS,
274 const InstructionCost &RHS) {
275 InstructionCost LHS2(LHS);
276 LHS2 /= RHS;
277 return LHS2;
278}
279
280inline raw_ostream &operator<<(raw_ostream &OS, const InstructionCost &V) {
281 V.print(OS);
282 return OS;
283}
284
285} // namespace llvm
286
287#endif
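The two behavioural guarantees documented in this header, saturating arithmetic and a sticky Invalid state, can be summarised by the small standalone sketch below. It deliberately avoids LLVM types; TinyCost and its field names are assumptions made only for illustration.

#include <cstdint>
#include <limits>

// TinyCost is an illustrative stand-in, not llvm::InstructionCost.
struct TinyCost {
  int64_t Value = 0;
  bool Valid = true;

  TinyCost &operator+=(const TinyCost &RHS) {
    Valid = Valid && RHS.Valid; // an Invalid operand makes the result Invalid
    if (RHS.Value > 0 &&
        Value > std::numeric_limits<int64_t>::max() - RHS.Value)
      Value = std::numeric_limits<int64_t>::max(); // saturate instead of wrapping
    else if (RHS.Value < 0 &&
             Value < std::numeric_limits<int64_t>::min() - RHS.Value)
      Value = std::numeric_limits<int64_t>::min();
    else
      Value += RHS.Value;
    return *this;
  }
};

// e.g. adding 1 to a cost already at the int64_t maximum keeps it at the maximum,
// and adding anything to an invalid cost leaves the result invalid.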

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Analysis/TargetTransformInfo.h

1//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This pass exposes codegen information to IR-level passes. Every
10/// transformation that uses codegen information is broken into three parts:
11/// 1. The IR-level analysis pass.
12/// 2. The IR-level transformation interface which provides the needed
13/// information.
14/// 3. Codegen-level implementation which uses target-specific hooks.
15///
16/// This file defines #2, which is the interface that IR-level transformations
17/// use for querying the codegen.
18///
19//===----------------------------------------------------------------------===//
20
21#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
22#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23
24#include "llvm/IR/InstrTypes.h"
25#include "llvm/IR/Operator.h"
26#include "llvm/IR/PassManager.h"
27#include "llvm/Pass.h"
28#include "llvm/Support/AtomicOrdering.h"
29#include "llvm/Support/BranchProbability.h"
30#include "llvm/Support/DataTypes.h"
31#include "llvm/Support/InstructionCost.h"
32#include <functional>
33
34namespace llvm {
35
36namespace Intrinsic {
37typedef unsigned ID;
38}
39
40class AssumptionCache;
41class BlockFrequencyInfo;
42class DominatorTree;
43class BranchInst;
44class CallBase;
45class ExtractElementInst;
46class Function;
47class GlobalValue;
48class InstCombiner;
49class OptimizationRemarkEmitter;
50class IntrinsicInst;
51class LoadInst;
52class LoopAccessInfo;
53class Loop;
54class LoopInfo;
55class ProfileSummaryInfo;
56class RecurrenceDescriptor;
57class SCEV;
58class ScalarEvolution;
59class StoreInst;
60class SwitchInst;
61class TargetLibraryInfo;
62class Type;
63class User;
64class Value;
65class VPIntrinsic;
66struct KnownBits;
67template <typename T> class Optional;
68
69/// Information about a load/store intrinsic defined by the target.
70struct MemIntrinsicInfo {
71 /// This is the pointer that the intrinsic is loading from or storing to.
72 /// If this is non-null, then analysis/optimization passes can assume that
73 /// this intrinsic is functionally equivalent to a load/store from this
74 /// pointer.
75 Value *PtrVal = nullptr;
76
77 // Ordering for atomic operations.
78 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
79
80 // Same Id is set by the target for corresponding load/store intrinsics.
81 unsigned short MatchingId = 0;
82
83 bool ReadMem = false;
84 bool WriteMem = false;
85 bool IsVolatile = false;
86
87 bool isUnordered() const {
88 return (Ordering == AtomicOrdering::NotAtomic ||
89 Ordering == AtomicOrdering::Unordered) &&
90 !IsVolatile;
91 }
92};
93
94/// Attributes of a target dependent hardware loop.
95struct HardwareLoopInfo {
96 HardwareLoopInfo() = delete;
97 HardwareLoopInfo(Loop *L) : L(L) {}
98 Loop *L = nullptr;
99 BasicBlock *ExitBlock = nullptr;
100 BranchInst *ExitBranch = nullptr;
101 const SCEV *ExitCount = nullptr;
102 IntegerType *CountType = nullptr;
103 Value *LoopDecrement = nullptr; // Decrement the loop counter by this
104 // value in every iteration.
105 bool IsNestingLegal = false; // Can a hardware loop be a parent to
106 // another hardware loop?
107 bool CounterInReg = false; // Should loop counter be updated in
108 // the loop via a phi?
109 bool PerformEntryTest = false; // Generate the intrinsic which also performs
110 // icmp ne zero on the loop counter value and
111 // produces an i1 to guard the loop entry.
112 bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
113 DominatorTree &DT, bool ForceNestedLoop = false,
114 bool ForceHardwareLoopPHI = false);
115 bool canAnalyze(LoopInfo &LI);
116};
117
118class IntrinsicCostAttributes {
119 const IntrinsicInst *II = nullptr;
120 Type *RetTy = nullptr;
121 Intrinsic::ID IID;
122 SmallVector<Type *, 4> ParamTys;
123 SmallVector<const Value *, 4> Arguments;
124 FastMathFlags FMF;
125 // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
126 // arguments and the return value will be computed based on types.
127 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
128
129public:
130 IntrinsicCostAttributes(
131 Intrinsic::ID Id, const CallBase &CI,
132 InstructionCost ScalarCost = InstructionCost::getInvalid());
133
134 IntrinsicCostAttributes(
135 Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
136 FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
137 InstructionCost ScalarCost = InstructionCost::getInvalid());
138
139 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
140 ArrayRef<const Value *> Args);
141
142 IntrinsicCostAttributes(
143 Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
144 ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
145 const IntrinsicInst *I = nullptr,
146 InstructionCost ScalarCost = InstructionCost::getInvalid());
147
148 Intrinsic::ID getID() const { return IID; }
149 const IntrinsicInst *getInst() const { return II; }
150 Type *getReturnType() const { return RetTy; }
151 FastMathFlags getFlags() const { return FMF; }
152 InstructionCost getScalarizationCost() const { return ScalarizationCost; }
153 const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
154 const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
155
156 bool isTypeBasedOnly() const {
157 return Arguments.empty();
13
Calling 'SmallVectorBase::empty'
16
Returning from 'SmallVectorBase::empty'
17
Returning zero, which participates in a condition later
158 }
159
160 bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
161};
162
163class TargetTransformInfo;
164typedef TargetTransformInfo TTI;
165
166/// This pass provides access to the codegen interfaces that are needed
167/// for IR-level transformations.
168class TargetTransformInfo {
169public:
170 /// Construct a TTI object using a type implementing the \c Concept
171 /// API below.
172 ///
173 /// This is used by targets to construct a TTI wrapping their target-specific
174 /// implementation that encodes appropriate costs for their target.
175 template <typename T> TargetTransformInfo(T Impl);
176
177 /// Construct a baseline TTI object using a minimal implementation of
178 /// the \c Concept API below.
179 ///
180 /// The TTI implementation will reflect the information in the DataLayout
181 /// provided if non-null.
182 explicit TargetTransformInfo(const DataLayout &DL);
183
184 // Provide move semantics.
185 TargetTransformInfo(TargetTransformInfo &&Arg);
186 TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
187
188 // We need to define the destructor out-of-line to define our sub-classes
189 // out-of-line.
190 ~TargetTransformInfo();
191
192 /// Handle the invalidation of this information.
193 ///
194 /// When used as a result of \c TargetIRAnalysis this method will be called
195 /// when the function this was computed for changes. When it returns false,
196 /// the information is preserved across those changes.
197 bool invalidate(Function &, const PreservedAnalyses &,
198 FunctionAnalysisManager::Invalidator &) {
199 // FIXME: We should probably in some way ensure that the subtarget
200 // information for a function hasn't changed.
201 return false;
202 }
203
204 /// \name Generic Target Information
205 /// @{
206
207 /// The kind of cost model.
208 ///
209 /// There are several different cost models that can be customized by the
210 /// target. The normalization of each cost model may be target specific.
211 enum TargetCostKind {
212 TCK_RecipThroughput, ///< Reciprocal throughput.
213 TCK_Latency, ///< The latency of instruction.
214 TCK_CodeSize, ///< Instruction code size.
215 TCK_SizeAndLatency ///< The weighted sum of size and latency.
216 };
217
218 /// Query the cost of a specified instruction.
219 ///
220 /// Clients should use this interface to query the cost of an existing
221 /// instruction. The instruction must have a valid parent (basic block).
222 ///
223 /// Note, this method does not cache the cost calculation and it
224 /// can be expensive in some cases.
225 InstructionCost getInstructionCost(const Instruction *I,
226 enum TargetCostKind kind) const {
227 InstructionCost Cost;
228 switch (kind) {
229 case TCK_RecipThroughput:
230 Cost = getInstructionThroughput(I);
231 break;
232 case TCK_Latency:
233 Cost = getInstructionLatency(I);
234 break;
235 case TCK_CodeSize:
236 case TCK_SizeAndLatency:
237 Cost = getUserCost(I, kind);
238 break;
239 }
240 return Cost;
241 }
242
243 /// Underlying constants for 'cost' values in this interface.
244 ///
245 /// Many APIs in this interface return a cost. This enum defines the
246 /// fundamental values that should be used to interpret (and produce) those
247 /// costs. The costs are returned as an int rather than a member of this
248 /// enumeration because it is expected that the cost of one IR instruction
249 /// may have a multiplicative factor to it or otherwise won't fit directly
250 /// into the enum. Moreover, it is common to sum or average costs which works
251 /// better as simple integral values. Thus this enum only provides constants.
252 /// Also note that the returned costs are signed integers to make it natural
253 /// to add, subtract, and test with zero (a common boundary condition). It is
254 /// not expected that 2^32 is a realistic cost to be modeling at any point.
255 ///
256 /// Note that these costs should usually reflect the intersection of code-size
257 /// cost and execution cost. A free instruction is typically one that folds
258 /// into another instruction. For example, reg-to-reg moves can often be
259 /// skipped by renaming the registers in the CPU, but they still are encoded
260 /// and thus wouldn't be considered 'free' here.
261 enum TargetCostConstants {
262 TCC_Free = 0, ///< Expected to fold away in lowering.
263 TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
264 TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
265 };
266
267 /// Estimate the cost of a GEP operation when lowered.
268 InstructionCost
269 getGEPCost(Type *PointeeType, const Value *Ptr,
270 ArrayRef<const Value *> Operands,
271 TargetCostKind CostKind = TCK_SizeAndLatency) const;
272
273 /// \returns A value by which our inlining threshold should be multiplied.
274 /// This is primarily used to bump up the inlining threshold wholesale on
275 /// targets where calls are unusually expensive.
276 ///
277 /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
278 /// individual classes of instructions would be better.
279 unsigned getInliningThresholdMultiplier() const;
280
281 /// \returns A value to be added to the inlining threshold.
282 unsigned adjustInliningThreshold(const CallBase *CB) const;
283
284 /// \returns Vector bonus in percent.
285 ///
286 /// Vector bonuses: We want to more aggressively inline vector-dense kernels
287 /// and apply this bonus based on the percentage of vector instructions. A
288 /// bonus is applied if the vector instructions exceed 50% and half that
289 /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
290 /// arbitrary and evolved over time by accident as much as because they are
291 /// principled bonuses.
292 /// FIXME: It would be nice to base the bonus values on something more
293 /// scientific. A target may have no bonus on vector instructions.
294 int getInlinerVectorBonusPercent() const;
295
296 /// \return the expected cost of a memcpy, which could e.g. depend on the
297 /// source/destination type and alignment and the number of bytes copied.
298 InstructionCost getMemcpyCost(const Instruction *I) const;
299
300 /// \return The estimated number of case clusters when lowering \p 'SI'.
301 /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
302 /// table.
303 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
304 unsigned &JTSize,
305 ProfileSummaryInfo *PSI,
306 BlockFrequencyInfo *BFI) const;
307
308 /// Estimate the cost of a given IR user when lowered.
309 ///
310 /// This can estimate the cost of either a ConstantExpr or Instruction when
311 /// lowered.
312 ///
313 /// \p Operands is a list of operands which can be a result of transformations
314 /// of the current operands. The number of the operands on the list must equal
315 /// to the number of the current operands the IR user has. Their order on the
316 /// list must be the same as the order of the current operands the IR user
317 /// has.
318 ///
319 /// The returned cost is defined in terms of \c TargetCostConstants, see its
320 /// comments for a detailed explanation of the cost values.
321 InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
322 TargetCostKind CostKind) const;
323
324 /// This is a helper function which calls the two-argument getUserCost
325 /// with \p Operands which are the current operands U has.
326 InstructionCost getUserCost(const User *U, TargetCostKind CostKind) const {
327 SmallVector<const Value *, 4> Operands(U->operand_values());
328 return getUserCost(U, Operands, CostKind);
329 }
330
331 /// If a branch or a select condition is skewed in one direction by more than
332 /// this factor, it is very likely to be predicted correctly.
333 BranchProbability getPredictableBranchThreshold() const;
334
335 /// Return true if branch divergence exists.
336 ///
337 /// Branch divergence has a significantly negative impact on GPU performance
338 /// when threads in the same wavefront take different paths due to conditional
339 /// branches.
340 bool hasBranchDivergence() const;
341
342 /// Return true if the target prefers to use GPU divergence analysis to
343 /// replace the legacy version.
344 bool useGPUDivergenceAnalysis() const;
345
346 /// Returns whether V is a source of divergence.
347 ///
348 /// This function provides the target-dependent information for
349 /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
350 /// first builds the dependency graph, and then runs the reachability
351 /// algorithm starting with the sources of divergence.
352 bool isSourceOfDivergence(const Value *V) const;
353
354 // Returns true for the target specific
355 // set of operations which produce uniform result
356 // even taking non-uniform arguments
357 bool isAlwaysUniform(const Value *V) const;
358
359 /// Returns the address space ID for a target's 'flat' address space. Note
360 /// this is not necessarily the same as addrspace(0), which LLVM sometimes
361 /// refers to as the generic address space. The flat address space is a
362 /// generic address space that can be used to access multiple segments of memory
363 /// with different address spaces. Access of a memory location through a
364 /// pointer with this address space is expected to be legal but slower
365 /// compared to the same memory location accessed through a pointer with a
366 /// different address space.
367 //
368 /// This is for targets with different pointer representations which can
369 /// be converted with the addrspacecast instruction. If a pointer is converted
370 /// to this address space, optimizations should attempt to replace the access
371 /// with the source address space.
372 ///
373 /// \returns ~0u if the target does not have such a flat address space to
374 /// optimize away.
375 unsigned getFlatAddressSpace() const;
376
377 /// Return any intrinsic address operand indexes which may be rewritten if
378 /// they use a flat address space pointer.
379 ///
380 /// \returns true if the intrinsic was handled.
381 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
382 Intrinsic::ID IID) const;
383
384 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
385
386 unsigned getAssumedAddrSpace(const Value *V) const;
387
388 /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
389 /// NewV, which has a different address space. This should happen for every
390 /// operand index that collectFlatAddressOperands returned for the intrinsic.
391 /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
392 /// new value (which may be the original \p II with modified operands).
393 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
394 Value *NewV) const;
395
396 /// Test whether calls to a function lower to actual program function
397 /// calls.
398 ///
399 /// The idea is to test whether the program is likely to require a 'call'
400 /// instruction or equivalent in order to call the given function.
401 ///
402 /// FIXME: It's not clear that this is a good or useful query API. Client's
403 /// should probably move to simpler cost metrics using the above.
404 /// Alternatively, we could split the cost interface into distinct code-size
405 /// and execution-speed costs. This would allow modelling the core of this
406 /// query more accurately as a call is a single small instruction, but
407 /// incurs significant execution cost.
408 bool isLoweredToCall(const Function *F) const;
409
410 struct LSRCost {
411 /// TODO: Some of these could be merged. Also, a lexical ordering
412 /// isn't always optimal.
413 unsigned Insns;
414 unsigned NumRegs;
415 unsigned AddRecCost;
416 unsigned NumIVMuls;
417 unsigned NumBaseAdds;
418 unsigned ImmCost;
419 unsigned SetupCost;
420 unsigned ScaleCost;
421 };
422
423 /// Parameters that control the generic loop unrolling transformation.
424 struct UnrollingPreferences {
425 /// The cost threshold for the unrolled loop. Should be relative to the
426 /// getUserCost values returned by this API, and the expectation is that
427 /// the unrolled loop's instructions when run through that interface should
428 /// not exceed this cost. However, this is only an estimate. Also, specific
429 /// loops may be unrolled even with a cost above this threshold if deemed
430 /// profitable. Set this to UINT_MAX to disable the loop body cost
431 /// restriction.
432 unsigned Threshold;
433 /// If complete unrolling will reduce the cost of the loop, we will boost
434 /// the Threshold by a certain percent to allow more aggressive complete
435 /// unrolling. This value provides the maximum boost percentage that we
436 /// can apply to Threshold (The value should be no less than 100).
437 /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
438 /// MaxPercentThresholdBoost / 100)
439 /// E.g. if complete unrolling reduces the loop execution time by 50%
440 /// then we boost the threshold by the factor of 2x. If unrolling is not
441 /// expected to reduce the running time, then we do not increase the
442 /// threshold.
443 unsigned MaxPercentThresholdBoost;
444 /// The cost threshold for the unrolled loop when optimizing for size (set
445 /// to UINT_MAX to disable).
446 unsigned OptSizeThreshold;
447 /// The cost threshold for the unrolled loop, like Threshold, but used
448 /// for partial/runtime unrolling (set to UINT_MAX to disable).
449 unsigned PartialThreshold;
450 /// The cost threshold for the unrolled loop when optimizing for size, like
451 /// OptSizeThreshold, but used for partial/runtime unrolling (set to
452 /// UINT_MAX to disable).
453 unsigned PartialOptSizeThreshold;
454 /// A forced unrolling factor (the number of concatenated bodies of the
455 /// original loop in the unrolled loop body). When set to 0, the unrolling
456 /// transformation will select an unrolling factor based on the current cost
457 /// threshold and other factors.
458 unsigned Count;
459 /// Default unroll count for loops with run-time trip count.
460 unsigned DefaultUnrollRuntimeCount;
461 // Set the maximum unrolling factor. The unrolling factor may be selected
462 // using the appropriate cost threshold, but may not exceed this number
463 // (set to UINT_MAX to disable). This does not apply in cases where the
464 // loop is being fully unrolled.
465 unsigned MaxCount;
466 /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
467 /// applies even if full unrolling is selected. This allows a target to fall
468 /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
469 unsigned FullUnrollMaxCount;
470 // Represents number of instructions optimized when "back edge"
471 // becomes "fall through" in unrolled loop.
472 // For now we count a conditional branch on a backedge and a comparison
473 // feeding it.
474 unsigned BEInsns;
475 /// Allow partial unrolling (unrolling of loops to expand the size of the
476 /// loop body, not only to eliminate small constant-trip-count loops).
477 bool Partial;
478 /// Allow runtime unrolling (unrolling of loops to expand the size of the
479 /// loop body even when the number of loop iterations is not known at
480 /// compile time).
481 bool Runtime;
482 /// Allow generation of a loop remainder (extra iterations after unroll).
483 bool AllowRemainder;
484 /// Allow emitting expensive instructions (such as divisions) when computing
485 /// the trip count of a loop for runtime unrolling.
486 bool AllowExpensiveTripCount;
487 /// Apply loop unroll on any kind of loop
488 /// (mainly to loops that fail runtime unrolling).
489 bool Force;
490 /// Allow using trip count upper bound to unroll loops.
491 bool UpperBound;
492 /// Allow unrolling of all the iterations of the runtime loop remainder.
493 bool UnrollRemainder;
494 /// Allow unroll and jam. Used to enable unroll and jam for the target.
495 bool UnrollAndJam;
496 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
497 /// value above is used during unroll and jam for the outer loop size.
498 /// This value is used in the same manner to limit the size of the inner
499 /// loop.
500 unsigned UnrollAndJamInnerLoopThreshold;
501 /// Don't allow loop unrolling to simulate more than this number of
502 /// iterations when checking full unroll profitability
503 unsigned MaxIterationsCountToAnalyze;
504 };
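A hedged worked example of the MaxPercentThresholdBoost formula quoted in the struct above (the helper name and the integer-percent representation are assumptions; UnrolledCost is assumed non-zero):

#include <algorithm>
#include <cstdint>

// Integer-percent version of:
//   BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
//                                      MaxPercentThresholdBoost / 100)
uint64_t boostedUnrollThreshold(uint64_t Threshold, uint64_t RolledCost,
                                uint64_t UnrolledCost,
                                unsigned MaxPercentThresholdBoost) {
  uint64_t RatioPercent = (RolledCost * 100) / UnrolledCost; // UnrolledCost != 0
  uint64_t BoostPercent =
      std::min<uint64_t>(RatioPercent, MaxPercentThresholdBoost);
  return Threshold * BoostPercent / 100;
}

// e.g. Threshold = 300, RolledCost = 200, UnrolledCost = 100 and a 400% cap give
// min(200%, 400%) = 200%, i.e. a boosted threshold of 600.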
505
506 /// Get target-customized preferences for the generic loop unrolling
507 /// transformation. The caller will initialize UP with the current
508 /// target-independent defaults.
509 void getUnrollingPreferences(Loop *L, ScalarEvolution &,
510 UnrollingPreferences &UP,
511 OptimizationRemarkEmitter *ORE) const;
512
513 /// Query the target whether it would be profitable to convert the given loop
514 /// into a hardware loop.
515 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
516 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
517 HardwareLoopInfo &HWLoopInfo) const;
518
519 /// Query the target whether it would be preferred to create a predicated
520 /// vector loop, which can avoid the need to emit a scalar epilogue loop.
521 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
522 AssumptionCache &AC, TargetLibraryInfo *TLI,
523 DominatorTree *DT,
524 const LoopAccessInfo *LAI) const;
525
526 /// Query the target whether lowering of the llvm.get.active.lane.mask
527 /// intrinsic is supported.
528 bool emitGetActiveLaneMask() const;
529
530 // Parameters that control the loop peeling transformation
531 struct PeelingPreferences {
532 /// A forced peeling factor (the number of bodies of the original loop
533 /// that should be peeled off before the loop body). When set to 0,
534 /// a peeling factor is chosen based on profile information and other factors.
535 unsigned PeelCount;
536 /// Allow peeling off loop iterations.
537 bool AllowPeeling;
538 /// Allow peeling off loop iterations for loop nests.
539 bool AllowLoopNestsPeeling;
540 /// Allow peeling based on profile. Used to enable peeling off all
541 /// iterations based on the provided profile.
542 /// If the value is true the peeling cost model can decide to peel only
543 /// some iterations and in this case it will set this to false.
544 bool PeelProfiledIterations;
545 };
546
547 /// Get target-customized preferences for the generic loop peeling
548 /// transformation. The caller will initialize \p PP with the current
549 /// target-independent defaults with information from \p L and \p SE.
550 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
551 PeelingPreferences &PP) const;
552
553 /// Targets can implement their own combinations for target-specific
554 /// intrinsics. This function will be called from the InstCombine pass every
555 /// time a target-specific intrinsic is encountered.
556 ///
557 /// \returns None to not do anything target specific or a value that will be
558 /// returned from the InstCombiner. It is possible to return null and stop
559 /// further processing of the intrinsic by returning nullptr.
560 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
561 IntrinsicInst &II) const;
562 /// Can be used to implement target-specific instruction combining.
563 /// \see instCombineIntrinsic
564 Optional<Value *>
565 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
566 APInt DemandedMask, KnownBits &Known,
567 bool &KnownBitsComputed) const;
568 /// Can be used to implement target-specific instruction combining.
569 /// \see instCombineIntrinsic
570 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
571 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
572 APInt &UndefElts2, APInt &UndefElts3,
573 std::function<void(Instruction *, unsigned, APInt, APInt &)>
574 SimplifyAndSetOp) const;
575 /// @}
576
577 /// \name Scalar Target Information
578 /// @{
579
580 /// Flags indicating the kind of support for population count.
581 ///
582 /// Compared to the SW implementation, HW support is supposed to
583 /// significantly boost the performance when the population is dense, and it
584 /// may or may not degrade performance if the population is sparse. A HW
585 /// support is considered as "Fast" if it can outperform, or is on a par
586 /// with, SW implementation when the population is sparse; otherwise, it is
587 /// considered as "Slow".
588 enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
589
590 /// Return true if the specified immediate is legal add immediate, that
591 /// is the target has add instructions which can add a register with the
592 /// immediate without having to materialize the immediate into a register.
593 bool isLegalAddImmediate(int64_t Imm) const;
594
595 /// Return true if the specified immediate is legal icmp immediate,
596 /// that is the target has icmp instructions which can compare a register
597 /// against the immediate without having to materialize the immediate into a
598 /// register.
599 bool isLegalICmpImmediate(int64_t Imm) const;
600
601 /// Return true if the addressing mode represented by AM is legal for
602 /// this target, for a load/store of the specified type.
603 /// The type may be VoidTy, in which case only return true if the addressing
604 /// mode is legal for a load/store of any legal type.
605 /// If target returns true in LSRWithInstrQueries(), I may be valid.
606 /// TODO: Handle pre/postinc as well.
607 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
608 bool HasBaseReg, int64_t Scale,
609 unsigned AddrSpace = 0,
610 Instruction *I = nullptr) const;
611
612 /// Return true if the LSR cost of C1 is lower than the cost of C2.
613 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
614 TargetTransformInfo::LSRCost &C2) const;
615
616 /// Return true if the major LSR cost is the number of registers. Targets which
617 /// implement their own isLSRCostLess and unset number of registers as major
618 /// cost should return false, otherwise return true.
619 bool isNumRegsMajorCostOfLSR() const;
620
621 /// \returns true if LSR should not optimize a chain that includes \p I.
622 bool isProfitableLSRChainElement(Instruction *I) const;
623
624 /// Return true if the target can fuse a compare and branch.
625 /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
626 /// calculation for the instructions in a loop.
627 bool canMacroFuseCmp() const;
628
629 /// Return true if the target can save a compare for loop count, for example
630 /// hardware loop saves a compare.
631 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
632 DominatorTree *DT, AssumptionCache *AC,
633 TargetLibraryInfo *LibInfo) const;
634
635 enum AddressingModeKind {
636 AMK_PreIndexed,
637 AMK_PostIndexed,
638 AMK_None
639 };
640
641 /// Return the preferred addressing mode LSR should make efforts to generate.
642 AddressingModeKind getPreferredAddressingMode(const Loop *L,
643 ScalarEvolution *SE) const;
644
645 /// Return true if the target supports masked store.
646 bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
647 /// Return true if the target supports masked load.
648 bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
649
650 /// Return true if the target supports nontemporal store.
651 bool isLegalNTStore(Type *DataType, Align Alignment) const;
652 /// Return true if the target supports nontemporal load.
653 bool isLegalNTLoad(Type *DataType, Align Alignment) const;
654
655 /// Return true if the target supports masked scatter.
656 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
657 /// Return true if the target supports masked gather.
658 bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
659
660 /// Return true if the target supports masked compress store.
661 bool isLegalMaskedCompressStore(Type *DataType) const;
662 /// Return true if the target supports masked expand load.
663 bool isLegalMaskedExpandLoad(Type *DataType) const;
664
665 /// Return true if we should be enabling ordered reductions for the target.
666 bool enableOrderedReductions() const;
667
668 /// Return true if the target has a unified operation to calculate division
669 /// and remainder. If so, the additional implicit multiplication and
670 /// subtraction required to calculate a remainder from division are free. This
671 /// can enable more aggressive transformations for division and remainder than
672 /// would typically be allowed using throughput or size cost models.
673 bool hasDivRemOp(Type *DataType, bool IsSigned) const;
674
675 /// Return true if the given instruction (assumed to be a memory access
676 /// instruction) has a volatile variant. If that's the case then we can avoid
677 /// addrspacecast to generic AS for volatile loads/stores. Default
678 /// implementation returns false, which prevents address space inference for
679 /// volatile loads/stores.
680 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
681
682 /// Return true if target doesn't mind addresses in vectors.
683 bool prefersVectorizedAddressing() const;
684
685 /// Return the cost of the scaling factor used in the addressing
686 /// mode represented by AM for this target, for a load/store
687 /// of the specified type.
688 /// If the AM is supported, the return value must be >= 0.
689 /// If the AM is not supported, it returns a negative value.
690 /// TODO: Handle pre/postinc as well.
691 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
692 int64_t BaseOffset, bool HasBaseReg,
693 int64_t Scale,
694 unsigned AddrSpace = 0) const;
695
696 /// Return true if the loop strength reduce pass should make
697 /// Instruction* based TTI queries to isLegalAddressingMode(). This is
698 /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
699 /// immediate offset and no index register.
700 bool LSRWithInstrQueries() const;
701
702 /// Return true if it's free to truncate a value of type Ty1 to type
703 /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
704 /// by referencing its sub-register AX.
705 bool isTruncateFree(Type *Ty1, Type *Ty2) const;
706
707 /// Return true if it is profitable to hoist instruction in the
708 /// then/else to before if.
709 bool isProfitableToHoist(Instruction *I) const;
710
711 bool useAA() const;
712
713 /// Return true if this type is legal.
714 bool isTypeLegal(Type *Ty) const;
715
716 /// Returns the estimated number of registers required to represent \p Ty.
717 InstructionCost getRegUsageForType(Type *Ty) const;
718
719 /// Return true if switches should be turned into lookup tables for the
720 /// target.
721 bool shouldBuildLookupTables() const;
722
723 /// Return true if switches should be turned into lookup tables
724 /// containing this constant value for the target.
725 bool shouldBuildLookupTablesForConstant(Constant *C) const;
726
727 /// Return true if lookup tables should be turned into relative lookup tables.
728 bool shouldBuildRelLookupTables() const;
729
730 /// Return true if the input function, which is cold at all call sites,
731 /// should use the coldcc calling convention.
732 bool useColdCCForColdCall(Function &F) const;
733
734 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
735 /// are set if the demanded result elements need to be inserted and/or
736 /// extracted from vectors.
737 InstructionCost getScalarizationOverhead(VectorType *Ty,
738 const APInt &DemandedElts,
739 bool Insert, bool Extract) const;
740
741 /// Estimate the overhead of scalarizing an instruction's unique
742 /// non-constant operands. The (potentially vector) types to use for each
743 /// argument are passed via Tys.
744 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
745 ArrayRef<Type *> Tys) const;
746
747 /// If target has efficient vector element load/store instructions, it can
748 /// return true here so that insertion/extraction costs are not added to
749 /// the scalarization cost of a load/store.
750 bool supportsEfficientVectorElementLoadStore() const;
751
752 /// Don't restrict interleaved unrolling to small loops.
753 bool enableAggressiveInterleaving(bool LoopHasReductions) const;
754
755 /// Returns options for expansion of memcmp. IsZeroCmp is
756 // true if this is the expansion of memcmp(p1, p2, s) == 0.
757 struct MemCmpExpansionOptions {
758 // Return true if memcmp expansion is enabled.
759 operator bool() const { return MaxNumLoads > 0; }
760
761 // Maximum number of load operations.
762 unsigned MaxNumLoads = 0;
763
764 // The list of available load sizes (in bytes), sorted in decreasing order.
765 SmallVector<unsigned, 8> LoadSizes;
766
767 // For memcmp expansion when the memcmp result is only compared equal or
768 // not-equal to 0, allow up to this number of load pairs per block. As an
769 // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
770 // a0 = load2bytes &a[0]
771 // b0 = load2bytes &b[0]
772 // a2 = load1byte &a[2]
773 // b2 = load1byte &b[2]
774 // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
775 unsigned NumLoadsPerBlock = 1;
776
777 // Set to true to allow overlapping loads. For example, 7-byte compares can
778 // be done with two 4-byte compares instead of 4+2+1-byte compares. This
779 // requires all loads in LoadSizes to be doable in an unaligned way.
780 bool AllowOverlappingLoads = false;
781 };
782 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
783 bool IsZeroCmp) const;
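The expansion sketched in the MemCmpExpansionOptions comment above ('memcmp(a, b, 3) == 0' done with one 2-byte and one 1-byte load per side) can be written in portable C++ as follows. This is a hedged illustration of the idea only, not the code the expansion emits; memcpy stands in for the unaligned loads.

#include <cstdint>
#include <cstring>

bool memcmp3IsZero(const unsigned char *A, const unsigned char *B) {
  uint16_t A0, B0;
  std::memcpy(&A0, A, 2);               // a0 = load2bytes &a[0]
  std::memcpy(&B0, B, 2);               // b0 = load2bytes &b[0]
  uint8_t A2 = A[2];                    // a2 = load1byte  &a[2]
  uint8_t B2 = B[2];                    // b2 = load1byte  &b[2]
  return ((A0 ^ B0) | (A2 ^ B2)) == 0;  // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
}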
784
785 /// Enable matching of interleaved access groups.
786 bool enableInterleavedAccessVectorization() const;
787
788 /// Enable matching of interleaved access groups that contain predicated
789 /// accesses or gaps and therefore vectorized using masked
790 /// vector loads/stores.
791 bool enableMaskedInterleavedAccessVectorization() const;
792
793 /// Indicate that it is potentially unsafe to automatically vectorize
794 /// floating-point operations because the semantics of vector and scalar
795 /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
796 /// does not support IEEE-754 denormal numbers, while depending on the
797 /// platform, scalar floating-point math does.
798 /// This applies to floating-point math operations and calls, not memory
799 /// operations, shuffles, or casts.
800 bool isFPVectorizationPotentiallyUnsafe() const;
801
802 /// Determine if the target supports unaligned memory accesses.
803 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
804 unsigned AddressSpace = 0,
805 Align Alignment = Align(1),
806 bool *Fast = nullptr) const;
807
808 /// Return hardware support for population count.
809 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
810
811 /// Return true if the hardware has a fast square-root instruction.
812 bool haveFastSqrt(Type *Ty) const;
813
814 /// Return true if it is faster to check if a floating-point value is NaN
815 /// (or not-NaN) versus a comparison against a constant FP zero value.
816 /// Targets should override this if materializing a 0.0 for comparison is
817 /// generally as cheap as checking for ordered/unordered.
818 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
819
820 /// Return the expected cost of supporting the floating point operation
821 /// of the specified type.
822 InstructionCost getFPOpCost(Type *Ty) const;
823
824 /// Return the expected cost of materialization for the given integer
825 /// immediate of the specified type.
826 InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
827 TargetCostKind CostKind) const;
828
829 /// Return the expected cost of materialization for the given integer
830 /// immediate of the specified type for a given instruction. The cost can be
831 /// zero if the immediate can be folded into the specified instruction.
832 InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
833 const APInt &Imm, Type *Ty,
834 TargetCostKind CostKind,
835 Instruction *Inst = nullptr) const;
836 InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
837 const APInt &Imm, Type *Ty,
838 TargetCostKind CostKind) const;
839
840 /// Return the expected cost for the given integer when optimising
841 /// for size. It differs from the other integer immediate cost
842 /// functions in that it is subtarget agnostic. This is useful when,
843 /// e.g., you target one ISA such as AArch32 but smaller encodings may be
844 /// possible with another, such as Thumb. The return value is used as a penalty
845 /// when the total cost for a constant is calculated (the bigger the cost, the
846 /// more beneficial constant hoisting is).
847 InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
848 const APInt &Imm, Type *Ty) const;
849 /// @}
850
851 /// \name Vector Target Information
852 /// @{
853
854 /// The various kinds of shuffle patterns for vector queries.
855 enum ShuffleKind {
856 SK_Broadcast, ///< Broadcast element 0 to all other elements.
857 SK_Reverse, ///< Reverse the order of the vector.
858 SK_Select, ///< Selects elements from the corresponding lane of
859 ///< either source operand. This is equivalent to a
860 ///< vector select with a constant condition operand.
861 SK_Transpose, ///< Transpose two vectors.
862 SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
863 SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
864 SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
865 ///< with any shuffle mask.
866 SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any
867 ///< shuffle mask.
868 SK_Splice ///< Concatenates elements from the first input vector
869 ///< with elements of the second input vector, returning
870 ///< a vector of the same type as the input vectors.
871 };
872
873 /// Additional information about an operand's possible values.
874 enum OperandValueKind {
875 OK_AnyValue, // Operand can have any value.
876 OK_UniformValue, // Operand is uniform (splat of a value).
877 OK_UniformConstantValue, // Operand is uniform constant.
878 OK_NonUniformConstantValue // Operand is a non-uniform constant value.
879 };
880
881 /// Additional properties of an operand's values.
882 enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
883
884 /// \return the number of registers in the target-provided register class.
885 unsigned getNumberOfRegisters(unsigned ClassID) const;
886
887 /// \return the target-provided register class ID for the provided type,
888 /// accounting for type promotion and other type-legalization techniques that
889 /// the target might apply. However, it specifically does not account for the
890 /// scalarization or splitting of vector types. Should a vector type require
891 /// scalarization or splitting into multiple underlying vector registers, that
892 /// type should be mapped to a register class containing no registers.
893 /// Specifically, this is designed to provide a simple, high-level view of the
894 /// register allocation later performed by the backend. These register classes
895 /// don't necessarily map onto the register classes used by the backend.
896 /// FIXME: It's not currently possible to determine how many registers
897 /// are used by the provided type.
898 unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;
899
900 /// \return the target-provided register class name
901 const char *getRegisterClassName(unsigned ClassID) const;
902
903 enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };
904
905 /// \return The width of the largest scalar or vector register type.
906 TypeSize getRegisterBitWidth(RegisterKind K) const;
907
908 /// \return The width of the smallest vector register type.
909 unsigned getMinVectorRegisterBitWidth() const;
910
911 /// \return The maximum value of vscale if the target specifies an
912 /// architectural maximum vector length, and None otherwise.
913 Optional<unsigned> getMaxVScale() const;
914
915 /// \return True if the vectorization factor should be chosen to
916 /// make the vector of the smallest element type match the size of a
917 /// vector register. For wider element types, this could result in
918 /// creating vectors that span multiple vector registers.
919 /// If false, the vectorization factor will be chosen based on the
920 /// size of the widest element type.
921 bool shouldMaximizeVectorBandwidth() const;
922
923 /// \return The minimum vectorization factor for types of given element
924 /// bit width, or 0 if there is no minimum VF. The returned value only
925 /// applies when shouldMaximizeVectorBandwidth returns true.
926 /// If IsScalable is true, the returned ElementCount must be a scalable VF.
927 ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;
928
929 /// \return The maximum vectorization factor for types of given element
930 /// bit width and opcode, or 0 if there is no maximum VF.
931 /// Currently only used by the SLP vectorizer.
932 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
933
934 /// \return True if it should be considered for address type promotion.
935 /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
936 /// profitable without finding other extensions fed by the same input.
937 bool shouldConsiderAddressTypePromotion(
938 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
939
940 /// \return The size of a cache line in bytes.
941 unsigned getCacheLineSize() const;
942
943 /// The possible cache levels
944 enum class CacheLevel {
945 L1D, // The L1 data cache
946 L2D, // The L2 data cache
947
948 // We currently do not model L3 caches, as their sizes differ widely between
949 // microarchitectures, and we do not yet have a use for L3 cache
950 // size modeling.
951 };
952
953 /// \return The size of the cache level in bytes, if available.
954 Optional<unsigned> getCacheSize(CacheLevel Level) const;
955
956 /// \return The associativity of the cache level, if available.
957 Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
958
959 /// \return How much before a load we should place the prefetch
960 /// instruction. This is currently measured in number of
961 /// instructions.
962 unsigned getPrefetchDistance() const;
963
964 /// Some HW prefetchers can handle accesses up to a certain constant stride.
965 /// Sometimes prefetching is beneficial even below the HW prefetcher limit,
966 /// and the arguments provided are meant to serve as a basis for deciding this
967 /// for a particular loop.
968 ///
969 /// \param NumMemAccesses Number of memory accesses in the loop.
970 /// \param NumStridedMemAccesses Number of the memory accesses that
971 /// ScalarEvolution could find a known stride
972 /// for.
973 /// \param NumPrefetches Number of software prefetches that will be
974 /// emitted as determined by the addresses
975 /// involved and the cache line size.
976 /// \param HasCall True if the loop contains a call.
977 ///
978 /// \return This is the minimum stride in bytes where it makes sense to start
979 /// adding SW prefetches. The default is 1, i.e. prefetch with any
980 /// stride.
981 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
982 unsigned NumStridedMemAccesses,
983 unsigned NumPrefetches, bool HasCall) const;
984
985 /// \return The maximum number of iterations to prefetch ahead. If
986 /// the required number of iterations is more than this number, no
987 /// prefetching is performed.
988 unsigned getMaxPrefetchIterationsAhead() const;
989
990 /// \return True if prefetching should also be done for writes.
991 bool enableWritePrefetching() const;
992
993 /// \return The maximum interleave factor that any transform should try to
994 /// perform for this target. This number depends on the level of parallelism
995 /// and the number of execution units in the CPU.
996 unsigned getMaxInterleaveFactor(unsigned VF) const;
997
998 /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
999 static OperandValueKind getOperandInfo(const Value *V,
1000 OperandValueProperties &OpProps);
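As a sketch of using this hook, assuming BO is a BinaryOperator * already in scope:

// Classify the second operand of BO for cost purposes.
TargetTransformInfo::OperandValueProperties Props = TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueKind Kind =
    TargetTransformInfo::getOperandInfo(BO->getOperand(1), Props);
// A splat of a power-of-two constant would typically yield
// OK_UniformConstantValue with Props == OP_PowerOf2.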
1001
1002 /// This is an approximation of reciprocal throughput of a math/logic op.
1003 /// A higher cost indicates less expected throughput.
1004 /// From Agner Fog's guides, reciprocal throughput is "the average number of
1005 /// clock cycles per instruction when the instructions are not part of a
1006 /// limiting dependency chain."
1007 /// Therefore, costs should be scaled to account for multiple execution units
1008 /// on the target that can process this type of instruction. For example, if
1009 /// there are 5 scalar integer units and 2 vector integer units that can
1010 /// calculate an 'add' in a single cycle, this model should indicate that the
1011 /// cost of the vector add instruction is 2.5 times the cost of the scalar
1012 /// add instruction.
1013 /// \p Args is an optional argument which holds the instruction operands
1014 /// values so the TTI can analyze those values searching for special
1015 /// cases or optimizations based on those values.
1016 /// \p CxtI is the optional original context instruction, if one exists, to
1017 /// provide even more information.
1018 InstructionCost getArithmeticInstrCost(
1019 unsigned Opcode, Type *Ty,
1020 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1021 OperandValueKind Opd1Info = OK_AnyValue,
1022 OperandValueKind Opd2Info = OK_AnyValue,
1023 OperandValueProperties Opd1PropInfo = OP_None,
1024 OperandValueProperties Opd2PropInfo = OP_None,
1025 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
1026 const Instruction *CxtI = nullptr) const;
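A sketch of the query, assuming TTI is a const TargetTransformInfo & and Ctx an LLVMContext in scope; with the 5-scalar/2-vector example above, a target model would be expected to report the vector cost at roughly 2.5x the scalar one.

InstructionCost ScalarAdd = TTI.getArithmeticInstrCost(
    Instruction::Add, Type::getInt32Ty(Ctx),
    TargetTransformInfo::TCK_RecipThroughput);
InstructionCost VectorAdd = TTI.getArithmeticInstrCost(
    Instruction::Add, FixedVectorType::get(Type::getInt32Ty(Ctx), 4),
    TargetTransformInfo::TCK_RecipThroughput);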
1027
1028 /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
1029 /// The exact mask may be passed as Mask, or else the array will be empty.
1030 /// The index and subtype parameters are used by the subvector insertion and
1031 /// extraction shuffle kinds to show the insert/extract point and the type of
1032 /// the subvector being inserted/extracted.
1033 /// NOTE: For subvector extractions Tp represents the source type.
1034 InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
1035 ArrayRef<int> Mask = None, int Index = 0,
1036 VectorType *SubTp = nullptr) const;
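For example (a sketch; TTI and Ctx as above), broadcasting lane 0 of a <4 x float> versus reversing it can be costed as follows, letting the mask, index and subtype arguments take their defaults:

auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
InstructionCost BroadcastCost =
    TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, V4F32);
InstructionCost ReverseCost =
    TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, V4F32);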
1037
1038 /// Represents a hint about the context in which a cast is used.
1039 ///
1040 /// For zext/sext, the context of the cast is the operand, which must be a
1041 /// load of some kind. For trunc, the context of the cast is the single
1042 /// user of the instruction, which must be a store of some kind.
1043 ///
1044 /// This enum allows the vectorizer to give getCastInstrCost an idea of the
1045 /// type of cast it's dealing with, as not every cast is equal. For instance,
1046 /// the zext of a load may be free, but the zext of an interleaving load can
1047 /// be (very) expensive!
1048 ///
1049 /// See \c getCastContextHint to compute a CastContextHint from a cast
1050 /// Instruction*. Callers can use it if they don't need to override the
1051 /// context and just want it to be calculated from the instruction.
1052 ///
1053 /// FIXME: This handles the types of load/store that the vectorizer can
1054 /// produce, which are the cases where the context instruction is most
1055 /// likely to be incorrect. There are other situations where that can happen
1056 /// too, which might be handled here but in the long run a more general
1057 /// solution of costing multiple instructions at the same time may be better.
1058 enum class CastContextHint : uint8_t {
1059 None, ///< The cast is not used with a load/store of any kind.
1060 Normal, ///< The cast is used with a normal load/store.
1061 Masked, ///< The cast is used with a masked load/store.
1062 GatherScatter, ///< The cast is used with a gather/scatter.
1063 Interleave, ///< The cast is used with an interleaved load/store.
1064 Reversed, ///< The cast is used with a reversed load/store.
1065 };
1066
1067 /// Calculates a CastContextHint from \p I.
1068 /// This should be used by callers of getCastInstrCost if they wish to
1069 /// determine the context from some instruction.
1070 /// \returns the CastContextHint for ZExt/SExt/Trunc, None if \p I is nullptr,
1071 /// or if it's another type of cast.
1072 static CastContextHint getCastContextHint(const Instruction *I);
1073
1074 /// \return The expected cost of cast instructions, such as bitcast, trunc,
1075 /// zext, etc. If there is an existing instruction that holds Opcode, it
1076 /// may be passed in the 'I' parameter.
1077 InstructionCost
1078 getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1079 TTI::CastContextHint CCH,
1080 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
1081 const Instruction *I = nullptr) const;
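Putting the two pieces together, a caller holding a zero-extend instruction ZI (assumed here to be an Instruction * whose operand is a load) might cost it like this sketch:

TargetTransformInfo::CastContextHint CCH =
    TargetTransformInfo::getCastContextHint(ZI); // e.g. CastContextHint::Normal
InstructionCost CastCost = TTI.getCastInstrCost(
    ZI->getOpcode(), ZI->getType(), ZI->getOperand(0)->getType(), CCH,
    TargetTransformInfo::TCK_RecipThroughput, ZI);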
1082
1083 /// \return The expected cost of a sign- or zero-extended vector extract. Use
1084 /// -1 to indicate that there is no information about the index value.
1085 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1086 VectorType *VecTy,
1087 unsigned Index = -1) const;
1088
1089 /// \return The expected cost of control-flow related instructions such as
1090 /// Phi, Ret, Br, Switch.
1091 InstructionCost
1092 getCFInstrCost(unsigned Opcode,
1093 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
1094 const Instruction *I = nullptr) const;
1095
1096 /// \returns The expected cost of compare and select instructions. If there
1097 /// is an existing instruction that holds Opcode, it may be passed in the
1098 /// 'I' parameter. The \p VecPred parameter can be used to indicate the select
1099 /// is using a compare with the specified predicate as condition. When vector
1100 /// types are passed, \p VecPred must be used for all lanes.
1101 InstructionCost
1102 getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
1103 CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE,
1104 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1105 const Instruction *I = nullptr) const;
1106
1107 /// \return The expected cost of vector Insert and Extract.
1108 /// Use -1 to indicate that there is no information on the index value.
1109 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1110 unsigned Index = -1) const;
1111
1112 /// \return The cost of Load and Store instructions.
1113 InstructionCost
1114 getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1115 unsigned AddressSpace,
1116 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1117 const Instruction *I = nullptr) const;
1118
1119 /// \return The cost of masked Load and Store instructions.
1120 InstructionCost getMaskedMemoryOpCost(
1121 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
1122 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1123
1124 /// \return The cost of Gather or Scatter operation
1125 /// \p Opcode - is a type of memory access Load or Store
1126 /// \p DataTy - a vector type of the data to be loaded or stored
1127 /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
1128 /// \p VariableMask - true when the memory access is predicated with a mask
1129 /// that is not a compile-time constant
1130 /// \p Alignment - alignment of single element
1131 /// \p I - the optional original context instruction, if one exists, e.g. the
1132 /// load/store to transform or the call to the gather/scatter intrinsic
1133 InstructionCost getGatherScatterOpCost(
1134 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1135 Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1136 const Instruction *I = nullptr) const;
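As a sketch of how a vectorizer-style client might compare the two memory shapes (with TTI and Ctx as above, and Ptr an assumed const Value * address operand):

auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
InstructionCost ContiguousLoad = TTI.getMemoryOpCost(
    Instruction::Load, V4I32, Align(4), /*AddressSpace=*/0);
InstructionCost GatherLoad = TTI.getGatherScatterOpCost(
    Instruction::Load, V4I32, Ptr, /*VariableMask=*/false, Align(4));
// A target without native gather support would be expected to report a much
// higher GatherLoad cost, steering clients away from that shape.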
1137
1138 /// \return The cost of the interleaved memory operation.
1139 /// \p Opcode is the memory operation code
1140 /// \p VecTy is the vector type of the interleaved access.
1141 /// \p Factor is the interleave factor
1142 /// \p Indices is the indices for interleaved load members (as interleaved
1143 /// load allows gaps)
1144 /// \p Alignment is the alignment of the memory operation
1145 /// \p AddressSpace is address space of the pointer.
1146 /// \p UseMaskForCond indicates if the memory access is predicated.
1147 /// \p UseMaskForGaps indicates if gaps should be masked.
1148 InstructionCost getInterleavedMemoryOpCost(
1149 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1150 Align Alignment, unsigned AddressSpace,
1151 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
1152 bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
1153
1154 /// A helper function to determine the type of reduction algorithm used
1155 /// for a given opcode and set of FastMathFlags \p FMF.
1156 static bool requiresOrderedReduction(Optional<FastMathFlags> FMF) {
1157 return FMF != None && !(*FMF).allowReassoc();
1158 }
1159
1160 /// Calculate the cost of vector reduction intrinsics.
1161 ///
1162 /// This is the cost of reducing the vector value of type \p Ty to a scalar
1163 /// value using the operation denoted by \p Opcode. The FastMathFlags
1164 /// parameter \p FMF indicates what type of reduction we are performing:
1165 /// 1. Tree-wise. This is the typical 'fast' reduction performed that
1166 /// involves successively splitting a vector into half and doing the
1167 /// operation on the pair of halves until you have a scalar value. For
1168 /// example:
1169 /// (v0, v1, v2, v3)
1170 /// ((v0+v2), (v1+v3), undef, undef)
1171 /// ((v0+v2+v1+v3), undef, undef, undef)
1172 /// This is the default behaviour for integer operations, whereas for
1173 /// floating point we only do this if \p FMF indicates that
1174 /// reassociation is allowed.
1175 /// 2. Ordered. For a vector with N elements this involves performing N
1176 /// operations in lane order, starting with an initial scalar value, i.e.
1177 /// result = InitVal + v0
1178 /// result = result + v1
1179 /// result = result + v2
1180 /// result = result + v3
1181 /// This is only the case for FP operations and when reassociation is not
1182 /// allowed.
1183 ///
1184 InstructionCost getArithmeticReductionCost(
1185 unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
1186 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
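A sketch contrasting the two shapes for an FP add reduction, assuming TTI and a VectorType *VecTy in scope:

FastMathFlags Reassoc;
Reassoc.setAllowReassoc();                     // permits the tree-wise form
InstructionCost TreeCost =
    TTI.getArithmeticReductionCost(Instruction::FAdd, VecTy, Reassoc);
InstructionCost OrderedCost =
    TTI.getArithmeticReductionCost(Instruction::FAdd, VecTy, FastMathFlags());
// Strict FP flags (no reassociation) imply the ordered, in-lane-order form:
bool Ordered =
    TargetTransformInfo::requiresOrderedReduction(FastMathFlags()); // true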
1187
1188 InstructionCost getMinMaxReductionCost(
1189 VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
1190 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1191
1192 /// Calculate the cost of an extended reduction pattern, similar to
1193 /// getArithmeticReductionCost of an Add reduction with an extension and
1194 /// optional multiply. This is the cost of:
1195 /// ResTy vecreduce.add(ext(Ty A)), or if the IsMLA flag is set:
1196 /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B))). The reduction happens
1197 /// on a VectorType with ResTy elements and Ty lanes.
1198 InstructionCost getExtendedAddReductionCost(
1199 bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
1200 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1201
1202 /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
1203 /// Three cases are handled: 1. scalar instruction 2. vector instruction
1204 /// 3. scalar instruction which is to be vectorized.
1205 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1206 TTI::TargetCostKind CostKind) const;
1207
1208 /// \returns The cost of Call instructions.
1209 InstructionCost getCallInstrCost(
1210 Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1211 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
1212
1213 /// \returns The number of pieces into which the provided type must be
1214 /// split during legalization. Zero is returned when the answer is unknown.
1215 unsigned getNumberOfParts(Type *Tp) const;
1216
1217 /// \returns The cost of the address computation. For most targets this can be
1218 /// merged into the instruction indexing mode. Some targets might want to
1219 /// distinguish between address computation for memory operations on vector
1220 /// types and scalar types. Such targets should override this function.
1221 /// The 'SE' parameter holds pointer for the scalar evolution object which
1222 /// is used in order to get the Ptr step value in case of constant stride.
1223 /// The 'Ptr' parameter holds SCEV of the access pointer.
1224 InstructionCost getAddressComputationCost(Type *Ty,
1225 ScalarEvolution *SE = nullptr,
1226 const SCEV *Ptr = nullptr) const;
1227
1228 /// \returns The cost, if any, of keeping values of the given types alive
1229 /// over a callsite.
1230 ///
1231 /// Some types may require the use of register classes that do not have
1232 /// any callee-saved registers, so would require a spill and fill.
1233 InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
1234
1235 /// \returns True if the intrinsic is a supported memory intrinsic. Info
1236 /// will contain additional information - whether the intrinsic may write
1237 /// or read to memory, volatility and the pointer. Info is undefined
1238 /// if false is returned.
1239 bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
1240
1241 /// \returns The maximum element size, in bytes, for an element
1242 /// unordered-atomic memory intrinsic.
1243 unsigned getAtomicMemIntrinsicMaxElementSize() const;
1244
1245 /// \returns A value which is the result of the given memory intrinsic. New
1246 /// instructions may be created to extract the result from the given intrinsic
1247 /// memory operation. Returns nullptr if the target cannot create a result
1248 /// from the given intrinsic.
1249 Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1250 Type *ExpectedType) const;
1251
1252 /// \returns The type to use in a loop expansion of a memcpy call.
1253 Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1254 unsigned SrcAddrSpace, unsigned DestAddrSpace,
1255 unsigned SrcAlign, unsigned DestAlign) const;
1256
1257 /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
1258 /// \param RemainingBytes The number of bytes to copy.
1259 ///
1260 /// Calculates the operand types to use when copying \p RemainingBytes of
1261 /// memory, where source and destination alignments are \p SrcAlign and
1262 /// \p DestAlign respectively.
1263 void getMemcpyLoopResidualLoweringType(
1264 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1265 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
1266 unsigned SrcAlign, unsigned DestAlign) const;
1267
1268 /// \returns True if the two functions have compatible attributes for inlining
1269 /// purposes.
1270 bool areInlineCompatible(const Function *Caller,
1271 const Function *Callee) const;
1272
1273 /// \returns True if the caller and callee agree on how \p Args will be passed
1274 /// to the callee.
1275 /// \param[out] Args The list of compatible arguments. The implementation may
1276 /// filter out any incompatible args from this list.
1277 bool areFunctionArgsABICompatible(const Function *Caller,
1278 const Function *Callee,
1279 SmallPtrSetImpl<Argument *> &Args) const;
1280
1281 /// The type of load/store indexing.
1282 enum MemIndexedMode {
1283 MIM_Unindexed, ///< No indexing.
1284 MIM_PreInc, ///< Pre-incrementing.
1285 MIM_PreDec, ///< Pre-decrementing.
1286 MIM_PostInc, ///< Post-incrementing.
1287 MIM_PostDec ///< Post-decrementing.
1288 };
1289
1290 /// \returns True if the specified indexed load for the given type is legal.
1291 bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
1292
1293 /// \returns True if the specified indexed store for the given type is legal.
1294 bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
1295
1296 /// \returns The bitwidth of the largest vector type that should be used to
1297 /// load/store in the given address space.
1298 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
1299
1300 /// \returns True if the load instruction is legal to vectorize.
1301 bool isLegalToVectorizeLoad(LoadInst *LI) const;
1302
1303 /// \returns True if the store instruction is legal to vectorize.
1304 bool isLegalToVectorizeStore(StoreInst *SI) const;
1305
1306 /// \returns True if it is legal to vectorize the given load chain.
1307 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
1308 unsigned AddrSpace) const;
1309
1310 /// \returns True if it is legal to vectorize the given store chain.
1311 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
1312 unsigned AddrSpace) const;
1313
1314 /// \returns True if it is legal to vectorize the given reduction kind.
1315 bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
1316 ElementCount VF) const;
1317
1318 /// \returns True if the given type is supported for scalable vectors
1319 bool isElementTypeLegalForScalableVector(Type *Ty) const;
1320
1321 /// \returns The new vector factor value if the target doesn't support \p
1322 /// SizeInBytes loads or has a better vector factor.
1323 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1324 unsigned ChainSizeInBytes,
1325 VectorType *VecTy) const;
1326
1327 /// \returns The new vector factor value if the target doesn't support \p
1328 /// SizeInBytes stores or has a better vector factor.
1329 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1330 unsigned ChainSizeInBytes,
1331 VectorType *VecTy) const;
1332
1333 /// Flags describing the kind of vector reduction.
1334 struct ReductionFlags {
1335 ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
1336 bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
1337 bool IsSigned; ///< Whether the operation is a signed int reduction.
1338 bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
1339 };
1340
1341 /// \returns True if the target prefers reductions in loop.
1342 bool preferInLoopReduction(unsigned Opcode, Type *Ty,
1343 ReductionFlags Flags) const;
1344
1345 /// \returns True if the target prefers the reduction select kept in the loop
1346 /// when tail folding, i.e.
1347 /// loop:
1348 /// p = phi (0, s)
1349 /// a = add (p, x)
1350 /// s = select (mask, a, p)
1351 /// vecreduce.add(s)
1352 ///
1353 /// As opposed to the normal scheme of p = phi (0, a) which allows the select
1354 /// to be pulled out of the loop. If the select(.., add, ..) can be predicated
1355 /// by the target, this can lead to cleaner code generation.
1356 bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
1357 ReductionFlags Flags) const;
1358
1359 /// \returns True if the target wants to expand the given reduction intrinsic
1360 /// into a shuffle sequence.
1361 bool shouldExpandReduction(const IntrinsicInst *II) const;
1362
1363 /// \returns the size cost of rematerializing a GlobalValue address relative
1364 /// to a stack reload.
1365 unsigned getGISelRematGlobalCost() const;
1366
1367 /// \returns True if the target supports scalable vectors.
1368 bool supportsScalableVectors() const;
1369
1370 /// \name Vector Predication Information
1371 /// @{
1372 /// Whether the target supports the %evl parameter of VP intrinsics
1373 /// efficiently in hardware (see LLVM Language Reference, "Vector Predication
1374 /// Intrinsics"). Use of %evl is discouraged when that is not the case.
1375 bool hasActiveVectorLength() const;
1376
1377 struct VPLegalization {
1378 enum VPTransform {
1379 // keep the predicating parameter
1380 Legal = 0,
1381 // where legal, discard the predicate parameter
1382 Discard = 1,
1383 // transform into something else that is also predicating
1384 Convert = 2
1385 };
1386
1387 // How to transform the EVL parameter.
1388 // Legal: keep the EVL parameter as it is.
1389 // Discard: Ignore the EVL parameter where it is safe to do so.
1390 // Convert: Fold the EVL into the mask parameter.
1391 VPTransform EVLParamStrategy;
1392
1393 // How to transform the operator.
1394 // Legal: The target supports this operator.
1395 // Convert: Convert this to a non-VP operation.
1396 // The 'Discard' strategy is invalid.
1397 VPTransform OpStrategy;
1398
1399 bool shouldDoNothing() const {
1400 return (EVLParamStrategy == Legal) && (OpStrategy == Legal);
1401 }
1402 VPLegalization(VPTransform EVLParamStrategy, VPTransform OpStrategy)
1403 : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {}
1404 };
1405
1406 /// \returns How the target needs this vector-predicated operation to be
1407 /// transformed.
1408 VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
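A sketch of how a lowering pass might act on the returned strategy, assuming VPI is a const VPIntrinsic & in scope:

TargetTransformInfo::VPLegalization VPL = TTI.getVPLegalizationStrategy(VPI);
if (!VPL.shouldDoNothing()) {
  if (VPL.EVLParamStrategy == TargetTransformInfo::VPLegalization::Convert) {
    // Fold the EVL operand into the mask operand before lowering.
  }
  if (VPL.OpStrategy == TargetTransformInfo::VPLegalization::Convert) {
    // Replace the VP operation with an equivalent unpredicated operation.
  }
}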
1409 /// @}
1410
1411 /// @}
1412
1413private:
1414 /// Estimate the latency of the specified instruction.
1415 /// Returns 1 as the default value.
1416 InstructionCost getInstructionLatency(const Instruction *I) const;
1417
1418 /// Returns the expected throughput cost of the instruction.
1419 /// Returns -1 if the cost is unknown.
1420 InstructionCost getInstructionThroughput(const Instruction *I) const;
1421
1422 /// The abstract base class used to type erase specific TTI
1423 /// implementations.
1424 class Concept;
1425
1426 /// The template model for the base class which wraps a concrete
1427 /// implementation in a type erased interface.
1428 template <typename T> class Model;
1429
1430 std::unique_ptr<Concept> TTIImpl;
1431};
1432
1433class TargetTransformInfo::Concept {
1434public:
1435 virtual ~Concept() = 0;
1436 virtual const DataLayout &getDataLayout() const = 0;
1437 virtual InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
1438 ArrayRef<const Value *> Operands,
1439 TTI::TargetCostKind CostKind) = 0;
1440 virtual unsigned getInliningThresholdMultiplier() = 0;
1441 virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
1442 virtual int getInlinerVectorBonusPercent() = 0;
1443 virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
1444 virtual unsigned
1445 getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
1446 ProfileSummaryInfo *PSI,
1447 BlockFrequencyInfo *BFI) = 0;
1448 virtual InstructionCost getUserCost(const User *U,
1449 ArrayRef<const Value *> Operands,
1450 TargetCostKind CostKind) = 0;
1451 virtual BranchProbability getPredictableBranchThreshold() = 0;
1452 virtual bool hasBranchDivergence() = 0;
1453 virtual bool useGPUDivergenceAnalysis() = 0;
1454 virtual bool isSourceOfDivergence(const Value *V) = 0;
1455 virtual bool isAlwaysUniform(const Value *V) = 0;
1456 virtual unsigned getFlatAddressSpace() = 0;
1457 virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1458 Intrinsic::ID IID) const = 0;
1459 virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
1460 virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
1461 virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1462 Value *OldV,
1463 Value *NewV) const = 0;
1464 virtual bool isLoweredToCall(const Function *F) = 0;
1465 virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
1466 UnrollingPreferences &UP,
1467 OptimizationRemarkEmitter *ORE) = 0;
1468 virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1469 PeelingPreferences &PP) = 0;
1470 virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1471 AssumptionCache &AC,
1472 TargetLibraryInfo *LibInfo,
1473 HardwareLoopInfo &HWLoopInfo) = 0;
1474 virtual bool
1475 preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1476 AssumptionCache &AC, TargetLibraryInfo *TLI,
1477 DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
1478 virtual bool emitGetActiveLaneMask() = 0;
1479 virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
1480 IntrinsicInst &II) = 0;
1481 virtual Optional<Value *>
1482 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1483 APInt DemandedMask, KnownBits &Known,
1484 bool &KnownBitsComputed) = 0;
1485 virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
1486 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1487 APInt &UndefElts2, APInt &UndefElts3,
1488 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1489 SimplifyAndSetOp) = 0;
1490 virtual bool isLegalAddImmediate(int64_t Imm) = 0;
1491 virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
1492 virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
1493 int64_t BaseOffset, bool HasBaseReg,
1494 int64_t Scale, unsigned AddrSpace,
1495 Instruction *I) = 0;
1496 virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1497 TargetTransformInfo::LSRCost &C2) = 0;
1498 virtual bool isNumRegsMajorCostOfLSR() = 0;
1499 virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
1500 virtual bool canMacroFuseCmp() = 0;
1501 virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
1502 LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1503 TargetLibraryInfo *LibInfo) = 0;
1504 virtual AddressingModeKind
1505 getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
1506 virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
1507 virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
1508 virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
1509 virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
1510 virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
1511 virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
1512 virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
1513 virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
1514 virtual bool enableOrderedReductions() = 0;
1515 virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
1516 virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
1517 virtual bool prefersVectorizedAddressing() = 0;
1518 virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1519 int64_t BaseOffset,
1520 bool HasBaseReg, int64_t Scale,
1521 unsigned AddrSpace) = 0;
1522 virtual bool LSRWithInstrQueries() = 0;
1523 virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
1524 virtual bool isProfitableToHoist(Instruction *I) = 0;
1525 virtual bool useAA() = 0;
1526 virtual bool isTypeLegal(Type *Ty) = 0;
1527 virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
1528 virtual bool shouldBuildLookupTables() = 0;
1529 virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
1530 virtual bool shouldBuildRelLookupTables() = 0;
1531 virtual bool useColdCCForColdCall(Function &F) = 0;
1532 virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
1533 const APInt &DemandedElts,
1534 bool Insert,
1535 bool Extract) = 0;
1536 virtual InstructionCost
1537 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1538 ArrayRef<Type *> Tys) = 0;
1539 virtual bool supportsEfficientVectorElementLoadStore() = 0;
1540 virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
1541 virtual MemCmpExpansionOptions
1542 enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
1543 virtual bool enableInterleavedAccessVectorization() = 0;
1544 virtual bool enableMaskedInterleavedAccessVectorization() = 0;
1545 virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
1546 virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1547 unsigned BitWidth,
1548 unsigned AddressSpace,
1549 Align Alignment,
1550 bool *Fast) = 0;
1551 virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
1552 virtual bool haveFastSqrt(Type *Ty) = 0;
1553 virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
1554 virtual InstructionCost getFPOpCost(Type *Ty) = 0;
1555 virtual InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
1556 const APInt &Imm, Type *Ty) = 0;
1557 virtual InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
1558 TargetCostKind CostKind) = 0;
1559 virtual InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
1560 const APInt &Imm, Type *Ty,
1561 TargetCostKind CostKind,
1562 Instruction *Inst = nullptr) = 0;
1563 virtual InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
1564 const APInt &Imm, Type *Ty,
1565 TargetCostKind CostKind) = 0;
1566 virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
1567 virtual unsigned getRegisterClassForType(bool Vector,
1568 Type *Ty = nullptr) const = 0;
1569 virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
1570 virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0;
1571 virtual unsigned getMinVectorRegisterBitWidth() const = 0;
1572 virtual Optional<unsigned> getMaxVScale() const = 0;
1573 virtual bool shouldMaximizeVectorBandwidth() const = 0;
1574 virtual ElementCount getMinimumVF(unsigned ElemWidth,
1575 bool IsScalable) const = 0;
1576 virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
1577 virtual bool shouldConsiderAddressTypePromotion(
1578 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1579 virtual unsigned getCacheLineSize() const = 0;
1580 virtual Optional<unsigned> getCacheSize(CacheLevel Level) const = 0;
1581 virtual Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0;
1582
1583 /// \return How much before a load we should place the prefetch
1584 /// instruction. This is currently measured in number of
1585 /// instructions.
1586 virtual unsigned getPrefetchDistance() const = 0;
1587
1588 /// \return Some HW prefetchers can handle accesses up to a certain
1589 /// constant stride. This is the minimum stride in bytes where it
1590 /// makes sense to start adding SW prefetches. The default is 1,
1591 /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
1592 /// even below the HW prefetcher limit, and the arguments provided are
1593 /// meant to serve as a basis for deciding this for a particular loop.
1594 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
1595 unsigned NumStridedMemAccesses,
1596 unsigned NumPrefetches,
1597 bool HasCall) const = 0;
1598
1599 /// \return The maximum number of iterations to prefetch ahead. If
1600 /// the required number of iterations is more than this number, no
1601 /// prefetching is performed.
1602 virtual unsigned getMaxPrefetchIterationsAhead() const = 0;
1603
1604 /// \return True if prefetching should also be done for writes.
1605 virtual bool enableWritePrefetching() const = 0;
1606
1607 virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
1608 virtual InstructionCost getArithmeticInstrCost(
1609 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1610 OperandValueKind Opd1Info, OperandValueKind Opd2Info,
1611 OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo,
1612 ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0;
1613 virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
1614 ArrayRef<int> Mask, int Index,
1615 VectorType *SubTp) = 0;
1616 virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst,
1617 Type *Src, CastContextHint CCH,
1618 TTI::TargetCostKind CostKind,
1619 const Instruction *I) = 0;
1620 virtual InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1621 VectorType *VecTy,
1622 unsigned Index) = 0;
1623 virtual InstructionCost getCFInstrCost(unsigned Opcode,
1624 TTI::TargetCostKind CostKind,
1625 const Instruction *I = nullptr) = 0;
1626 virtual InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1627 Type *CondTy,
1628 CmpInst::Predicate VecPred,
1629 TTI::TargetCostKind CostKind,
1630 const Instruction *I) = 0;
1631 virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1632 unsigned Index) = 0;
1633 virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1634 Align Alignment,
1635 unsigned AddressSpace,
1636 TTI::TargetCostKind CostKind,
1637 const Instruction *I) = 0;
1638 virtual InstructionCost
1639 getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1640 unsigned AddressSpace,
1641 TTI::TargetCostKind CostKind) = 0;
1642 virtual InstructionCost
1643 getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
1644 bool VariableMask, Align Alignment,
1645 TTI::TargetCostKind CostKind,
1646 const Instruction *I = nullptr) = 0;
1647
1648 virtual InstructionCost getInterleavedMemoryOpCost(
1649 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1650 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1651 bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
1652 virtual InstructionCost
1653 getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1654 Optional<FastMathFlags> FMF,
1655 TTI::TargetCostKind CostKind) = 0;
1656 virtual InstructionCost
1657 getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
1658 TTI::TargetCostKind CostKind) = 0;
1659 virtual InstructionCost getExtendedAddReductionCost(
1660 bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
1661 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
1662 virtual InstructionCost
1663 getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1664 TTI::TargetCostKind CostKind) = 0;
1665 virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
1666 ArrayRef<Type *> Tys,
1667 TTI::TargetCostKind CostKind) = 0;
1668 virtual unsigned getNumberOfParts(Type *Tp) = 0;
1669 virtual InstructionCost
1670 getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
1671 virtual InstructionCost
1672 getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
1673 virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1674 MemIntrinsicInfo &Info) = 0;
1675 virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
1676 virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1677 Type *ExpectedType) = 0;
1678 virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1679 unsigned SrcAddrSpace,
1680 unsigned DestAddrSpace,
1681 unsigned SrcAlign,
1682 unsigned DestAlign) const = 0;
1683 virtual void getMemcpyLoopResidualLoweringType(
1684 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1685 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
1686 unsigned SrcAlign, unsigned DestAlign) const = 0;
1687 virtual bool areInlineCompatible(const Function *Caller,
1688 const Function *Callee) const = 0;
1689 virtual bool
1690 areFunctionArgsABICompatible(const Function *Caller, const Function *Callee,
1691 SmallPtrSetImpl<Argument *> &Args) const = 0;
1692 virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1693 virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1694 virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
1695 virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
1696 virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
1697 virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1698 Align Alignment,
1699 unsigned AddrSpace) const = 0;
1700 virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1701 Align Alignment,
1702 unsigned AddrSpace) const = 0;
1703 virtual bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
1704 ElementCount VF) const = 0;
1705 virtual bool isElementTypeLegalForScalableVector(Type *Ty) const = 0;
1706 virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1707 unsigned ChainSizeInBytes,
1708 VectorType *VecTy) const = 0;
1709 virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1710 unsigned ChainSizeInBytes,
1711 VectorType *VecTy) const = 0;
1712 virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
1713 ReductionFlags) const = 0;
1714 virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
1715 ReductionFlags) const = 0;
1716 virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1717 virtual unsigned getGISelRematGlobalCost() const = 0;
1718 virtual bool supportsScalableVectors() const = 0;
1719 virtual bool hasActiveVectorLength() const = 0;
1720 virtual InstructionCost getInstructionLatency(const Instruction *I) = 0;
1721 virtual VPLegalization
1722 getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
1723};
1724
1725template <typename T>
1726class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
1727 T Impl;
1728
1729public:
1730 Model(T Impl) : Impl(std::move(Impl)) {}
1731 ~Model() override {}
1732
1733 const DataLayout &getDataLayout() const override {
1734 return Impl.getDataLayout();
1735 }
1736
1737 InstructionCost
1738 getGEPCost(Type *PointeeType, const Value *Ptr,
1739 ArrayRef<const Value *> Operands,
1740 enum TargetTransformInfo::TargetCostKind CostKind) override {
1741 return Impl.getGEPCost(PointeeType, Ptr, Operands);
1742 }
1743 unsigned getInliningThresholdMultiplier() override {
1744 return Impl.getInliningThresholdMultiplier();
1745 }
1746 unsigned adjustInliningThreshold(const CallBase *CB) override {
1747 return Impl.adjustInliningThreshold(CB);
1748 }
1749 int getInlinerVectorBonusPercent() override {
1750 return Impl.getInlinerVectorBonusPercent();
1751 }
1752 InstructionCost getMemcpyCost(const Instruction *I) override {
1753 return Impl.getMemcpyCost(I);
1754 }
1755 InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
1756 TargetCostKind CostKind) override {
1757 return Impl.getUserCost(U, Operands, CostKind);
1758 }
1759 BranchProbability getPredictableBranchThreshold() override {
1760 return Impl.getPredictableBranchThreshold();
1761 }
1762 bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
1763 bool useGPUDivergenceAnalysis() override {
1764 return Impl.useGPUDivergenceAnalysis();
1765 }
1766 bool isSourceOfDivergence(const Value *V) override {
1767 return Impl.isSourceOfDivergence(V);
1768 }
1769
1770 bool isAlwaysUniform(const Value *V) override {
1771 return Impl.isAlwaysUniform(V);
1772 }
1773
1774 unsigned getFlatAddressSpace() override { return Impl.getFlatAddressSpace(); }
1775
1776 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1777 Intrinsic::ID IID) const override {
1778 return Impl.collectFlatAddressOperands(OpIndexes, IID);
1779 }
1780
1781 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
1782 return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
1783 }
1784
1785 unsigned getAssumedAddrSpace(const Value *V) const override {
1786 return Impl.getAssumedAddrSpace(V);
1787 }
1788
1789 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
1790 Value *NewV) const override {
1791 return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
1792 }
1793
1794 bool isLoweredToCall(const Function *F) override {
1795 return Impl.isLoweredToCall(F);
1796 }
1797 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1798 UnrollingPreferences &UP,
1799 OptimizationRemarkEmitter *ORE) override {
1800 return Impl.getUnrollingPreferences(L, SE, UP, ORE);
1801 }
1802 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1803 PeelingPreferences &PP) override {
1804 return Impl.getPeelingPreferences(L, SE, PP);
1805 }
1806 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1807 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
1808 HardwareLoopInfo &HWLoopInfo) override {
1809 return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
1810 }
1811 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1812 AssumptionCache &AC, TargetLibraryInfo *TLI,
1813 DominatorTree *DT,
1814 const LoopAccessInfo *LAI) override {
1815 return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
1816 }
1817 bool emitGetActiveLaneMask() override {
1818 return Impl.emitGetActiveLaneMask();
1819 }
1820 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
1821 IntrinsicInst &II) override {
1822 return Impl.instCombineIntrinsic(IC, II);
1823 }
1824 Optional<Value *>
1825 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1826 APInt DemandedMask, KnownBits &Known,
1827 bool &KnownBitsComputed) override {
1828 return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
1829 KnownBitsComputed);
1830 }
1831 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
1832 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1833 APInt &UndefElts2, APInt &UndefElts3,
1834 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1835 SimplifyAndSetOp) override {
1836 return Impl.simplifyDemandedVectorEltsIntrinsic(
1837 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
1838 SimplifyAndSetOp);
1839 }
1840 bool isLegalAddImmediate(int64_t Imm) override {
1841 return Impl.isLegalAddImmediate(Imm);
1842 }
1843 bool isLegalICmpImmediate(int64_t Imm) override {
1844 return Impl.isLegalICmpImmediate(Imm);
1845 }
1846 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1847 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
1848 Instruction *I) override {
1849 return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1850 AddrSpace, I);
1851 }
1852 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1853 TargetTransformInfo::LSRCost &C2) override {
1854 return Impl.isLSRCostLess(C1, C2);
1855 }
1856 bool isNumRegsMajorCostOfLSR() override {
1857 return Impl.isNumRegsMajorCostOfLSR();
1858 }
1859 bool isProfitableLSRChainElement(Instruction *I) override {
1860 return Impl.isProfitableLSRChainElement(I);
1861 }
1862 bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); }
1863 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
1864 DominatorTree *DT, AssumptionCache *AC,
1865 TargetLibraryInfo *LibInfo) override {
1866 return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
1867 }
1868 AddressingModeKind
1869 getPreferredAddressingMode(const Loop *L,
1870 ScalarEvolution *SE) const override {
1871 return Impl.getPreferredAddressingMode(L, SE);
1872 }
1873 bool isLegalMaskedStore(Type *DataType, Align Alignment) override {
1874 return Impl.isLegalMaskedStore(DataType, Alignment);
1875 }
1876 bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
1877 return Impl.isLegalMaskedLoad(DataType, Alignment);
1878 }
1879 bool isLegalNTStore(Type *DataType, Align Alignment) override {
1880 return Impl.isLegalNTStore(DataType, Alignment);
1881 }
1882 bool isLegalNTLoad(Type *DataType, Align Alignment) override {
1883 return Impl.isLegalNTLoad(DataType, Alignment);
1884 }
1885 bool isLegalMaskedScatter(Type *DataType, Align Alignment) override {
1886 return Impl.isLegalMaskedScatter(DataType, Alignment);
1887 }
1888 bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
1889 return Impl.isLegalMaskedGather(DataType, Alignment);
1890 }
1891 bool isLegalMaskedCompressStore(Type *DataType) override {
1892 return Impl.isLegalMaskedCompressStore(DataType);
1893 }
1894 bool isLegalMaskedExpandLoad(Type *DataType) override {
1895 return Impl.isLegalMaskedExpandLoad(DataType);
1896 }
1897 bool enableOrderedReductions() override {
1898 return Impl.enableOrderedReductions();
1899 }
1900 bool hasDivRemOp(Type *DataType, bool IsSigned) override {
1901 return Impl.hasDivRemOp(DataType, IsSigned);
1902 }
1903 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
1904 return Impl.hasVolatileVariant(I, AddrSpace);
1905 }
1906 bool prefersVectorizedAddressing() override {
1907 return Impl.prefersVectorizedAddressing();
1908 }
1909 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1910 int64_t BaseOffset, bool HasBaseReg,
1911 int64_t Scale,
1912 unsigned AddrSpace) override {
1913 return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1914 AddrSpace);
1915 }
1916 bool LSRWithInstrQueries() override { return Impl.LSRWithInstrQueries(); }
1917 bool isTruncateFree(Type *Ty1, Type *Ty2) override {
1918 return Impl.isTruncateFree(Ty1, Ty2);
1919 }
1920 bool isProfitableToHoist(Instruction *I) override {
1921 return Impl.isProfitableToHoist(I);
1922 }
1923 bool useAA() override { return Impl.useAA(); }
1924 bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
1925 InstructionCost getRegUsageForType(Type *Ty) override {
1926 return Impl.getRegUsageForType(Ty);
1927 }
1928 bool shouldBuildLookupTables() override {
1929 return Impl.shouldBuildLookupTables();
1930 }
1931 bool shouldBuildLookupTablesForConstant(Constant *C) override {
1932 return Impl.shouldBuildLookupTablesForConstant(C);
1933 }
1934 bool shouldBuildRelLookupTables() override {
1935 return Impl.shouldBuildRelLookupTables();
1936 }
1937 bool useColdCCForColdCall(Function &F) override {
1938 return Impl.useColdCCForColdCall(F);
1939 }
1940
1941 InstructionCost getScalarizationOverhead(VectorType *Ty,
1942 const APInt &DemandedElts,
1943 bool Insert, bool Extract) override {
1944 return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
1945 }
1946 InstructionCost
1947 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1948 ArrayRef<Type *> Tys) override {
1949 return Impl.getOperandsScalarizationOverhead(Args, Tys);
1950 }
1951
1952 bool supportsEfficientVectorElementLoadStore() override {
1953 return Impl.supportsEfficientVectorElementLoadStore();
1954 }
1955
1956 bool enableAggressiveInterleaving(bool LoopHasReductions) override {
1957 return Impl.enableAggressiveInterleaving(LoopHasReductions);
1958 }
1959 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
1960 bool IsZeroCmp) const override {
1961 return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
1962 }
1963 bool enableInterleavedAccessVectorization() override {
1964 return Impl.enableInterleavedAccessVectorization();
1965 }
1966 bool enableMaskedInterleavedAccessVectorization() override {
1967 return Impl.enableMaskedInterleavedAccessVectorization();
1968 }
1969 bool isFPVectorizationPotentiallyUnsafe() override {
1970 return Impl.isFPVectorizationPotentiallyUnsafe();
1971 }
1972 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
1973 unsigned AddressSpace, Align Alignment,
1974 bool *Fast) override {
1975 return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
1976 Alignment, Fast);
1977 }
1978 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
1979 return Impl.getPopcntSupport(IntTyWidthInBit);
1980 }
1981 bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
1982
1983 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1984 return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1985 }
1986
1987 InstructionCost getFPOpCost(Type *Ty) override {
1988 return Impl.getFPOpCost(Ty);
1989 }
1990
1991 InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
1992 const APInt &Imm, Type *Ty) override {
1993 return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty);
1994 }
1995 InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
1996 TargetCostKind CostKind) override {
1997 return Impl.getIntImmCost(Imm, Ty, CostKind);
1998 }
1999 InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
2000 const APInt &Imm, Type *Ty,
2001 TargetCostKind CostKind,
2002 Instruction *Inst = nullptr) override {
2003 return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty, CostKind, Inst);
2004 }
2005 InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
2006 const APInt &Imm, Type *Ty,
2007 TargetCostKind CostKind) override {
2008 return Impl.getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
2009 }
2010 unsigned getNumberOfRegisters(unsigned ClassID) const override {
2011 return Impl.getNumberOfRegisters(ClassID);
2012 }
2013 unsigned getRegisterClassForType(bool Vector,
2014 Type *Ty = nullptr) const override {
2015 return Impl.getRegisterClassForType(Vector, Ty);
2016 }
2017 const char *getRegisterClassName(unsigned ClassID) const override {
2018 return Impl.getRegisterClassName(ClassID);
2019 }
2020 TypeSize getRegisterBitWidth(RegisterKind K) const override {
2021 return Impl.getRegisterBitWidth(K);
2022 }
2023 unsigned getMinVectorRegisterBitWidth() const override {
2024 return Impl.getMinVectorRegisterBitWidth();
2025 }
2026 Optional<unsigned> getMaxVScale() const override {
2027 return Impl.getMaxVScale();
2028 }
2029 bool shouldMaximizeVectorBandwidth() const override {
2030 return Impl.shouldMaximizeVectorBandwidth();
2031 }
2032 ElementCount getMinimumVF(unsigned ElemWidth,
2033 bool IsScalable) const override {
2034 return Impl.getMinimumVF(ElemWidth, IsScalable);
2035 }
2036 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
2037 return Impl.getMaximumVF(ElemWidth, Opcode);
2038 }
2039 bool shouldConsiderAddressTypePromotion(
2040 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
2041 return Impl.shouldConsiderAddressTypePromotion(
2042 I, AllowPromotionWithoutCommonHeader);
2043 }
2044 unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); }
2045 Optional<unsigned> getCacheSize(CacheLevel Level) const override {
2046 return Impl.getCacheSize(Level);
2047 }
2048 Optional<unsigned> getCacheAssociativity(CacheLevel Level) const override {
2049 return Impl.getCacheAssociativity(Level);
2050 }
2051
2052 /// Return the preferred prefetch distance in terms of instructions.
2053 ///
2054 unsigned getPrefetchDistance() const override {
2055 return Impl.getPrefetchDistance();
2056 }
2057
2058 /// Return the minimum stride necessary to trigger software
2059 /// prefetching.
2060 ///
2061 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
2062 unsigned NumStridedMemAccesses,
2063 unsigned NumPrefetches,
2064 bool HasCall) const override {
2065 return Impl.getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
2066 NumPrefetches, HasCall);
2067 }
2068
2069 /// Return the maximum prefetch distance in terms of loop
2070 /// iterations.
2071 ///
2072 unsigned getMaxPrefetchIterationsAhead() const override {
2073 return Impl.getMaxPrefetchIterationsAhead();
2074 }
2075
2076 /// \return True if prefetching should also be done for writes.
2077 bool enableWritePrefetching() const override {
2078 return Impl.enableWritePrefetching();
2079 }
2080
2081 unsigned getMaxInterleaveFactor(unsigned VF) override {
2082 return Impl.getMaxInterleaveFactor(VF);
2083 }
2084 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
2085 unsigned &JTSize,
2086 ProfileSummaryInfo *PSI,
2087 BlockFrequencyInfo *BFI) override {
2088 return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
2089 }
2090 InstructionCost getArithmeticInstrCost(
2091 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2092 OperandValueKind Opd1Info, OperandValueKind Opd2Info,
2093 OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo,
2094 ArrayRef<const Value *> Args,
2095 const Instruction *CxtI = nullptr) override {
2096 return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
2097 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
2098 }
2099 InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
2100 ArrayRef<int> Mask, int Index,
2101 VectorType *SubTp) override {
2102 return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2103 }
2104 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
2105 CastContextHint CCH,
2106 TTI::TargetCostKind CostKind,
2107 const Instruction *I) override {
2108 return Impl.getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
2109 }
2110 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
2111 VectorType *VecTy,
2112 unsigned Index) override {
2113 return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
2114 }
2115 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
2116 const Instruction *I = nullptr) override {
2117 return Impl.getCFInstrCost(Opcode, CostKind, I);
2118 }
2119 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
2120 CmpInst::Predicate VecPred,
2121 TTI::TargetCostKind CostKind,
2122 const Instruction *I) override {
2123 return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2124 }
2125 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
2126 unsigned Index) override {
2127 return Impl.getVectorInstrCost(Opcode, Val, Index);
2128 }
2129 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
2130 unsigned AddressSpace,
2131 TTI::TargetCostKind CostKind,
2132 const Instruction *I) override {
2133 return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2134 CostKind, I);
2135 }
2136 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2137 Align Alignment, unsigned AddressSpace,
2138 TTI::TargetCostKind CostKind) override {
2139 return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2140 CostKind);
2141 }
2142 InstructionCost
2143 getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
2144 bool VariableMask, Align Alignment,
2145 TTI::TargetCostKind CostKind,
2146 const Instruction *I = nullptr) override {
2147 return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2148 Alignment, CostKind, I);
2149 }
2150 InstructionCost getInterleavedMemoryOpCost(
2151 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2152 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2153 bool UseMaskForCond, bool UseMaskForGaps) override {
2154 return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2155 Alignment, AddressSpace, CostKind,
2156 UseMaskForCond, UseMaskForGaps);
2157 }
2158 InstructionCost
2159 getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2160 Optional<FastMathFlags> FMF,
2161 TTI::TargetCostKind CostKind) override {
2162 return Impl.getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2163 }
2164 InstructionCost
2165 getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
2166 TTI::TargetCostKind CostKind) override {
2167 return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2168 }
2169 InstructionCost getExtendedAddReductionCost(
2170 bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
2171 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {
2172 return Impl.getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty,
2173 CostKind);
2174 }
2175 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2176 TTI::TargetCostKind CostKind) override {
2177 return Impl.getIntrinsicInstrCost(ICA, CostKind);
2178 }
2179 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2180 ArrayRef<Type *> Tys,
2181 TTI::TargetCostKind CostKind) override {
2182 return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
2183 }
2184 unsigned getNumberOfParts(Type *Tp) override {
2185 return Impl.getNumberOfParts(Tp);
2186 }
2187 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2188 const SCEV *Ptr) override {
2189 return Impl.getAddressComputationCost(Ty, SE, Ptr);
2190 }
2191 InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
2192 return Impl.getCostOfKeepingLiveOverCall(Tys);
2193 }
2194 bool getTgtMemIntrinsic(IntrinsicInst *Inst,
2195 MemIntrinsicInfo &Info) override {
2196 return Impl.getTgtMemIntrinsic(Inst, Info);
2197 }
2198 unsigned getAtomicMemIntrinsicMaxElementSize() const override {
2199 return Impl.getAtomicMemIntrinsicMaxElementSize();
2200 }
2201 Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2202 Type *ExpectedType) override {
2203 return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
2204 }
2205 Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
2206 unsigned SrcAddrSpace, unsigned DestAddrSpace,
2207 unsigned SrcAlign,
2208 unsigned DestAlign) const override {
2209 return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
2210 DestAddrSpace, SrcAlign, DestAlign);
2211 }
2212 void getMemcpyLoopResidualLoweringType(
2213 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
2214 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
2215 unsigned SrcAlign, unsigned DestAlign) const override {
2216 Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
2217 SrcAddrSpace, DestAddrSpace,
2218 SrcAlign, DestAlign);
2219 }
2220 bool areInlineCompatible(const Function *Caller,
2221 const Function *Callee) const override {
2222 return Impl.areInlineCompatible(Caller, Callee);
2223 }
2224 bool areFunctionArgsABICompatible(
2225 const Function *Caller, const Function *Callee,
2226 SmallPtrSetImpl<Argument *> &Args) const override {
2227 return Impl.areFunctionArgsABICompatible(Caller, Callee, Args);
2228 }
2229 bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
2230 return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
2231 }
2232 bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
2233 return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
2234 }
2235 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
2236 return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
2237 }
2238 bool isLegalToVectorizeLoad(LoadInst *LI) const override {
2239 return Impl.isLegalToVectorizeLoad(LI);
2240 }
2241 bool isLegalToVectorizeStore(StoreInst *SI) const override {
2242 return Impl.isLegalToVectorizeStore(SI);
2243 }
2244 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
2245 unsigned AddrSpace) const override {
2246 return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
2247 AddrSpace);
2248 }
2249 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
2250 unsigned AddrSpace) const override {
2251 return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
2252 AddrSpace);
2253 }
2254 bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
2255 ElementCount VF) const override {
2256 return Impl.isLegalToVectorizeReduction(RdxDesc, VF);
2257 }
2258 bool isElementTypeLegalForScalableVector(Type *Ty) const override {
2259 return Impl.isElementTypeLegalForScalableVector(Ty);
2260 }
2261 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
2262 unsigned ChainSizeInBytes,
2263 VectorType *VecTy) const override {
2264 return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
2265 }
2266 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
2267 unsigned ChainSizeInBytes,
2268 VectorType *VecTy) const override {
2269 return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
2270 }
2271 bool preferInLoopReduction(unsigned Opcode, Type *Ty,
2272 ReductionFlags Flags) const override {
2273 return Impl.preferInLoopReduction(Opcode, Ty, Flags);
2274 }
2275 bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
2276 ReductionFlags Flags) const override {
2277 return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
2278 }
2279 bool shouldExpandReduction(const IntrinsicInst *II) const override {
2280 return Impl.shouldExpandReduction(II);
2281 }
2282
2283 unsigned getGISelRematGlobalCost() const override {
2284 return Impl.getGISelRematGlobalCost();
2285 }
2286
2287 bool supportsScalableVectors() const override {
2288 return Impl.supportsScalableVectors();
2289 }
2290
2291 bool hasActiveVectorLength() const override {
2292 return Impl.hasActiveVectorLength();
2293 }
2294
2295 InstructionCost getInstructionLatency(const Instruction *I) override {
2296 return Impl.getInstructionLatency(I);
2297 }
2298
2299 VPLegalization
2300 getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
2301 return Impl.getVPLegalizationStrategy(PI);
2302 }
2303};
2304
2305template <typename T>
2306TargetTransformInfo::TargetTransformInfo(T Impl)
2307 : TTIImpl(new Model<T>(Impl)) {}
2308
2309/// Analysis pass providing the \c TargetTransformInfo.
2310///
2311/// The core idea of the TargetIRAnalysis is to expose an interface through
2312/// which LLVM targets can analyze and provide information about the middle
2313/// end's target-independent IR. This supports use cases such as target-aware
2314/// cost modeling of IR constructs.
2315///
2316/// This is a function analysis because much of the cost modeling for targets
2317/// is done in a subtarget specific way and LLVM supports compiling different
2318/// functions targeting different subtargets in order to support runtime
2319/// dispatch according to the observed subtarget.
2320class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
2321public:
2322 typedef TargetTransformInfo Result;
2323
2324 /// Default construct a target IR analysis.
2325 ///
2326 /// This will use the module's datalayout to construct a baseline
2327 /// conservative TTI result.
2328 TargetIRAnalysis();
2329
2330 /// Construct an IR analysis pass around a target-provide callback.
2331 ///
2332 /// The callback will be called with a particular function for which the TTI
2333 /// is needed and must return a TTI object for that function.
2334 TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);
2335
2336 // Value semantics. We spell out the constructors for MSVC.
2337 TargetIRAnalysis(const TargetIRAnalysis &Arg)
2338 : TTICallback(Arg.TTICallback) {}
2339 TargetIRAnalysis(TargetIRAnalysis &&Arg)
2340 : TTICallback(std::move(Arg.TTICallback)) {}
2341 TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
2342 TTICallback = RHS.TTICallback;
2343 return *this;
2344 }
2345 TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
2346 TTICallback = std::move(RHS.TTICallback);
2347 return *this;
2348 }
2349
2350 Result run(const Function &F, FunctionAnalysisManager &);
2351
2352private:
2353 friend AnalysisInfoMixin<TargetIRAnalysis>;
2354 static AnalysisKey Key;
2355
2356 /// The callback used to produce a result.
2357 ///
2358 /// We use a completely opaque callback so that targets can provide whatever
2359 /// mechanism they desire for constructing the TTI for a given function.
2360 ///
2361 /// FIXME: Should we really use std::function? It's relatively inefficient.
2362 /// It might be possible to arrange for even stateful callbacks to outlive
2363 /// the analysis and thus use a function_ref which would be lighter weight.
2364 /// This may also be less error prone as the callback is likely to reference
2365 /// the external TargetMachine, and that reference needs to never dangle.
2366 std::function<Result(const Function &)> TTICallback;
2367
2368 /// Helper function used as the callback in the default constructor.
2369 static Result getDefaultTTI(const Function &F);
2370};
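
For orientation, here is a minimal sketch (not part of this header) of how a standalone tool might run this analysis through the new pass manager. It assumes PassBuilder-based registration and an already-materialized Function; without a TargetMachine-backed callback, the default-constructed TargetIRAnalysis yields the conservative, datalayout-based TTI described above.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Passes/PassBuilder.h"

unsigned queryCacheLineSize(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::FunctionAnalysisManager FAM;
  // Registers TargetIRAnalysis along with the other stock function analyses.
  PB.registerFunctionAnalyses(FAM);
  const llvm::TargetTransformInfo &TTI =
      FAM.getResult<llvm::TargetIRAnalysis>(F);
  return TTI.getCacheLineSize();
}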
2371
2372/// Wrapper pass for TargetTransformInfo.
2373///
2374/// This pass can be constructed from a TTI object which it stores internally
2375/// and is queried by passes.
2376class TargetTransformInfoWrapperPass : public ImmutablePass {
2377 TargetIRAnalysis TIRA;
2378 Optional<TargetTransformInfo> TTI;
2379
2380 virtual void anchor();
2381
2382public:
2383 static char ID;
2384
2385 /// We must provide a default constructor for the pass but it should
2386 /// never be used.
2387 ///
2388 /// Use the constructor below or call one of the creation routines.
2389 TargetTransformInfoWrapperPass();
2390
2391 explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
2392
2393 TargetTransformInfo &getTTI(const Function &F);
2394};
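
A corresponding sketch for the legacy pass manager, where a pass declares a dependency on the wrapper and pulls the per-function TTI out of it; MyLegacyPass is a made-up name used only for illustration.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Pass.h"

struct MyLegacyPass : public llvm::FunctionPass {
  static char ID;
  MyLegacyPass() : llvm::FunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // Ask the legacy pass manager to provide the TTI wrapper.
    AU.addRequired<llvm::TargetTransformInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(llvm::Function &F) override {
    const llvm::TargetTransformInfo &TTI =
        getAnalysis<llvm::TargetTransformInfoWrapperPass>().getTTI(F);
    (void)TTI.getCacheLineSize(); // example query; the IR is left unchanged
    return false;
  }
};
char MyLegacyPass::ID = 0;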
2395
2396/// Create an analysis pass wrapper around a TTI object.
2397///
2398/// This analysis pass just holds the TTI instance and makes it available to
2399/// clients.
2400ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
2401
2402} // namespace llvm
2403
2404#endif

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/ADT/SmallVector.h

1//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SmallVector class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_ADT_SMALLVECTOR_H
14#define LLVM_ADT_SMALLVECTOR_H
15
16#include "llvm/ADT/iterator_range.h"
17#include "llvm/Support/Compiler.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MemAlloc.h"
20#include "llvm/Support/type_traits.h"
21#include <algorithm>
22#include <cassert>
23#include <cstddef>
24#include <cstdlib>
25#include <cstring>
26#include <functional>
27#include <initializer_list>
28#include <iterator>
29#include <limits>
30#include <memory>
31#include <new>
32#include <type_traits>
33#include <utility>
34
35namespace llvm {
36
37/// This is all the stuff common to all SmallVectors.
38///
39/// The template parameter specifies the type which should be used to hold the
40/// Size and Capacity of the SmallVector, so it can be adjusted.
41/// Using 32 bit size is desirable to shrink the size of the SmallVector.
42/// Using 64 bit size is desirable for cases like SmallVector<char>, where a
43/// 32 bit size would limit the vector to ~4GB. SmallVectors are used for
44/// buffering bitcode output - which can exceed 4GB.
45template <class Size_T> class SmallVectorBase {
46protected:
47 void *BeginX;
48 Size_T Size = 0, Capacity;
49
50 /// The maximum value of the Size_T used.
51 static constexpr size_t SizeTypeMax() {
52 return std::numeric_limits<Size_T>::max();
53 }
54
55 SmallVectorBase() = delete;
56 SmallVectorBase(void *FirstEl, size_t TotalCapacity)
57 : BeginX(FirstEl), Capacity(TotalCapacity) {}
58
59 /// This is a helper for \a grow() that's out of line to reduce code
60 /// duplication. This function will report a fatal error if it can't grow at
61 /// least to \p MinSize.
62 void *mallocForGrow(size_t MinSize, size_t TSize, size_t &NewCapacity);
63
64 /// This is an implementation of the grow() method which only works
65 /// on POD-like data types and is out of line to reduce code duplication.
66 /// This function will report a fatal error if it cannot increase capacity.
67 void grow_pod(void *FirstEl, size_t MinSize, size_t TSize);
68
69public:
70 size_t size() const { return Size; }
71 size_t capacity() const { return Capacity; }
72
73 LLVM_NODISCARD bool empty() const { return !Size; }
14. Assuming field 'Size' is not equal to 0
15. Returning zero, which participates in a condition later
74
75 /// Set the array size to \p N, which the current array must have enough
76 /// capacity for.
77 ///
78 /// This does not construct or destroy any elements in the vector.
79 ///
80 /// Clients can use this in conjunction with capacity() to write past the end
81 /// of the buffer when they know that more elements are available, and only
82 /// update the size later. This avoids the cost of value initializing elements
83 /// which will only be overwritten.
84 void set_size(size_t N) {
85 assert(N <= capacity());
86 Size = N;
87 }
88};
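
As a concrete illustration of the set_size()/capacity() contract documented above, a small sketch: reserve space, let a producer write directly into the buffer, then publish the final length. fillBuffer is a hypothetical callback, not an LLVM API.

// Hypothetical producer: writes up to MaxLen bytes into Buf, returns the count.
size_t fillBuffer(char *Buf, size_t MaxLen);

void readInto(llvm::SmallVectorImpl<char> &Out, size_t MaxLen) {
  Out.clear();
  Out.reserve(MaxLen);                              // capacity() >= MaxLen
  size_t Written = fillBuffer(Out.data(), MaxLen);  // writes past end()
  Out.set_size(Written);  // publish the size without value-initializing elements
}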
89
90template <class T>
91using SmallVectorSizeType =
92 typename std::conditional<sizeof(T) < 4 && sizeof(void *) >= 8, uint64_t,
93 uint32_t>::type;
94
95/// Figure out the offset of the first element.
96template <class T, typename = void> struct SmallVectorAlignmentAndSize {
97 alignas(SmallVectorBase<SmallVectorSizeType<T>>) char Base[sizeof(
98 SmallVectorBase<SmallVectorSizeType<T>>)];
99 alignas(T) char FirstEl[sizeof(T)];
100};
101
102/// This is the part of SmallVectorTemplateBase which does not depend on whether
103/// the type T is a POD. The extra dummy template argument is used by ArrayRef
104/// to avoid unnecessarily requiring T to be complete.
105template <typename T, typename = void>
106class SmallVectorTemplateCommon
107 : public SmallVectorBase<SmallVectorSizeType<T>> {
108 using Base = SmallVectorBase<SmallVectorSizeType<T>>;
109
110 /// Find the address of the first element. For this pointer math to be valid
111 /// with small-size of 0 for T with lots of alignment, it's important that
112 /// SmallVectorStorage is properly-aligned even for small-size of 0.
113 void *getFirstEl() const {
114 return const_cast<void *>(reinterpret_cast<const void *>(
115 reinterpret_cast<const char *>(this) +
116 offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
117 }
118 // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
119
120protected:
121 SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {}
122
123 void grow_pod(size_t MinSize, size_t TSize) {
124 Base::grow_pod(getFirstEl(), MinSize, TSize);
125 }
126
127 /// Return true if this is a smallvector which has not had dynamic
128 /// memory allocated for it.
129 bool isSmall() const { return this->BeginX == getFirstEl(); }
130
131 /// Put this vector in a state of being small.
132 void resetToSmall() {
133 this->BeginX = getFirstEl();
134 this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
135 }
136
137 /// Return true if V is an internal reference to the given range.
138 bool isReferenceToRange(const void *V, const void *First, const void *Last) const {
139 // Use std::less to avoid UB.
140 std::less<> LessThan;
141 return !LessThan(V, First) && LessThan(V, Last);
142 }
143
144 /// Return true if V is an internal reference to this vector.
145 bool isReferenceToStorage(const void *V) const {
146 return isReferenceToRange(V, this->begin(), this->end());
147 }
148
149 /// Return true if First and Last form a valid (possibly empty) range in this
150 /// vector's storage.
151 bool isRangeInStorage(const void *First, const void *Last) const {
152 // Use std::less to avoid UB.
153 std::less<> LessThan;
154 return !LessThan(First, this->begin()) && !LessThan(Last, First) &&
155 !LessThan(this->end(), Last);
156 }
157
158 /// Return true unless Elt will be invalidated by resizing the vector to
159 /// NewSize.
160 bool isSafeToReferenceAfterResize(const void *Elt, size_t NewSize) {
161 // Past the end.
162 if (LLVM_LIKELY(!isReferenceToStorage(Elt)))
163 return true;
164
165 // Return false if Elt will be destroyed by shrinking.
166 if (NewSize <= this->size())
167 return Elt < this->begin() + NewSize;
168
169 // Return false if we need to grow.
170 return NewSize <= this->capacity();
171 }
172
173 /// Check whether Elt will be invalidated by resizing the vector to NewSize.
174 void assertSafeToReferenceAfterResize(const void *Elt, size_t NewSize) {
175 assert(isSafeToReferenceAfterResize(Elt, NewSize) &&
176 "Attempting to reference an element of the vector in an operation "
177 "that invalidates it");
178 }
179
180 /// Check whether Elt will be invalidated by increasing the size of the
181 /// vector by N.
182 void assertSafeToAdd(const void *Elt, size_t N = 1) {
183 this->assertSafeToReferenceAfterResize(Elt, this->size() + N);
184 }
185
186 /// Check whether any part of the range will be invalidated by clearing.
187 void assertSafeToReferenceAfterClear(const T *From, const T *To) {
188 if (From == To)
189 return;
190 this->assertSafeToReferenceAfterResize(From, 0);
191 this->assertSafeToReferenceAfterResize(To - 1, 0);
192 }
193 template <
194 class ItTy,
195 std::enable_if_t<!std::is_same<std::remove_const_t<ItTy>, T *>::value,
196 bool> = false>
197 void assertSafeToReferenceAfterClear(ItTy, ItTy) {}
198
199 /// Check whether any part of the range will be invalidated by growing.
200 void assertSafeToAddRange(const T *From, const T *To) {
201 if (From == To)
202 return;
203 this->assertSafeToAdd(From, To - From);
204 this->assertSafeToAdd(To - 1, To - From);
205 }
206 template <
207 class ItTy,
208 std::enable_if_t<!std::is_same<std::remove_const_t<ItTy>, T *>::value,
209 bool> = false>
210 void assertSafeToAddRange(ItTy, ItTy) {}
211
212 /// Reserve enough space to add one element, and return the updated element
213 /// pointer in case it was a reference to the storage.
214 template <class U>
215 static const T *reserveForParamAndGetAddressImpl(U *This, const T &Elt,
216 size_t N) {
217 size_t NewSize = This->size() + N;
218 if (LLVM_LIKELY(NewSize <= This->capacity()))
219 return &Elt;
220
221 bool ReferencesStorage = false;
222 int64_t Index = -1;
223 if (!U::TakesParamByValue) {
224 if (LLVM_UNLIKELY(This->isReferenceToStorage(&Elt))) {
225 ReferencesStorage = true;
226 Index = &Elt - This->begin();
227 }
228 }
229 This->grow(NewSize);
230 return ReferencesStorage ? This->begin() + Index : &Elt;
231 }
232
233public:
234 using size_type = size_t;
235 using difference_type = ptrdiff_t;
236 using value_type = T;
237 using iterator = T *;
238 using const_iterator = const T *;
239
240 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
241 using reverse_iterator = std::reverse_iterator<iterator>;
242
243 using reference = T &;
244 using const_reference = const T &;
245 using pointer = T *;
246 using const_pointer = const T *;
247
248 using Base::capacity;
249 using Base::empty;
250 using Base::size;
251
252 // forward iterator creation methods.
253 iterator begin() { return (iterator)this->BeginX; }
254 const_iterator begin() const { return (const_iterator)this->BeginX; }
255 iterator end() { return begin() + size(); }
256 const_iterator end() const { return begin() + size(); }
257
258 // reverse iterator creation methods.
259 reverse_iterator rbegin() { return reverse_iterator(end()); }
260 const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
261 reverse_iterator rend() { return reverse_iterator(begin()); }
262 const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
263
264 size_type size_in_bytes() const { return size() * sizeof(T); }
265 size_type max_size() const {
266 return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T));
267 }
268
269 size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
270
271 /// Return a pointer to the vector's buffer, even if empty().
272 pointer data() { return pointer(begin()); }
273 /// Return a pointer to the vector's buffer, even if empty().
274 const_pointer data() const { return const_pointer(begin()); }
275
276 reference operator[](size_type idx) {
277 assert(idx < size());
278 return begin()[idx];
279 }
280 const_reference operator[](size_type idx) const {
281 assert(idx < size());
282 return begin()[idx];
283 }
284
285 reference front() {
286 assert(!empty());
287 return begin()[0];
288 }
289 const_reference front() const {
290 assert(!empty());
291 return begin()[0];
292 }
293
294 reference back() {
295 assert(!empty());
296 return end()[-1];
297 }
298 const_reference back() const {
299 assert(!empty());
300 return end()[-1];
301 }
302};
303
304/// SmallVectorTemplateBase<TriviallyCopyable = false> - This is where we put
305/// method implementations that are designed to work with non-trivial T's.
306///
307/// We approximate is_trivially_copyable with trivial move/copy construction and
308/// trivial destruction. While the standard doesn't specify that you're allowed
309/// copy these types with memcpy, there is no way for the type to observe this.
310/// This catches the important case of std::pair<POD, POD>, which is not
311/// trivially assignable.
312template <typename T, bool = (is_trivially_copy_constructible<T>::value) &&
313 (is_trivially_move_constructible<T>::value) &&
314 std::is_trivially_destructible<T>::value>
315class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
316 friend class SmallVectorTemplateCommon<T>;
317
318protected:
319 static constexpr bool TakesParamByValue = false;
320 using ValueParamT = const T &;
321
322 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
323
324 static void destroy_range(T *S, T *E) {
325 while (S != E) {
326 --E;
327 E->~T();
328 }
329 }
330
331 /// Move the range [I, E) into the uninitialized memory starting with "Dest",
332 /// constructing elements as needed.
333 template<typename It1, typename It2>
334 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
335 std::uninitialized_copy(std::make_move_iterator(I),
336 std::make_move_iterator(E), Dest);
337 }
338
339 /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
340 /// constructing elements as needed.
341 template<typename It1, typename It2>
342 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
343 std::uninitialized_copy(I, E, Dest);
344 }
345
346 /// Grow the allocated memory (without initializing new elements), doubling
347 /// the size of the allocated memory. Guarantees space for at least one more
348 /// element, or MinSize more elements if specified.
349 void grow(size_t MinSize = 0);
350
351 /// Create a new allocation big enough for \p MinSize and pass back its size
352 /// in \p NewCapacity. This is the first section of \a grow().
353 T *mallocForGrow(size_t MinSize, size_t &NewCapacity) {
354 return static_cast<T *>(
355 SmallVectorBase<SmallVectorSizeType<T>>::mallocForGrow(
356 MinSize, sizeof(T), NewCapacity));
357 }
358
359 /// Move existing elements over to the new allocation \p NewElts, the middle
360 /// section of \a grow().
361 void moveElementsForGrow(T *NewElts);
362
363 /// Transfer ownership of the allocation, finishing up \a grow().
364 void takeAllocationForGrow(T *NewElts, size_t NewCapacity);
365
366 /// Reserve enough space to add one element, and return the updated element
367 /// pointer in case it was a reference to the storage.
368 const T *reserveForParamAndGetAddress(const T &Elt, size_t N = 1) {
369 return this->reserveForParamAndGetAddressImpl(this, Elt, N);
370 }
371
372 /// Reserve enough space to add one element, and return the updated element
373 /// pointer in case it was a reference to the storage.
374 T *reserveForParamAndGetAddress(T &Elt, size_t N = 1) {
375 return const_cast<T *>(
376 this->reserveForParamAndGetAddressImpl(this, Elt, N));
377 }
378
379 static T &&forward_value_param(T &&V) { return std::move(V); }
380 static const T &forward_value_param(const T &V) { return V; }
381
382 void growAndAssign(size_t NumElts, const T &Elt) {
383 // Grow manually in case Elt is an internal reference.
384 size_t NewCapacity;
385 T *NewElts = mallocForGrow(NumElts, NewCapacity);
386 std::uninitialized_fill_n(NewElts, NumElts, Elt);
387 this->destroy_range(this->begin(), this->end());
388 takeAllocationForGrow(NewElts, NewCapacity);
389 this->set_size(NumElts);
390 }
391
392 template <typename... ArgTypes> T &growAndEmplaceBack(ArgTypes &&... Args) {
393 // Grow manually in case one of Args is an internal reference.
394 size_t NewCapacity;
395 T *NewElts = mallocForGrow(0, NewCapacity);
396 ::new ((void *)(NewElts + this->size())) T(std::forward<ArgTypes>(Args)...);
397 moveElementsForGrow(NewElts);
398 takeAllocationForGrow(NewElts, NewCapacity);
399 this->set_size(this->size() + 1);
400 return this->back();
401 }
402
403public:
404 void push_back(const T &Elt) {
405 const T *EltPtr = reserveForParamAndGetAddress(Elt);
406 ::new ((void *)this->end()) T(*EltPtr);
407 this->set_size(this->size() + 1);
408 }
409
410 void push_back(T &&Elt) {
411 T *EltPtr = reserveForParamAndGetAddress(Elt);
412 ::new ((void *)this->end()) T(::std::move(*EltPtr));
413 this->set_size(this->size() + 1);
414 }
415
416 void pop_back() {
417 this->set_size(this->size() - 1);
418 this->end()->~T();
419 }
420};
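
To make the reserveForParamAndGetAddress() logic above concrete, a short sketch of the aliasing case it guards against: the element passed to push_back lives inside the vector that is about to grow.

#include "llvm/ADT/SmallVector.h"
#include <string>

void pushBackAliasingElement() {
  llvm::SmallVector<std::string, 2> V;
  V.push_back("a");
  V.push_back("b");
  // V is at inline capacity, so the next push_back reallocates. The argument
  // aliases V's own storage; push_back re-derives the element address after
  // the grow, so the copy is constructed from a still-valid pointer.
  V.push_back(V.front());
}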
421
422// Define this out-of-line to dissuade the C++ compiler from inlining it.
423template <typename T, bool TriviallyCopyable>
424void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) {
425 size_t NewCapacity;
426 T *NewElts = mallocForGrow(MinSize, NewCapacity);
427 moveElementsForGrow(NewElts);
428 takeAllocationForGrow(NewElts, NewCapacity);
429}
430
431// Define this out-of-line to dissuade the C++ compiler from inlining it.
432template <typename T, bool TriviallyCopyable>
433void SmallVectorTemplateBase<T, TriviallyCopyable>::moveElementsForGrow(
434 T *NewElts) {
435 // Move the elements over.
436 this->uninitialized_move(this->begin(), this->end(), NewElts);
437
438 // Destroy the original elements.
439 destroy_range(this->begin(), this->end());
440}
441
442// Define this out-of-line to dissuade the C++ compiler from inlining it.
443template <typename T, bool TriviallyCopyable>
444void SmallVectorTemplateBase<T, TriviallyCopyable>::takeAllocationForGrow(
445 T *NewElts, size_t NewCapacity) {
446 // If this wasn't grown from the inline copy, deallocate the old space.
447 if (!this->isSmall())
448 free(this->begin());
449
450 this->BeginX = NewElts;
451 this->Capacity = NewCapacity;
452}
453
454/// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
455/// method implementations that are designed to work with trivially copyable
456/// T's. This allows using memcpy in place of copy/move construction and
457/// skipping destruction.
458template <typename T>
459class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
460 friend class SmallVectorTemplateCommon<T>;
461
462protected:
463 /// True if it's cheap enough to take parameters by value. Doing so avoids
464 /// overhead related to mitigations for reference invalidation.
465 static constexpr bool TakesParamByValue = sizeof(T) <= 2 * sizeof(void *);
466
467 /// Either const T& or T, depending on whether it's cheap enough to take
468 /// parameters by value.
469 using ValueParamT =
470 typename std::conditional<TakesParamByValue, T, const T &>::type;
471
472 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
473
474 // No need to do a destroy loop for POD's.
475 static void destroy_range(T *, T *) {}
476
477 /// Move the range [I, E) onto the uninitialized memory
478 /// starting with "Dest", constructing elements into it as needed.
479 template<typename It1, typename It2>
480 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
481 // Just do a copy.
482 uninitialized_copy(I, E, Dest);
483 }
484
485 /// Copy the range [I, E) onto the uninitialized memory
486 /// starting with "Dest", constructing elements into it as needed.
487 template<typename It1, typename It2>
488 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
489 // Arbitrary iterator types; just use the basic implementation.
490 std::uninitialized_copy(I, E, Dest);
491 }
492
493 /// Copy the range [I, E) onto the uninitialized memory
494 /// starting with "Dest", constructing elements into it as needed.
495 template <typename T1, typename T2>
496 static void uninitialized_copy(
497 T1 *I, T1 *E, T2 *Dest,
498 std::enable_if_t<std::is_same<typename std::remove_const<T1>::type,
499 T2>::value> * = nullptr) {
500 // Use memcpy for PODs iterated by pointers (which includes SmallVector
501 // iterators): std::uninitialized_copy optimizes to memmove, but we can
502 // use memcpy here. Note that I and E are iterators and thus might be
503 // invalid for memcpy if they are equal.
504 if (I != E)
505 memcpy(reinterpret_cast<void *>(Dest), I, (E - I) * sizeof(T));
506 }
507
508 /// Double the size of the allocated memory, guaranteeing space for at
509 /// least one more element or MinSize if specified.
510 void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); }
511
512 /// Reserve enough space to add one element, and return the updated element
513 /// pointer in case it was a reference to the storage.
514 const T *reserveForParamAndGetAddress(const T &Elt, size_t N = 1) {
515 return this->reserveForParamAndGetAddressImpl(this, Elt, N);
516 }
517
518 /// Reserve enough space to add one element, and return the updated element
519 /// pointer in case it was a reference to the storage.
520 T *reserveForParamAndGetAddress(T &Elt, size_t N = 1) {
521 return const_cast<T *>(
522 this->reserveForParamAndGetAddressImpl(this, Elt, N));
523 }
524
525 /// Copy \p V or return a reference, depending on \a ValueParamT.
526 static ValueParamT forward_value_param(ValueParamT V) { return V; }
527
528 void growAndAssign(size_t NumElts, T Elt) {
529 // Elt has been copied in case it's an internal reference, side-stepping
530 // reference invalidation problems without losing the realloc optimization.
531 this->set_size(0);
532 this->grow(NumElts);
533 std::uninitialized_fill_n(this->begin(), NumElts, Elt);
534 this->set_size(NumElts);
535 }
536
537 template <typename... ArgTypes> T &growAndEmplaceBack(ArgTypes &&... Args) {
538 // Use push_back with a copy in case Args has an internal reference,
539 // side-stepping reference invalidation problems without losing the realloc
540 // optimization.
541 push_back(T(std::forward<ArgTypes>(Args)...));
542 return this->back();
543 }
544
545public:
546 void push_back(ValueParamT Elt) {
547 const T *EltPtr = reserveForParamAndGetAddress(Elt);
548 memcpy(reinterpret_cast<void *>(this->end()), EltPtr, sizeof(T));
549 this->set_size(this->size() + 1);
550 }
551
552 void pop_back() { this->set_size(this->size() - 1); }
553};
554
555/// This class consists of common code factored out of the SmallVector class to
556/// reduce code duplication based on the SmallVector 'N' template parameter.
557template <typename T>
558class SmallVectorImpl : public SmallVectorTemplateBase<T> {
559 using SuperClass = SmallVectorTemplateBase<T>;
560
561public:
562 using iterator = typename SuperClass::iterator;
563 using const_iterator = typename SuperClass::const_iterator;
564 using reference = typename SuperClass::reference;
565 using size_type = typename SuperClass::size_type;
566
567protected:
568 using SmallVectorTemplateBase<T>::TakesParamByValue;
569 using ValueParamT = typename SuperClass::ValueParamT;
570
571 // Default ctor - Initialize to empty.
572 explicit SmallVectorImpl(unsigned N)
573 : SmallVectorTemplateBase<T>(N) {}
574
575public:
576 SmallVectorImpl(const SmallVectorImpl &) = delete;
577
578 ~SmallVectorImpl() {
579 // Subclass has already destructed this vector's elements.
580 // If this wasn't grown from the inline copy, deallocate the old space.
581 if (!this->isSmall())
582 free(this->begin());
583 }
584
585 void clear() {
586 this->destroy_range(this->begin(), this->end());
587 this->Size = 0;
588 }
589
590private:
591 template <bool ForOverwrite> void resizeImpl(size_type N) {
592 if (N < this->size()) {
593 this->pop_back_n(this->size() - N);
594 } else if (N > this->size()) {
595 this->reserve(N);
596 for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
597 if (ForOverwrite)
598 new (&*I) T;
599 else
600 new (&*I) T();
601 this->set_size(N);
602 }
603 }
604
605public:
606 void resize(size_type N) { resizeImpl<false>(N); }
607
608 /// Like resize, but \ref T is POD, the new values won't be initialized.
609 void resize_for_overwrite(size_type N) { resizeImpl<true>(N); }
610
611 void resize(size_type N, ValueParamT NV) {
612 if (N == this->size())
613 return;
614
615 if (N < this->size()) {
616 this->pop_back_n(this->size() - N);
617 return;
618 }
619
620 // N > this->size(). Defer to append.
621 this->append(N - this->size(), NV);
622 }
623
624 void reserve(size_type N) {
625 if (this->capacity() < N)
626 this->grow(N);
627 }
628
629 void pop_back_n(size_type NumItems) {
630 assert(this->size() >= NumItems);
631 this->destroy_range(this->end() - NumItems, this->end());
632 this->set_size(this->size() - NumItems);
633 }
634
635 LLVM_NODISCARD T pop_back_val() {
636 T Result = ::std::move(this->back());
637 this->pop_back();
638 return Result;
639 }
640
641 void swap(SmallVectorImpl &RHS);
642
643 /// Add the specified range to the end of the SmallVector.
644 template <typename in_iter,
645 typename = std::enable_if_t<std::is_convertible<
646 typename std::iterator_traits<in_iter>::iterator_category,
647 std::input_iterator_tag>::value>>
648 void append(in_iter in_start, in_iter in_end) {
649 this->assertSafeToAddRange(in_start, in_end);
650 size_type NumInputs = std::distance(in_start, in_end);
651 this->reserve(this->size() + NumInputs);
652 this->uninitialized_copy(in_start, in_end, this->end());
653 this->set_size(this->size() + NumInputs);
654 }
655
656 /// Append \p NumInputs copies of \p Elt to the end.
657 void append(size_type NumInputs, ValueParamT Elt) {
658 const T *EltPtr = this->reserveForParamAndGetAddress(Elt, NumInputs);
659 std::uninitialized_fill_n(this->end(), NumInputs, *EltPtr);
660 this->set_size(this->size() + NumInputs);
661 }
662
663 void append(std::initializer_list<T> IL) {
664 append(IL.begin(), IL.end());
665 }
666
667 void append(const SmallVectorImpl &RHS) { append(RHS.begin(), RHS.end()); }
668
669 void assign(size_type NumElts, ValueParamT Elt) {
670 // Note that Elt could be an internal reference.
671 if (NumElts > this->capacity()) {
672 this->growAndAssign(NumElts, Elt);
673 return;
674 }
675
676 // Assign over existing elements.
677 std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt);
678 if (NumElts > this->size())
679 std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt);
680 else if (NumElts < this->size())
681 this->destroy_range(this->begin() + NumElts, this->end());
682 this->set_size(NumElts);
683 }
684
685 // FIXME: Consider assigning over existing elements, rather than clearing &
686 // re-initializing them - for all assign(...) variants.
687
688 template <typename in_iter,
689 typename = std::enable_if_t<std::is_convertible<
690 typename std::iterator_traits<in_iter>::iterator_category,
691 std::input_iterator_tag>::value>>
692 void assign(in_iter in_start, in_iter in_end) {
693 this->assertSafeToReferenceAfterClear(in_start, in_end);
694 clear();
695 append(in_start, in_end);
696 }
697
698 void assign(std::initializer_list<T> IL) {
699 clear();
700 append(IL);
701 }
702
703 void assign(const SmallVectorImpl &RHS) { assign(RHS.begin(), RHS.end()); }
704
705 iterator erase(const_iterator CI) {
706 // Just cast away constness because this is a non-const member function.
707 iterator I = const_cast<iterator>(CI);
708
709 assert(this->isReferenceToStorage(CI) && "Iterator to erase is out of bounds.");
710
711 iterator N = I;
712 // Shift all elts down one.
713 std::move(I+1, this->end(), I);
714 // Drop the last elt.
715 this->pop_back();
716 return(N);
717 }
718
719 iterator erase(const_iterator CS, const_iterator CE) {
720 // Just cast away constness because this is a non-const member function.
721 iterator S = const_cast<iterator>(CS);
722 iterator E = const_cast<iterator>(CE);
723
724 assert(this->isRangeInStorage(S, E) && "Range to erase is out of bounds.");
725
726 iterator N = S;
727 // Shift all elts down.
728 iterator I = std::move(E, this->end(), S);
729 // Drop the last elts.
730 this->destroy_range(I, this->end());
731 this->set_size(I - this->begin());
732 return(N);
733 }
734
735private:
736 template <class ArgType> iterator insert_one_impl(iterator I, ArgType &&Elt) {
737 // Callers ensure that ArgType is derived from T.
738 static_assert(
739 std::is_same<std::remove_const_t<std::remove_reference_t<ArgType>>,
740 T>::value,
741 "ArgType must be derived from T!");
742
743 if (I == this->end()) { // Important special case for empty vector.
744 this->push_back(::std::forward<ArgType>(Elt));
745 return this->end()-1;
746 }
747
748 assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds.");
749
750 // Grow if necessary.
751 size_t Index = I - this->begin();
752 std::remove_reference_t<ArgType> *EltPtr =
753 this->reserveForParamAndGetAddress(Elt);
754 I = this->begin() + Index;
755
756 ::new ((void*) this->end()) T(::std::move(this->back()));
757 // Push everything else over.
758 std::move_backward(I, this->end()-1, this->end());
759 this->set_size(this->size() + 1);
760
761 // If we just moved the element we're inserting, be sure to update
762 // the reference (never happens if TakesParamByValue).
763 static_assert(!TakesParamByValue || std::is_same<ArgType, T>::value,
764 "ArgType must be 'T' when taking by value!");
765 if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end()))
766 ++EltPtr;
767
768 *I = ::std::forward<ArgType>(*EltPtr);
769 return I;
770 }
771
772public:
773 iterator insert(iterator I, T &&Elt) {
774 return insert_one_impl(I, this->forward_value_param(std::move(Elt)));
775 }
776
777 iterator insert(iterator I, const T &Elt) {
778 return insert_one_impl(I, this->forward_value_param(Elt));
779 }
780
781 iterator insert(iterator I, size_type NumToInsert, ValueParamT Elt) {
782 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
783 size_t InsertElt = I - this->begin();
784
785 if (I == this->end()) { // Important special case for empty vector.
786 append(NumToInsert, Elt);
787 return this->begin()+InsertElt;
788 }
789
790 assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds.");
791
792 // Ensure there is enough space, and get the (maybe updated) address of
793 // Elt.
794 const T *EltPtr = this->reserveForParamAndGetAddress(Elt, NumToInsert);
795
796 // Uninvalidate the iterator.
797 I = this->begin()+InsertElt;
798
799 // If there are more elements between the insertion point and the end of the
800 // range than there are being inserted, we can use a simple approach to
801 // insertion. Since we already reserved space, we know that this won't
802 // reallocate the vector.
803 if (size_t(this->end()-I) >= NumToInsert) {
804 T *OldEnd = this->end();
805 append(std::move_iterator<iterator>(this->end() - NumToInsert),
806 std::move_iterator<iterator>(this->end()));
807
808 // Copy the existing elements that get replaced.
809 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
810
811 // If we just moved the element we're inserting, be sure to update
812 // the reference (never happens if TakesParamByValue).
813 if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end())
814 EltPtr += NumToInsert;
815
816 std::fill_n(I, NumToInsert, *EltPtr);
817 return I;
818 }
819
820 // Otherwise, we're inserting more elements than exist already, and we're
821 // not inserting at the end.
822
823 // Move over the elements that we're about to overwrite.
824 T *OldEnd = this->end();
825 this->set_size(this->size() + NumToInsert);
826 size_t NumOverwritten = OldEnd-I;
827 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
828
829 // If we just moved the element we're inserting, be sure to update
830 // the reference (never happens if TakesParamByValue).
831 if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end())
832 EltPtr += NumToInsert;
833
834 // Replace the overwritten part.
835 std::fill_n(I, NumOverwritten, *EltPtr);
836
837 // Insert the non-overwritten middle part.
838 std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, *EltPtr);
839 return I;
840 }
841
842 template <typename ItTy,
843 typename = std::enable_if_t<std::is_convertible<
844 typename std::iterator_traits<ItTy>::iterator_category,
845 std::input_iterator_tag>::value>>
846 iterator insert(iterator I, ItTy From, ItTy To) {
847 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
848 size_t InsertElt = I - this->begin();
849
850 if (I == this->end()) { // Important special case for empty vector.
851 append(From, To);
852 return this->begin()+InsertElt;
853 }
854
855 assert(this->isReferenceToStorage(I) && "Insertion iterator is out of bounds.");
856
857 // Check that the reserve that follows doesn't invalidate the iterators.
858 this->assertSafeToAddRange(From, To);
859
860 size_t NumToInsert = std::distance(From, To);
861
862 // Ensure there is enough space.
863 reserve(this->size() + NumToInsert);
864
865 // Uninvalidate the iterator.
866 I = this->begin()+InsertElt;
867
868 // If there are more elements between the insertion point and the end of the
869 // range than there are being inserted, we can use a simple approach to
870 // insertion. Since we already reserved space, we know that this won't
871 // reallocate the vector.
872 if (size_t(this->end()-I) >= NumToInsert) {
873 T *OldEnd = this->end();
874 append(std::move_iterator<iterator>(this->end() - NumToInsert),
875 std::move_iterator<iterator>(this->end()));
876
877 // Copy the existing elements that get replaced.
878 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
879
880 std::copy(From, To, I);
881 return I;
882 }
883
884 // Otherwise, we're inserting more elements than exist already, and we're
885 // not inserting at the end.
886
887 // Move over the elements that we're about to overwrite.
888 T *OldEnd = this->end();
889 this->set_size(this->size() + NumToInsert);
890 size_t NumOverwritten = OldEnd-I;
891 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
892
893 // Replace the overwritten part.
894 for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
895 *J = *From;
896 ++J; ++From;
897 }
898
899 // Insert the non-overwritten middle part.
900 this->uninitialized_copy(From, To, OldEnd);
901 return I;
902 }
903
904 void insert(iterator I, std::initializer_list<T> IL) {
905 insert(I, IL.begin(), IL.end());
906 }
907
908 template <typename... ArgTypes> reference emplace_back(ArgTypes &&... Args) {
909 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
910 return this->growAndEmplaceBack(std::forward<ArgTypes>(Args)...);
911
912 ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
913 this->set_size(this->size() + 1);
914 return this->back();
915 }
916
917 SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
918
919 SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
920
921 bool operator==(const SmallVectorImpl &RHS) const {
922 if (this->size() != RHS.size()) return false;
923 return std::equal(this->begin(), this->end(), RHS.begin());
924 }
925 bool operator!=(const SmallVectorImpl &RHS) const {
926 return !(*this == RHS);
927 }
928
929 bool operator<(const SmallVectorImpl &RHS) const {
930 return std::lexicographical_compare(this->begin(), this->end(),
931 RHS.begin(), RHS.end());
932 }
933};
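
A usage note, sketched: APIs usually take SmallVectorImpl<T>& rather than SmallVector<T, N>& so that callers can pick whatever inline capacity suits them; collectEvens is an invented example function.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

void collectEvens(llvm::ArrayRef<int> In, llvm::SmallVectorImpl<int> &Out) {
  Out.clear();
  for (int X : In)
    if ((X & 1) == 0)
      Out.push_back(X);
}

// Callers choose the inline capacity independently of the callee:
//   llvm::SmallVector<int, 8> Evens;
//   collectEvens(Values, Evens);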
934
935template <typename T>
936void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
937 if (this == &RHS) return;
938
939 // We can only avoid copying elements if neither vector is small.
940 if (!this->isSmall() && !RHS.isSmall()) {
941 std::swap(this->BeginX, RHS.BeginX);
942 std::swap(this->Size, RHS.Size);
943 std::swap(this->Capacity, RHS.Capacity);
944 return;
945 }
946 this->reserve(RHS.size());
947 RHS.reserve(this->size());
948
949 // Swap the shared elements.
950 size_t NumShared = this->size();
951 if (NumShared > RHS.size()) NumShared = RHS.size();
952 for (size_type i = 0; i != NumShared; ++i)
953 std::swap((*this)[i], RHS[i]);
954
955 // Copy over the extra elts.
956 if (this->size() > RHS.size()) {
957 size_t EltDiff = this->size() - RHS.size();
958 this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
959 RHS.set_size(RHS.size() + EltDiff);
960 this->destroy_range(this->begin()+NumShared, this->end());
961 this->set_size(NumShared);
962 } else if (RHS.size() > this->size()) {
963 size_t EltDiff = RHS.size() - this->size();
964 this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
965 this->set_size(this->size() + EltDiff);
966 this->destroy_range(RHS.begin()+NumShared, RHS.end());
967 RHS.set_size(NumShared);
968 }
969}
970
971template <typename T>
972SmallVectorImpl<T> &SmallVectorImpl<T>::
973 operator=(const SmallVectorImpl<T> &RHS) {
974 // Avoid self-assignment.
975 if (this == &RHS) return *this;
976
977 // If we already have sufficient space, assign the common elements, then
978 // destroy any excess.
979 size_t RHSSize = RHS.size();
980 size_t CurSize = this->size();
981 if (CurSize >= RHSSize) {
982 // Assign common elements.
983 iterator NewEnd;
984 if (RHSSize)
985 NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
986 else
987 NewEnd = this->begin();
988
989 // Destroy excess elements.
990 this->destroy_range(NewEnd, this->end());
991
992 // Trim.
993 this->set_size(RHSSize);
994 return *this;
995 }
996
997 // If we have to grow to have enough elements, destroy the current elements.
998 // This allows us to avoid copying them during the grow.
999 // FIXME: don't do this if they're efficiently moveable.
1000 if (this->capacity() < RHSSize) {
1001 // Destroy current elements.
1002 this->clear();
1003 CurSize = 0;
1004 this->grow(RHSSize);
1005 } else if (CurSize) {
1006 // Otherwise, use assignment for the already-constructed elements.
1007 std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
1008 }
1009
1010 // Copy construct the new elements in place.
1011 this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
1012 this->begin()+CurSize);
1013
1014 // Set end.
1015 this->set_size(RHSSize);
1016 return *this;
1017}
1018
1019template <typename T>
1020SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
1021 // Avoid self-assignment.
1022 if (this == &RHS) return *this;
1023
1024 // If the RHS isn't small, clear this vector and then steal its buffer.
1025 if (!RHS.isSmall()) {
1026 this->destroy_range(this->begin(), this->end());
1027 if (!this->isSmall()) free(this->begin());
1028 this->BeginX = RHS.BeginX;
1029 this->Size = RHS.Size;
1030 this->Capacity = RHS.Capacity;
1031 RHS.resetToSmall();
1032 return *this;
1033 }
1034
1035 // If we already have sufficient space, assign the common elements, then
1036 // destroy any excess.
1037 size_t RHSSize = RHS.size();
1038 size_t CurSize = this->size();
1039 if (CurSize >= RHSSize) {
1040 // Assign common elements.
1041 iterator NewEnd = this->begin();
1042 if (RHSSize)
1043 NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
1044
1045 // Destroy excess elements and trim the bounds.
1046 this->destroy_range(NewEnd, this->end());
1047 this->set_size(RHSSize);
1048
1049 // Clear the RHS.
1050 RHS.clear();
1051
1052 return *this;
1053 }
1054
1055 // If we have to grow to have enough elements, destroy the current elements.
1056 // This allows us to avoid copying them during the grow.
1057 // FIXME: this may not actually make any sense if we can efficiently move
1058 // elements.
1059 if (this->capacity() < RHSSize) {
1060 // Destroy current elements.
1061 this->clear();
1062 CurSize = 0;
1063 this->grow(RHSSize);
1064 } else if (CurSize) {
1065 // Otherwise, use assignment for the already-constructed elements.
1066 std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
1067 }
1068
1069 // Move-construct the new elements in place.
1070 this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
1071 this->begin()+CurSize);
1072
1073 // Set end.
1074 this->set_size(RHSSize);
1075
1076 RHS.clear();
1077 return *this;
1078}
1079
1080/// Storage for the SmallVector elements. This is specialized for the N=0 case
1081/// to avoid allocating unnecessary storage.
1082template <typename T, unsigned N>
1083struct SmallVectorStorage {
1084 alignas(T) char InlineElts[N * sizeof(T)];
1085};
1086
1087/// We need the storage to be properly aligned even for small-size of 0 so that
1088/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
1089/// well-defined.
1090template <typename T> struct alignas(T) SmallVectorStorage<T, 0> {};
1091
1092/// Forward declaration of SmallVector so that
1093/// calculateSmallVectorDefaultInlinedElements can reference
1094/// `sizeof(SmallVector<T, 0>)`.
1095 template <typename T, unsigned N> class LLVM_GSL_OWNER SmallVector;
1096
1097/// Helper class for calculating the default number of inline elements for
1098/// `SmallVector<T>`.
1099///
1100/// This should be migrated to a constexpr function when our minimum
1101/// compiler support is enough for multi-statement constexpr functions.
1102template <typename T> struct CalculateSmallVectorDefaultInlinedElements {
1103 // Parameter controlling the default number of inlined elements
1104 // for `SmallVector<T>`.
1105 //
1106 // The default number of inlined elements ensures that
1107 // 1. There is at least one inlined element.
1108 // 2. `sizeof(SmallVector<T>) <= kPreferredSmallVectorSizeof` unless
1109 // it contradicts 1.
1110 static constexpr size_t kPreferredSmallVectorSizeof = 64;
1111
1112 // static_assert that sizeof(T) is not "too big".
1113 //
1114 // Because our policy guarantees at least one inlined element, it is possible
1115 // for an arbitrarily large inlined element to allocate an arbitrarily large
1116 // amount of inline storage. We generally consider it an antipattern for a
1117 // SmallVector to allocate an excessive amount of inline storage, so we want
1118 // to call attention to these cases and make sure that users are making an
1119 // intentional decision if they request a lot of inline storage.
1120 //
1121 // We want this assertion to trigger in pathological cases, but otherwise
1122 // not be too easy to hit. To accomplish that, the cutoff is actually somewhat
1123 // larger than kPreferredSmallVectorSizeof (otherwise,
1124 // `SmallVector<SmallVector<T>>` would be one easy way to trip it, and that
1125 // pattern seems useful in practice).
1126 //
1127 // One wrinkle is that this assertion is in theory non-portable, since
1128 // sizeof(T) is in general platform-dependent. However, we don't expect this
1129 // to be much of an issue, because most LLVM development happens on 64-bit
1130 // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for
1131 // 32-bit hosts, dodging the issue. The reverse situation, where development
1132 // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a
1133 // 64-bit host, is expected to be very rare.
1134 static_assert(
1135 sizeof(T) <= 256,
1136 "You are trying to use a default number of inlined elements for "
1137 "`SmallVector<T>` but `sizeof(T)` is really big! Please use an "
1138 "explicit number of inlined elements with `SmallVector<T, N>` to make "
1139 "sure you really want that much inline storage.");
1140
1141 // Discount the size of the header itself when calculating the maximum inline
1142 // bytes.
1143 static constexpr size_t PreferredInlineBytes =
1144 kPreferredSmallVectorSizeof - sizeof(SmallVector<T, 0>);
1145 static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T);
1146 static constexpr size_t value =
1147 NumElementsThatFit == 0 ? 1 : NumElementsThatFit;
1148};
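As a rough worked example (assuming a typical 64-bit host, where sizeof(SmallVector<T, 0>) is 16 bytes: one pointer plus two 32-bit size fields), the 64-byte budget leaves 48 inline bytes, so the default is 12 elements for a 4-byte T and 6 for an 8-byte T:

  // Sketch only; the exact values are platform-dependent.
  constexpr size_t DefaultIntElts = llvm::CalculateSmallVectorDefaultInlinedElements<int>::value;    // typically 12
  constexpr size_t DefaultPtrElts = llvm::CalculateSmallVectorDefaultInlinedElements<void *>::value; // typically 6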
1149
1150/// This is a 'vector' (really, a variable-sized array), optimized
1151/// for the case when the array is small. It contains some number of elements
1152/// in-place, which allows it to avoid heap allocation when the actual number of
1153/// elements is below that threshold. This allows normal "small" cases to be
1154/// fast without losing generality for large inputs.
1155///
1156/// \note
1157/// In the absence of a well-motivated choice for the number of inlined
1158/// elements \p N, it is recommended to use \c SmallVector<T> (that is,
1159/// omitting the \p N). This will choose a default number of inlined elements
1160/// reasonable for allocation on the stack (for example, trying to keep \c
1161/// sizeof(SmallVector<T>) around 64 bytes).
1162///
1163/// \warning This does not attempt to be exception safe.
1164///
1165/// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h
1166template <typename T,
1167 unsigned N = CalculateSmallVectorDefaultInlinedElements<T>::value>
1168class LLVM_GSL_OWNER SmallVector : public SmallVectorImpl<T>,
1169 SmallVectorStorage<T, N> {
1170public:
1171 SmallVector() : SmallVectorImpl<T>(N) {}
1172
1173 ~SmallVector() {
1174 // Destroy the constructed elements in the vector.
1175 this->destroy_range(this->begin(), this->end());
1176 }
1177
1178 explicit SmallVector(size_t Size, const T &Value = T())
1179 : SmallVectorImpl<T>(N) {
1180 this->assign(Size, Value);
1181 }
1182
1183 template <typename ItTy,
1184 typename = std::enable_if_t<std::is_convertible<
1185 typename std::iterator_traits<ItTy>::iterator_category,
1186 std::input_iterator_tag>::value>>
1187 SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
1188 this->append(S, E);
1189 }
1190
1191 template <typename RangeTy>
1192 explicit SmallVector(const iterator_range<RangeTy> &R)
1193 : SmallVectorImpl<T>(N) {
1194 this->append(R.begin(), R.end());
1195 }
1196
1197 SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
1198 this->assign(IL);
1199 }
1200
1201 SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
1202 if (!RHS.empty())
1203 SmallVectorImpl<T>::operator=(RHS);
1204 }
1205
1206 SmallVector &operator=(const SmallVector &RHS) {
1207 SmallVectorImpl<T>::operator=(RHS);
1208 return *this;
1209 }
1210
1211 SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
1212 if (!RHS.empty())
1213 SmallVectorImpl<T>::operator=(::std::move(RHS));
1214 }
1215
1216 SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
1217 if (!RHS.empty())
1218 SmallVectorImpl<T>::operator=(::std::move(RHS));
1219 }
1220
1221 SmallVector &operator=(SmallVector &&RHS) {
1222 SmallVectorImpl<T>::operator=(::std::move(RHS));
1223 return *this;
1224 }
1225
1226 SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
1227 SmallVectorImpl<T>::operator=(::std::move(RHS));
1228 return *this;
1229 }
1230
1231 SmallVector &operator=(std::initializer_list<T> IL) {
1232 this->assign(IL);
1233 return *this;
1234 }
1235};
1236
1237template <typename T, unsigned N>
1238inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
1239 return X.capacity_in_bytes();
1240}
1241
1242/// Given a range of type R, iterate the entire range and return a
1243/// SmallVector containing the elements of the range. This is useful, for
1244/// example, when you want to iterate a range and then sort the results.
1245template <unsigned Size, typename R>
1246SmallVector<typename std::remove_const<typename std::remove_reference<
1247 decltype(*std::begin(std::declval<R &>()))>::type>::type,
1248 Size>
1249to_vector(R &&Range) {
1250 return {std::begin(Range), std::end(Range)};
1251}
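A minimal usage sketch (assuming the usual <vector> and <algorithm> headers are available): materialize a range into a SmallVector, then sort it.

  std::vector<int> Src = {3, 1, 2};
  auto Vec = llvm::to_vector<4>(Src);   // llvm::SmallVector<int, 4> holding {3, 1, 2}
  std::sort(Vec.begin(), Vec.end());    // now {1, 2, 3}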
1252
1253} // end namespace llvm
1254
1255namespace std {
1256
1257 /// Implement std::swap in terms of SmallVector swap.
1258 template<typename T>
1259 inline void
1260 swap(llvm::SmallVectorImpl<T> &LHS, llvm::SmallVectorImpl<T> &RHS) {
1261 LHS.swap(RHS);
1262 }
1263
1264 /// Implement std::swap in terms of SmallVector swap.
1265 template<typename T, unsigned N>
1266 inline void
1267 swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
1268 LHS.swap(RHS);
1269 }
1270
1271} // end namespace std
1272
1273#endif // LLVM_ADT_SMALLVECTOR_H

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/IR/DerivedTypes.h

1//===- llvm/DerivedTypes.h - Classes for handling data types ----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the declarations of classes that represent "derived
10// types". These are things like "arrays of x" or "structure of x, y, z" or
11// "function returning x taking (y,z) as parameters", etc...
12//
13// The implementations of these classes live in the Type.cpp file.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_IR_DERIVEDTYPES_H
18#define LLVM_IR_DERIVEDTYPES_H
19
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/StringRef.h"
23#include "llvm/IR/Type.h"
24#include "llvm/Support/Casting.h"
25#include "llvm/Support/Compiler.h"
26#include "llvm/Support/TypeSize.h"
27#include <cassert>
28#include <cstdint>
29
30namespace llvm {
31
32class Value;
33class APInt;
34class LLVMContext;
35
36/// Class to represent integer types. Note that this class is also used to
37/// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and
38/// Int64Ty.
39/// Integer representation type
40class IntegerType : public Type {
41 friend class LLVMContextImpl;
42
43protected:
44 explicit IntegerType(LLVMContext &C, unsigned NumBits) : Type(C, IntegerTyID){
45 setSubclassData(NumBits);
46 }
47
48public:
49 /// This enum is just used to hold constants we need for IntegerType.
50 enum {
51 MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified
52 MAX_INT_BITS = (1<<24)-1 ///< Maximum number of bits that can be specified
53 ///< Note that bit width is stored in the Type classes SubclassData field
54 ///< which has 24 bits. This yields a maximum bit width of 16,777,215
55 ///< bits.
56 };
57
58 /// This static method is the primary way of constructing an IntegerType.
59 /// If an IntegerType with the same NumBits value was previously instantiated,
60 /// that instance will be returned. Otherwise a new one will be created. Only
61 /// one instance with a given NumBits value is ever created.
62 /// Get or create an IntegerType instance.
63 static IntegerType *get(LLVMContext &C, unsigned NumBits);
64
65 /// Returns a type twice as wide as the input type.
66 IntegerType *getExtendedType() const {
67 return Type::getIntNTy(getContext(), 2 * getScalarSizeInBits());
68 }
69
70 /// Get the number of bits in this IntegerType
71 unsigned getBitWidth() const { return getSubclassData(); }
72
73 /// Return a bitmask with ones set for all of the bits that can be set by an
74 /// unsigned version of this type. This is 0xFF for i8, 0xFFFF for i16, etc.
75 uint64_t getBitMask() const {
76 return ~uint64_t(0UL) >> (64-getBitWidth());
77 }
78
79 /// Return a uint64_t with just the most significant bit set (the sign bit, if
80 /// the value is treated as a signed number).
81 uint64_t getSignBit() const {
82 return 1ULL << (getBitWidth()-1);
83 }
84
85 /// For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
86 /// @returns a bit mask with ones set for all the bits of this type.
87 /// Get a bit mask for this type.
88 APInt getMask() const;
89
90 /// Methods for support type inquiry through isa, cast, and dyn_cast.
91 static bool classof(const Type *T) {
92 return T->getTypeID() == IntegerTyID;
93 }
94};
95
96unsigned Type::getIntegerBitWidth() const {
97 return cast<IntegerType>(this)->getBitWidth();
98}
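A brief sketch of the accessors above (Ctx is an assumed llvm::LLVMContext); because integer types are uniqued, repeated calls with the same width return the same pointer.

  llvm::IntegerType *I16 = llvm::IntegerType::get(Ctx, 16); // same pointer for every i16 in Ctx
  unsigned Bits = I16->getBitWidth();   // 16
  uint64_t Mask = I16->getBitMask();    // 0xFFFF
  uint64_t Sign = I16->getSignBit();    // 0x8000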
99
100/// Class to represent function types
101///
102class FunctionType : public Type {
103 FunctionType(Type *Result, ArrayRef<Type*> Params, bool IsVarArgs);
104
105public:
106 FunctionType(const FunctionType &) = delete;
107 FunctionType &operator=(const FunctionType &) = delete;
108
109 /// This static method is the primary way of constructing a FunctionType.
110 static FunctionType *get(Type *Result,
111 ArrayRef<Type*> Params, bool isVarArg);
112
113 /// Create a FunctionType taking no parameters.
114 static FunctionType *get(Type *Result, bool isVarArg);
115
116 /// Return true if the specified type is valid as a return type.
117 static bool isValidReturnType(Type *RetTy);
118
119 /// Return true if the specified type is valid as an argument type.
120 static bool isValidArgumentType(Type *ArgTy);
121
122 bool isVarArg() const { return getSubclassData()!=0; }
123 Type *getReturnType() const { return ContainedTys[0]; }
124
125 using param_iterator = Type::subtype_iterator;
126
127 param_iterator param_begin() const { return ContainedTys + 1; }
128 param_iterator param_end() const { return &ContainedTys[NumContainedTys]; }
129 ArrayRef<Type *> params() const {
130 return makeArrayRef(param_begin(), param_end());
131 }
132
133 /// Parameter type accessors.
134 Type *getParamType(unsigned i) const { return ContainedTys[i+1]; }
135
136 /// Return the number of fixed parameters this function type requires.
137 /// This does not consider varargs.
138 unsigned getNumParams() const { return NumContainedTys - 1; }
139
140 /// Methods for support type inquiry through isa, cast, and dyn_cast.
141 static bool classof(const Type *T) {
142 return T->getTypeID() == FunctionTyID;
143 }
144};
145static_assert(alignof(FunctionType) >= alignof(Type *),
146 "Alignment sufficient for objects appended to FunctionType");
147
148bool Type::isFunctionVarArg() const {
149 return cast<FunctionType>(this)->isVarArg();
150}
151
152Type *Type::getFunctionParamType(unsigned i) const {
153 return cast<FunctionType>(this)->getParamType(i);
154}
155
156unsigned Type::getFunctionNumParams() const {
157 return cast<FunctionType>(this)->getNumParams();
158}
159
160/// A handy container for a FunctionType+Callee-pointer pair, which can be
161/// passed around as a single entity. This assists in replacing the use of
162/// PointerType::getElementType() to access the function's type, since that's
163/// slated for removal as part of the [opaque pointer types] project.
164class FunctionCallee {
165public:
166 // Allow implicit conversion from types which have a getFunctionType member
167 // (e.g. Function and InlineAsm).
168 template <typename T, typename U = decltype(&T::getFunctionType)>
169 FunctionCallee(T *Fn)
170 : FnTy(Fn ? Fn->getFunctionType() : nullptr), Callee(Fn) {}
171
172 FunctionCallee(FunctionType *FnTy, Value *Callee)
173 : FnTy(FnTy), Callee(Callee) {
174 assert((FnTy == nullptr) == (Callee == nullptr));
175 }
176
177 FunctionCallee(std::nullptr_t) {}
178
179 FunctionCallee() = default;
180
181 FunctionType *getFunctionType() { return FnTy; }
182
183 Value *getCallee() { return Callee; }
184
185 explicit operator bool() { return Callee; }
186
187private:
188 FunctionType *FnTy = nullptr;
189 Value *Callee = nullptr;
190};
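A small sketch of how the pair is carried around; FTy is the FunctionType from the previous example and CalleeV is a hypothetical llvm::Value* for the callee (both members must be null or both non-null):

  llvm::FunctionCallee Callee(FTy, CalleeV);
  if (Callee) {                                  // tests the callee pointer
    llvm::FunctionType *CalleeTy = Callee.getFunctionType();
    llvm::Value *CalleeVal = Callee.getCallee();
  }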
191
192/// Class to represent struct types. There are two different kinds of struct
193/// types: Literal structs and Identified structs.
194///
195/// Literal struct types (e.g. { i32, i32 }) are uniqued structurally, and must
196/// always have a body when created. You can get one of these by using one of
197/// the StructType::get() forms.
198///
199/// Identified structs (e.g. %foo or %42) may optionally have a name and are not
200/// uniqued. The names for identified structs are managed at the LLVMContext
201/// level, so there can only be a single identified struct with a given name in
202/// a particular LLVMContext. Identified structs may also optionally be opaque
203/// (have no body specified). You get one of these by using one of the
204/// StructType::create() forms.
205///
206/// Independent of what kind of struct you have, the body of a struct type is
207/// laid out in memory consecutively with the elements directly one after the
208/// other (if the struct is packed) or (if not packed) with padding between the
209/// elements as defined by DataLayout (which is required to match what the code
210/// generator for a target expects).
211///
212class StructType : public Type {
213 StructType(LLVMContext &C) : Type(C, StructTyID) {}
214
215 enum {
216 /// This is the contents of the SubClassData field.
217 SCDB_HasBody = 1,
218 SCDB_Packed = 2,
219 SCDB_IsLiteral = 4,
220 SCDB_IsSized = 8
221 };
222
223 /// For a named struct that actually has a name, this is a pointer to the
224 /// symbol table entry (maintained by LLVMContext) for the struct.
225 /// This is null if the type is a literal struct or if it is an identified
226 /// type that has an empty name.
227 void *SymbolTableEntry = nullptr;
228
229public:
230 StructType(const StructType &) = delete;
231 StructType &operator=(const StructType &) = delete;
232
233 /// This creates an identified struct.
234 static StructType *create(LLVMContext &Context, StringRef Name);
235 static StructType *create(LLVMContext &Context);
236
237 static StructType *create(ArrayRef<Type *> Elements, StringRef Name,
238 bool isPacked = false);
239 static StructType *create(ArrayRef<Type *> Elements);
240 static StructType *create(LLVMContext &Context, ArrayRef<Type *> Elements,
241 StringRef Name, bool isPacked = false);
242 static StructType *create(LLVMContext &Context, ArrayRef<Type *> Elements);
243 template <class... Tys>
244 static std::enable_if_t<are_base_of<Type, Tys...>::value, StructType *>
245 create(StringRef Name, Type *elt1, Tys *... elts) {
246 assert(elt1 && "Cannot create a struct type with no elements with this");
247 return create(ArrayRef<Type *>({elt1, elts...}), Name);
248 }
249
250 /// This static method is the primary way to create a literal StructType.
251 static StructType *get(LLVMContext &Context, ArrayRef<Type*> Elements,
252 bool isPacked = false);
253
254 /// Create an empty structure type.
255 static StructType *get(LLVMContext &Context, bool isPacked = false);
256
257 /// This static method is a convenience method for creating structure types by
258 /// specifying the elements as arguments. Note that this method always returns
259 /// a non-packed struct, and requires at least one element type.
260 template <class... Tys>
261 static std::enable_if_t<are_base_of<Type, Tys...>::value, StructType *>
262 get(Type *elt1, Tys *... elts) {
263 assert(elt1 && "Cannot create a struct type with no elements with this");
264 LLVMContext &Ctx = elt1->getContext();
265 return StructType::get(Ctx, ArrayRef<Type *>({elt1, elts...}));
266 }
267
268 /// Return the type with the specified name, or null if there is none by that
269 /// name.
270 static StructType *getTypeByName(LLVMContext &C, StringRef Name);
271
272 bool isPacked() const { return (getSubclassData() & SCDB_Packed) != 0; }
273
274 /// Return true if this type is uniqued by structural equivalence, false if it
275 /// is a struct definition.
276 bool isLiteral() const { return (getSubclassData() & SCDB_IsLiteral) != 0; }
277
278 /// Return true if this is a type with an identity that has no body specified
279 /// yet. These print as 'opaque' in .ll files.
280 bool isOpaque() const { return (getSubclassData() & SCDB_HasBody) == 0; }
281
282 /// isSized - Return true if this is a sized type.
283 bool isSized(SmallPtrSetImpl<Type *> *Visited = nullptr) const;
284
285 /// Returns true if this struct contains a scalable vector.
286 bool containsScalableVectorType() const;
287
288 /// Return true if this is a named struct that has a non-empty name.
289 bool hasName() const { return SymbolTableEntry != nullptr; }
290
291 /// Return the name for this struct type if it has an identity.
292 /// This may return an empty string for an unnamed struct type. Do not call
293 /// this on a literal type.
294 StringRef getName() const;
295
296 /// Change the name of this type to the specified name, or to a name with a
297 /// suffix if there is a collision. Do not call this on a literal type.
298 void setName(StringRef Name);
299
300 /// Specify a body for an opaque identified type.
301 void setBody(ArrayRef<Type*> Elements, bool isPacked = false);
302
303 template <typename... Tys>
304 std::enable_if_t<are_base_of<Type, Tys...>::value, void>
305 setBody(Type *elt1, Tys *... elts) {
306 assert(elt1 && "Cannot create a struct type with no elements with this");
307 setBody(ArrayRef<Type *>({elt1, elts...}));
308 }
309
310 /// Return true if the specified type is valid as an element type.
311 static bool isValidElementType(Type *ElemTy);
312
313 // Iterator access to the elements.
314 using element_iterator = Type::subtype_iterator;
315
316 element_iterator element_begin() const { return ContainedTys; }
317 element_iterator element_end() const { return &ContainedTys[NumContainedTys];}
318 ArrayRef<Type *> elements() const {
319 return makeArrayRef(element_begin(), element_end());
320 }
321
322 /// Return true if this is layout identical to the specified struct.
323 bool isLayoutIdentical(StructType *Other) const;
324
325 /// Random access to the elements
326 unsigned getNumElements() const { return NumContainedTys; }
327 Type *getElementType(unsigned N) const {
328 assert(N < NumContainedTys && "Element number out of range!");
329 return ContainedTys[N];
330 }
331 /// Given an index value into the type, return the type of the element.
332 Type *getTypeAtIndex(const Value *V) const;
333 Type *getTypeAtIndex(unsigned N) const { return getElementType(N); }
334 bool indexValid(const Value *V) const;
335 bool indexValid(unsigned Idx) const { return Idx < getNumElements(); }
336
337 /// Methods for support type inquiry through isa, cast, and dyn_cast.
338 static bool classof(const Type *T) {
339 return T->getTypeID() == StructTyID;
340 }
341};
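To make the literal/identified distinction concrete (Ctx is an assumed llvm::LLVMContext): literal structs are built with get() and uniqued structurally, while identified structs are created by name, may start opaque, and receive a body later.

  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::StructType *Pair = llvm::StructType::get(Ctx, {I32, I32});  // literal { i32, i32 }
  llvm::StructType *Node = llvm::StructType::create(Ctx, "Node");   // identified %Node, opaque for now
  Node->setBody({I32, llvm::PointerType::getUnqual(Node)});         // %Node = type { i32, %Node* }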
342
343StringRef Type::getStructName() const {
344 return cast<StructType>(this)->getName();
345}
346
347unsigned Type::getStructNumElements() const {
348 return cast<StructType>(this)->getNumElements();
349}
350
351Type *Type::getStructElementType(unsigned N) const {
352 return cast<StructType>(this)->getElementType(N);
353}
354
355/// Class to represent array types.
356class ArrayType : public Type {
357 /// The element type of the array.
358 Type *ContainedType;
359 /// Number of elements in the array.
360 uint64_t NumElements;
361
362 ArrayType(Type *ElType, uint64_t NumEl);
363
364public:
365 ArrayType(const ArrayType &) = delete;
366 ArrayType &operator=(const ArrayType &) = delete;
367
368 uint64_t getNumElements() const { return NumElements; }
369 Type *getElementType() const { return ContainedType; }
370
371 /// This static method is the primary way to construct an ArrayType
372 static ArrayType *get(Type *ElementType, uint64_t NumElements);
373
374 /// Return true if the specified type is valid as an element type.
375 static bool isValidElementType(Type *ElemTy);
376
377 /// Methods for support type inquiry through isa, cast, and dyn_cast.
378 static bool classof(const Type *T) {
379 return T->getTypeID() == ArrayTyID;
380 }
381};
382
383uint64_t Type::getArrayNumElements() const {
384 return cast<ArrayType>(this)->getNumElements();
385}
386
387/// Base class of all SIMD vector types
388class VectorType : public Type {
389 /// A fully specified VectorType is of the form <vscale x n x Ty>. 'n' is the
390 /// minimum number of elements of type Ty contained within the vector, and
391 /// 'vscale x' indicates that the total element count is an integer multiple
392 /// of 'n', where the multiple is either guaranteed to be one, or is
393 /// statically unknown at compile time.
394 ///
395 /// If the multiple is known to be 1, then the extra term is discarded in
396 /// textual IR:
397 ///
398 /// <4 x i32> - a vector containing 4 i32s
399 /// <vscale x 4 x i32> - a vector containing an unknown integer multiple
400 /// of 4 i32s
401
402 /// The element type of the vector.
403 Type *ContainedType;
404
405protected:
406 /// The element quantity of this vector. The meaning of this value depends
407 /// on the type of vector:
408 /// - For FixedVectorType = <ElementQuantity x ty>, there are
409 /// exactly ElementQuantity elements in this vector.
410 /// - For ScalableVectorType = <vscale x ElementQuantity x ty>,
411 /// there are vscale * ElementQuantity elements in this vector, where
412 /// vscale is a runtime-constant integer greater than 0.
413 const unsigned ElementQuantity;
414
415 VectorType(Type *ElType, unsigned EQ, Type::TypeID TID);
416
417public:
418 VectorType(const VectorType &) = delete;
419 VectorType &operator=(const VectorType &) = delete;
420
421 Type *getElementType() const { return ContainedType; }
422
423 /// This static method is the primary way to construct a VectorType.
424 static VectorType *get(Type *ElementType, ElementCount EC);
425
426 static VectorType *get(Type *ElementType, unsigned NumElements,
427 bool Scalable) {
428 return VectorType::get(ElementType,
429 ElementCount::get(NumElements, Scalable));
430 }
431
432 static VectorType *get(Type *ElementType, const VectorType *Other) {
433 return VectorType::get(ElementType, Other->getElementCount());
434 }
435
436 /// This static method gets a VectorType with the same number of elements as
437 /// the input type, and the element type is an integer type of the same width
438 /// as the input element type.
439 static VectorType *getInteger(VectorType *VTy) {
440 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
441 assert(EltBits && "Element size must be of a non-zero size");
442 Type *EltTy = IntegerType::get(VTy->getContext(), EltBits);
443 return VectorType::get(EltTy, VTy->getElementCount());
444 }
445
446 /// This static method is like getInteger except that the element types are
447 /// twice as wide as the elements in the input type.
448 static VectorType *getExtendedElementVectorType(VectorType *VTy) {
449 assert(VTy->isIntOrIntVectorTy() && "VTy expected to be a vector of ints.");
450 auto *EltTy = cast<IntegerType>(VTy->getElementType());
451 return VectorType::get(EltTy->getExtendedType(), VTy->getElementCount());
452 }
453
454 // This static method gets a VectorType with the same number of elements as
455 // the input type, and the element type is an integer or float type which
456 // is half as wide as the elements in the input type.
457 static VectorType *getTruncatedElementVectorType(VectorType *VTy) {
458 Type *EltTy;
459 if (VTy->getElementType()->isFloatingPointTy()) {
460 switch(VTy->getElementType()->getTypeID()) {
461 case DoubleTyID:
462 EltTy = Type::getFloatTy(VTy->getContext());
463 break;
464 case FloatTyID:
465 EltTy = Type::getHalfTy(VTy->getContext());
466 break;
467 default:
468 llvm_unreachable("Cannot create narrower fp vector element type");
469 }
470 } else {
471 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
472 assert((EltBits & 1) == 0 &&
473 "Cannot truncate vector element with odd bit-width");
474 EltTy = IntegerType::get(VTy->getContext(), EltBits / 2);
475 }
476 return VectorType::get(EltTy, VTy->getElementCount());
477 }
478
479 // This static method returns a VectorType with a smaller number of elements
480 // of a larger type than the input element type. For example, a <16 x i8>
481 // subdivided twice would return <4 x i32>
482 static VectorType *getSubdividedVectorType(VectorType *VTy, int NumSubdivs) {
483 for (int i = 0; i < NumSubdivs; ++i) {
484 VTy = VectorType::getDoubleElementsVectorType(VTy);
485 VTy = VectorType::getTruncatedElementVectorType(VTy);
486 }
487 return VTy;
488 }
489
490 /// This static method returns a VectorType with half as many elements as the
491 /// input type and the same element type.
492 static VectorType *getHalfElementsVectorType(VectorType *VTy) {
493 auto EltCnt = VTy->getElementCount();
494 assert(EltCnt.isKnownEven() &&
495 "Cannot halve vector with odd number of elements.");
496 return VectorType::get(VTy->getElementType(),
497 EltCnt.divideCoefficientBy(2));
498 }
499
500 /// This static method returns a VectorType with twice as many elements as the
501 /// input type and the same element type.
502 static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
503 auto EltCnt = VTy->getElementCount();
504 assert((EltCnt.getKnownMinValue() * 2ull) <= UINT_MAX &&
505 "Too many elements in vector");
506 return VectorType::get(VTy->getElementType(), EltCnt * 2);
507 }
508
509 /// Return true if the specified type is valid as an element type.
510 static bool isValidElementType(Type *ElemTy);
511
512 /// Return an ElementCount instance to represent the (possibly scalable)
513 /// number of elements in the vector.
514 inline ElementCount getElementCount() const;
515
516 /// Methods for support type inquiry through isa, cast, and dyn_cast.
517 static bool classof(const Type *T) {
518 return T->getTypeID() == FixedVectorTyID ||
519 T->getTypeID() == ScalableVectorTyID;
520 }
521};
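A short sketch of the two element-count flavours (Ctx is an assumed llvm::LLVMContext):

  // <4 x float>: element count fixed at compile time.
  llvm::VectorType *V4F32 =
      llvm::VectorType::get(llvm::Type::getFloatTy(Ctx), llvm::ElementCount::getFixed(4));
  // <vscale x 2 x i64>: total element count is vscale * 2, known only at run time.
  llvm::VectorType *NxV2I64 =
      llvm::VectorType::get(llvm::Type::getInt64Ty(Ctx), llvm::ElementCount::getScalable(2));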
522
523/// Class to represent fixed width SIMD vectors
524class FixedVectorType : public VectorType {
525protected:
526 FixedVectorType(Type *ElTy, unsigned NumElts)
527 : VectorType(ElTy, NumElts, FixedVectorTyID) {}
528
529public:
530 static FixedVectorType *get(Type *ElementType, unsigned NumElts);
531
532 static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) {
533 return get(ElementType, FVTy->getNumElements());
534 }
535
536 static FixedVectorType *getInteger(FixedVectorType *VTy) {
537 return cast<FixedVectorType>(VectorType::getInteger(VTy));
538 }
539
540 static FixedVectorType *getExtendedElementVectorType(FixedVectorType *VTy) {
541 return cast<FixedVectorType>(VectorType::getExtendedElementVectorType(VTy));
542 }
543
544 static FixedVectorType *getTruncatedElementVectorType(FixedVectorType *VTy) {
545 return cast<FixedVectorType>(
546 VectorType::getTruncatedElementVectorType(VTy));
547 }
548
549 static FixedVectorType *getSubdividedVectorType(FixedVectorType *VTy,
550 int NumSubdivs) {
551 return cast<FixedVectorType>(
552 VectorType::getSubdividedVectorType(VTy, NumSubdivs));
553 }
554
555 static FixedVectorType *getHalfElementsVectorType(FixedVectorType *VTy) {
556 return cast<FixedVectorType>(VectorType::getHalfElementsVectorType(VTy));
557 }
558
559 static FixedVectorType *getDoubleElementsVectorType(FixedVectorType *VTy) {
560 return cast<FixedVectorType>(VectorType::getDoubleElementsVectorType(VTy));
561 }
562
563 static bool classof(const Type *T) {
564 return T->getTypeID() == FixedVectorTyID;
565 }
566
567 unsigned getNumElements() const { return ElementQuantity; }
568};
569
570/// Class to represent scalable SIMD vectors
571class ScalableVectorType : public VectorType {
572protected:
573 ScalableVectorType(Type *ElTy, unsigned MinNumElts)
574 : VectorType(ElTy, MinNumElts, ScalableVectorTyID) {}
575
576public:
577 static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts);
578
579 static ScalableVectorType *get(Type *ElementType,
580 const ScalableVectorType *SVTy) {
581 return get(ElementType, SVTy->getMinNumElements());
582 }
583
584 static ScalableVectorType *getInteger(ScalableVectorType *VTy) {
585 return cast<ScalableVectorType>(VectorType::getInteger(VTy));
586 }
587
588 static ScalableVectorType *
589 getExtendedElementVectorType(ScalableVectorType *VTy) {
590 return cast<ScalableVectorType>(
591 VectorType::getExtendedElementVectorType(VTy));
592 }
593
594 static ScalableVectorType *
595 getTruncatedElementVectorType(ScalableVectorType *VTy) {
596 return cast<ScalableVectorType>(
597 VectorType::getTruncatedElementVectorType(VTy));
598 }
599
600 static ScalableVectorType *getSubdividedVectorType(ScalableVectorType *VTy,
601 int NumSubdivs) {
602 return cast<ScalableVectorType>(
603 VectorType::getSubdividedVectorType(VTy, NumSubdivs));
604 }
605
606 static ScalableVectorType *
607 getHalfElementsVectorType(ScalableVectorType *VTy) {
608 return cast<ScalableVectorType>(VectorType::getHalfElementsVectorType(VTy));
609 }
610
611 static ScalableVectorType *
612 getDoubleElementsVectorType(ScalableVectorType *VTy) {
613 return cast<ScalableVectorType>(
614 VectorType::getDoubleElementsVectorType(VTy));
615 }
616
617 /// Get the minimum number of elements in this vector. The actual number of
618 /// elements in the vector is an integer multiple of this value.
619 uint64_t getMinNumElements() const { return ElementQuantity; }
620
621 static bool classof(const Type *T) {
622 return T->getTypeID() == ScalableVectorTyID;
623 }
624};
625
626inline ElementCount VectorType::getElementCount() const {
627 return ElementCount::get(ElementQuantity, isa<ScalableVectorType>(this));
628}
629
630/// Class to represent pointers.
631class PointerType : public Type {
632 explicit PointerType(Type *ElType, unsigned AddrSpace);
633 explicit PointerType(LLVMContext &C, unsigned AddrSpace);
634
635 Type *PointeeTy;
636
637public:
638 PointerType(const PointerType &) = delete;
639 PointerType &operator=(const PointerType &) = delete;
640
641 /// This constructs a pointer to an object of the specified type in a numbered
642 /// address space.
643 static PointerType *get(Type *ElementType, unsigned AddressSpace);
644 /// This constructs an opaque pointer to an object in a numbered address
645 /// space.
646 static PointerType *get(LLVMContext &C, unsigned AddressSpace);
647
648 /// This constructs a pointer to an object of the specified type in the
649 /// default address space (address space zero).
650 static PointerType *getUnqual(Type *ElementType) {
651 return PointerType::get(ElementType, 0);
652 }
653
654 /// This constructs an opaque pointer to an object in the
655 /// default address space (address space zero).
656 static PointerType *getUnqual(LLVMContext &C) {
657 return PointerType::get(C, 0);
658 }
659
660 /// This constructs a pointer type with the same pointee type as the input
661 /// PointerType (or an opaque pointer if the input PointerType is opaque) and the
662 /// given address space. This is only useful during the opaque pointer
663 /// transition.
664 /// TODO: remove after opaque pointer transition is complete.
665 static PointerType *getWithSamePointeeType(PointerType *PT,
666 unsigned AddressSpace) {
667 if (PT->isOpaque())
668 return get(PT->getContext(), AddressSpace);
669 return get(PT->getElementType(), AddressSpace);
670 }
671
672 Type *getElementType() const {
673 assert(!isOpaque() && "Attempting to get element type of opaque pointer");
674 return PointeeTy;
675 }
676
677 bool isOpaque() const { return !PointeeTy; }
678
679 /// Return true if the specified type is valid as an element type.
680 static bool isValidElementType(Type *ElemTy);
681
682 /// Return true if we can load or store from a pointer to this type.
683 static bool isLoadableOrStorableType(Type *ElemTy);
684
685 /// Return the address space of the Pointer type.
686 inline unsigned getAddressSpace() const { return getSubclassData(); }
687
688 /// Return true if either this is an opaque pointer type or if this pointee
689 /// type matches Ty. Primarily used for checking if an instruction's pointer
690 /// operands are valid types. Will be useless after non-opaque pointers are
691 /// removed.
692 bool isOpaqueOrPointeeTypeMatches(Type *Ty) {
693 return isOpaque() || PointeeTy == Ty;
694 }
695
696 /// Return true if both pointer types have the same element type. Two opaque
697 /// pointers are considered to have the same element type, while an opaque
698 /// and a non-opaque pointer have different element types.
699 /// TODO: Remove after opaque pointer transition is complete.
700 bool hasSameElementTypeAs(PointerType *Other) {
701 return PointeeTy == Other->PointeeTy;
702 }
703
704 /// Implement support type inquiry through isa, cast, and dyn_cast.
705 static bool classof(const Type *T) {
706 return T->getTypeID() == PointerTyID;
707 }
708};
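To illustrate the opaque/non-opaque split above (Ctx is an assumed llvm::LLVMContext):

  llvm::PointerType *I8Ptr = llvm::PointerType::getUnqual(llvm::Type::getInt8Ty(Ctx)); // i8*
  llvm::PointerType *Opaque = llvm::PointerType::get(Ctx, /*AddressSpace=*/0);         // opaque ptr
  bool M1 = I8Ptr->isOpaqueOrPointeeTypeMatches(llvm::Type::getInt8Ty(Ctx));  // true: pointee matches
  bool M2 = Opaque->isOpaqueOrPointeeTypeMatches(llvm::Type::getInt8Ty(Ctx)); // true: opaque matches any type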
709
710Type *Type::getExtendedType() const {
711 assert(
712 isIntOrIntVectorTy() &&
713 "Original type expected to be a vector of integers or a scalar integer.");
714 if (auto *VTy = dyn_cast<VectorType>(this))
715 return VectorType::getExtendedElementVectorType(
716 const_cast<VectorType *>(VTy));
717 return cast<IntegerType>(this)->getExtendedType();
718}
719
720Type *Type::getWithNewType(Type *EltTy) const {
721 if (auto *VTy
31.1
'VTy' is null
= dyn_cast<VectorType>(this))
31
Assuming the object is not a 'VectorType'
32
Taking false branch
722 return VectorType::get(EltTy, VTy->getElementCount());
723 return EltTy;
33
Returning pointer (loaded from 'EltTy')
724}
725
726Type *Type::getWithNewBitWidth(unsigned NewBitWidth) const {
727 assert(
728 isIntOrIntVectorTy() &&
729 "Original type expected to be a vector of integers or a scalar integer.");
730 return getWithNewType(getIntNTy(getContext(), NewBitWidth));
29
Passing value via 1st parameter 'EltTy'
30
Calling 'Type::getWithNewType'
34
Returning from 'Type::getWithNewType'
35
Returning pointer
731}
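The path steps above hinge on the behaviour sketched here (Ctx is an assumed llvm::LLVMContext): for a plain integer the dyn_cast<VectorType> fails and the freshly built element type is returned as-is, while for a vector the lane count is preserved.

  llvm::Type *I8 = llvm::Type::getInt8Ty(Ctx);
  llvm::Type *Widened = I8->getWithNewBitWidth(32);      // i32 (non-vector: EltTy returned directly)
  llvm::Type *V4I8 = llvm::FixedVectorType::get(I8, 4);
  llvm::Type *V4I32 = V4I8->getWithNewBitWidth(32);      // <4 x i32> (lane count kept)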
732
733unsigned Type::getPointerAddressSpace() const {
734 return cast<PointerType>(getScalarType())->getAddressSpace();
735}
736
737} // end namespace llvm
738
739#endif // LLVM_IR_DERIVEDTYPES_H

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/IR/Type.h

1//===- llvm/Type.h - Classes for handling data types ------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the declaration of the Type class. For more "Type"
10// stuff, look in DerivedTypes.h.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_IR_TYPE_H
15#define LLVM_IR_TYPE_H
16
17#include "llvm/ADT/APFloat.h"
18#include "llvm/ADT/ArrayRef.h"
19#include "llvm/ADT/SmallPtrSet.h"
20#include "llvm/Support/CBindingWrapping.h"
21#include "llvm/Support/Casting.h"
22#include "llvm/Support/Compiler.h"
23#include "llvm/Support/ErrorHandling.h"
24#include "llvm/Support/TypeSize.h"
25#include <cassert>
26#include <cstdint>
27#include <iterator>
28
29namespace llvm {
30
31class IntegerType;
32class LLVMContext;
33class PointerType;
34class raw_ostream;
35class StringRef;
36
37/// The instances of the Type class are immutable: once they are created,
38/// they are never changed. Also note that only one instance of a particular
39/// type is ever created. Thus seeing if two types are equal is a matter of
40/// doing a trivial pointer comparison. To enforce that no two equal instances
41/// are created, Type instances can only be created via static factory methods
42/// in class Type and in derived classes. Once allocated, Types are never
43/// free'd.
44///
45class Type {
46public:
47 //===--------------------------------------------------------------------===//
48 /// Definitions of all of the base types for the Type system. Based on this
49 /// value, you can cast to a class defined in DerivedTypes.h.
50 /// Note: If you add an element to this, you need to add an element to the
51 /// Type::getPrimitiveType function, or else things will break!
52 /// Also update LLVMTypeKind and LLVMGetTypeKind () in the C binding.
53 ///
54 enum TypeID {
55 // PrimitiveTypes
56 HalfTyID = 0, ///< 16-bit floating point type
57 BFloatTyID, ///< 16-bit floating point type (7-bit significand)
58 FloatTyID, ///< 32-bit floating point type
59 DoubleTyID, ///< 64-bit floating point type
60 X86_FP80TyID, ///< 80-bit floating point type (X87)
61 FP128TyID, ///< 128-bit floating point type (112-bit significand)
62 PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
63 VoidTyID, ///< type with no size
64 LabelTyID, ///< Labels
65 MetadataTyID, ///< Metadata
66 X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific)
67 X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
68 TokenTyID, ///< Tokens
69
70 // Derived types... see DerivedTypes.h file.
71 IntegerTyID, ///< Arbitrary bit width integers
72 FunctionTyID, ///< Functions
73 PointerTyID, ///< Pointers
74 StructTyID, ///< Structures
75 ArrayTyID, ///< Arrays
76 FixedVectorTyID, ///< Fixed width SIMD vector type
77 ScalableVectorTyID ///< Scalable SIMD vector type
78 };
79
80private:
81 /// This refers to the LLVMContext in which this type was uniqued.
82 LLVMContext &Context;
83
84 TypeID ID : 8; // The current base type of this type.
85 unsigned SubclassData : 24; // Space for subclasses to store data.
86 // Note that this should be synchronized with
87 // MAX_INT_BITS value in IntegerType class.
88
89protected:
90 friend class LLVMContextImpl;
91
92 explicit Type(LLVMContext &C, TypeID tid)
93 : Context(C), ID(tid), SubclassData(0) {}
94 ~Type() = default;
95
96 unsigned getSubclassData() const { return SubclassData; }
97
98 void setSubclassData(unsigned val) {
99 SubclassData = val;
100 // Ensure we don't have any accidental truncation.
101 assert(getSubclassData() == val && "Subclass data too large for field");
102 }
103
104 /// Keeps track of how many Type*'s there are in the ContainedTys list.
105 unsigned NumContainedTys = 0;
106
107 /// A pointer to the array of Types contained by this Type. For example, this
108 /// includes the arguments of a function type, the elements of a structure,
109 /// the pointee of a pointer, the element type of an array, etc. This pointer
110 /// may be 0 for types that don't contain other types (Integer, Double,
111 /// Float).
112 Type * const *ContainedTys = nullptr;
113
114public:
115 /// Print the current type.
116 /// Omit the type details if \p NoDetails == true.
117 /// E.g., let %st = type { i32, i16 }
118 /// When \p NoDetails is true, we only print %st.
119 /// Put differently, \p NoDetails prints the type as if
120 /// inlined with the operands when printing an instruction.
121 void print(raw_ostream &O, bool IsForDebug = false,
122 bool NoDetails = false) const;
123
124 void dump() const;
125
126 /// Return the LLVMContext in which this type was uniqued.
127 LLVMContext &getContext() const { return Context; }
128
129 //===--------------------------------------------------------------------===//
130 // Accessors for working with types.
131 //
132
133 /// Return the type id for the type. This will return one of the TypeID enum
134 /// elements defined above.
135 TypeID getTypeID() const { return ID; }
136
137 /// Return true if this is 'void'.
138 bool isVoidTy() const { return getTypeID() == VoidTyID; }
139
140 /// Return true if this is 'half', a 16-bit IEEE fp type.
141 bool isHalfTy() const { return getTypeID() == HalfTyID; }
142
143 /// Return true if this is 'bfloat', a 16-bit bfloat type.
144 bool isBFloatTy() const { return getTypeID() == BFloatTyID; }
145
146 /// Return true if this is 'float', a 32-bit IEEE fp type.
147 bool isFloatTy() const { return getTypeID() == FloatTyID; }
148
149 /// Return true if this is 'double', a 64-bit IEEE fp type.
150 bool isDoubleTy() const { return getTypeID() == DoubleTyID; }
151
152 /// Return true if this is x86 long double.
153 bool isX86_FP80Ty() const { return getTypeID() == X86_FP80TyID; }
154
155 /// Return true if this is 'fp128'.
156 bool isFP128Ty() const { return getTypeID() == FP128TyID; }
157
158 /// Return true if this is powerpc long double.
159 bool isPPC_FP128Ty() const { return getTypeID() == PPC_FP128TyID; }
160
161 /// Return true if this is one of the six floating-point types
162 bool isFloatingPointTy() const {
163 return getTypeID() == HalfTyID || getTypeID() == BFloatTyID ||
164 getTypeID() == FloatTyID || getTypeID() == DoubleTyID ||
165 getTypeID() == X86_FP80TyID || getTypeID() == FP128TyID ||
166 getTypeID() == PPC_FP128TyID;
167 }
168
169 const fltSemantics &getFltSemantics() const {
170 switch (getTypeID()) {
171 case HalfTyID: return APFloat::IEEEhalf();
172 case BFloatTyID: return APFloat::BFloat();
173 case FloatTyID: return APFloat::IEEEsingle();
174 case DoubleTyID: return APFloat::IEEEdouble();
175 case X86_FP80TyID: return APFloat::x87DoubleExtended();
176 case FP128TyID: return APFloat::IEEEquad();
177 case PPC_FP128TyID: return APFloat::PPCDoubleDouble();
178 default: llvm_unreachable("Invalid floating type");
179 }
180 }
181
182 /// Return true if this is X86 MMX.
183 bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; }
184
185 /// Return true if this is X86 AMX.
186 bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; }
187
188 /// Return true if this is a FP type or a vector of FP.
189 bool isFPOrFPVectorTy() const { return getScalarType()->isFloatingPointTy(); }
190
191 /// Return true if this is 'label'.
192 bool isLabelTy() const { return getTypeID() == LabelTyID; }
193
194 /// Return true if this is 'metadata'.
195 bool isMetadataTy() const { return getTypeID() == MetadataTyID; }
196
197 /// Return true if this is 'token'.
198 bool isTokenTy() const { return getTypeID() == TokenTyID; }
199
200 /// True if this is an instance of IntegerType.
201 bool isIntegerTy() const { return getTypeID() == IntegerTyID; }
202
203 /// Return true if this is an IntegerType of the given width.
204 bool isIntegerTy(unsigned Bitwidth) const;
205
206 /// Return true if this is an integer type or a vector of integer types.
207 bool isIntOrIntVectorTy() const { return getScalarType()->isIntegerTy(); }
208
209 /// Return true if this is an integer type or a vector of integer types of
210 /// the given width.
211 bool isIntOrIntVectorTy(unsigned BitWidth) const {
212 return getScalarType()->isIntegerTy(BitWidth);
213 }
214
215 /// Return true if this is an integer type or a pointer type.
216 bool isIntOrPtrTy() const { return isIntegerTy() || isPointerTy(); }
217
218 /// True if this is an instance of FunctionType.
219 bool isFunctionTy() const { return getTypeID() == FunctionTyID; }
220
221 /// True if this is an instance of StructType.
222 bool isStructTy() const { return getTypeID() == StructTyID; }
223
224 /// True if this is an instance of ArrayType.
225 bool isArrayTy() const { return getTypeID() == ArrayTyID; }
226
227 /// True if this is an instance of PointerType.
228 bool isPointerTy() const { return getTypeID() == PointerTyID; }
229
230 /// True if this is an instance of an opaque PointerType.
231 bool isOpaquePointerTy() const;
232
233 /// Return true if this is a pointer type or a vector of pointer types.
234 bool isPtrOrPtrVectorTy() const { return getScalarType()->isPointerTy(); }
235
236 /// True if this is an instance of VectorType.
237 inline bool isVectorTy() const {
238 return getTypeID() == ScalableVectorTyID || getTypeID() == FixedVectorTyID;
44
Assuming the condition is false
45
Assuming the condition is true
46
Returning the value 1, which participates in a condition later
239 }
240
241 /// Return true if this type could be converted with a lossless BitCast to
242 /// type 'Ty'. For example, i8* to i32*. BitCasts are valid for types of the
243 /// same size only where no re-interpretation of the bits is done.
244 /// Determine if this type could be losslessly bitcast to Ty
245 bool canLosslesslyBitCastTo(Type *Ty) const;
246
247 /// Return true if this type is empty, that is, it has no elements or all of
248 /// its elements are empty.
249 bool isEmptyTy() const;
250
251 /// Return true if the type is "first class", meaning it is a valid type for a
252 /// Value.
253 bool isFirstClassType() const {
254 return getTypeID() != FunctionTyID && getTypeID() != VoidTyID;
255 }
256
257 /// Return true if the type is a valid type for a register in codegen. This
258 /// includes all first-class types except struct and array types.
259 bool isSingleValueType() const {
260 return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() ||
261 isPointerTy() || isVectorTy() || isX86_AMXTy();
262 }
263
264 /// Return true if the type is an aggregate type. This means it is valid as
265 /// the first operand of an insertvalue or extractvalue instruction. This
266 /// includes struct and array types, but does not include vector types.
267 bool isAggregateType() const {
268 return getTypeID() == StructTyID || getTypeID() == ArrayTyID;
269 }
270
271 /// Return true if it makes sense to take the size of this type. To get the
272 /// actual size for a particular target, it is reasonable to use the
273 /// DataLayout subsystem to do this.
274 bool isSized(SmallPtrSetImpl<Type*> *Visited = nullptr) const {
275 // If it's a primitive, it is always sized.
276 if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
277 getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID ||
278 getTypeID() == X86_AMXTyID)
279 return true;
280 // If it is not something that can have a size (e.g. a function or label),
281 // it doesn't have a size.
282 if (getTypeID() != StructTyID && getTypeID() != ArrayTyID && !isVectorTy())
283 return false;
284 // Otherwise we have to try harder to decide.
285 return isSizedDerivedType(Visited);
286 }
287
288 /// Return the basic size of this type if it is a primitive type. These are
289 /// fixed by LLVM and are not target-dependent.
290 /// This will return zero if the type does not have a size or is not a
291 /// primitive type.
292 ///
293 /// If this is a scalable vector type, the scalable property will be set and
294 /// the runtime size will be a positive integer multiple of the base size.
295 ///
296 /// Note that this may not reflect the size of memory allocated for an
297 /// instance of the type or the number of bytes that are written when an
298 /// instance of the type is stored to memory. The DataLayout class provides
299 /// additional query functions to provide this information.
300 ///
301 TypeSize getPrimitiveSizeInBits() const LLVM_READONLY;
302
303 /// If this is a vector type, return the getPrimitiveSizeInBits value for the
304 /// element type. Otherwise return the getPrimitiveSizeInBits value for this
305 /// type.
306 unsigned getScalarSizeInBits() const LLVM_READONLY;
307
308 /// Return the width of the mantissa of this type. This is only valid on
309 /// floating-point types. If the FP type does not have a stable mantissa (e.g.
310 /// ppc long double), this method returns -1.
311 int getFPMantissaWidth() const;
312
313 /// Return whether the type is IEEE compatible, as defined by the eponymous
314 /// method in APFloat.
315 bool isIEEE() const { return APFloat::getZero(getFltSemantics()).isIEEE(); }
316
317 /// If this is a vector type, return the element type, otherwise return
318 /// 'this'.
319 inline Type *getScalarType() const {
320 if (isVectorTy())
321 return getContainedType(0);
322 return const_cast<Type *>(this);
323 }
324
325 //===--------------------------------------------------------------------===//
326 // Type Iteration support.
327 //
328 using subtype_iterator = Type * const *;
329
330 subtype_iterator subtype_begin() const { return ContainedTys; }
331 subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];}
332 ArrayRef<Type*> subtypes() const {
333 return makeArrayRef(subtype_begin(), subtype_end());
334 }
335
336 using subtype_reverse_iterator = std::reverse_iterator<subtype_iterator>;
337
338 subtype_reverse_iterator subtype_rbegin() const {
339 return subtype_reverse_iterator(subtype_end());
340 }
341 subtype_reverse_iterator subtype_rend() const {
342 return subtype_reverse_iterator(subtype_begin());
343 }
344
345 /// This method is used to implement the type iterator (defined at the end of
346 /// the file). For derived types, this returns the types 'contained' in the
347 /// derived type.
348 Type *getContainedType(unsigned i) const {
349 assert(i < NumContainedTys && "Index out of range!");
350 return ContainedTys[i];
351 }
352
353 /// Return the number of types in the derived type.
354 unsigned getNumContainedTypes() const { return NumContainedTys; }
355
356 //===--------------------------------------------------------------------===//
357 // Helper methods corresponding to subclass methods. This forces a cast to
358 // the specified subclass and calls its accessor. "getArrayNumElements" (for
359 // example) is shorthand for cast<ArrayType>(Ty)->getNumElements(). This is
360 // only intended to cover the core methods that are frequently used, helper
361 // methods should not be added here.
362
363 inline unsigned getIntegerBitWidth() const;
364
365 inline Type *getFunctionParamType(unsigned i) const;
366 inline unsigned getFunctionNumParams() const;
367 inline bool isFunctionVarArg() const;
368
369 inline StringRef getStructName() const;
370 inline unsigned getStructNumElements() const;
371 inline Type *getStructElementType(unsigned N) const;
372
373 inline uint64_t getArrayNumElements() const;
374
375 Type *getArrayElementType() const {
376 assert(getTypeID() == ArrayTyID);
377 return ContainedTys[0];
378 }
379
380 Type *getPointerElementType() const {
381 assert(getTypeID() == PointerTyID);
382 return ContainedTys[0];
383 }
384
385 /// Given vector type, change the element type,
386 /// whilst keeping the old number of elements.
387 /// For non-vectors simply returns \p EltTy.
388 inline Type *getWithNewType(Type *EltTy) const;
389
390 /// Given an integer or vector type, change the lane bitwidth to NewBitwidth,
391 /// whilst keeping the old number of lanes.
392 inline Type *getWithNewBitWidth(unsigned NewBitWidth) const;
393
394 /// Given scalar/vector integer type, returns a type with elements twice as
395 /// wide as in the original type. For vectors, preserves element count.
396 inline Type *getExtendedType() const;
397
398 /// Get the address space of this pointer or pointer vector type.
399 inline unsigned getPointerAddressSpace() const;
400
401 //===--------------------------------------------------------------------===//
402 // Static members exported by the Type class itself. Useful for getting
403 // instances of Type.
404 //
405
406 /// Return a type based on an identifier.
407 static Type *getPrimitiveType(LLVMContext &C, TypeID IDNumber);
408
409 //===--------------------------------------------------------------------===//
410 // These are the builtin types that are always available.
411 //
412 static Type *getVoidTy(LLVMContext &C);
413 static Type *getLabelTy(LLVMContext &C);
414 static Type *getHalfTy(LLVMContext &C);
415 static Type *getBFloatTy(LLVMContext &C);
416 static Type *getFloatTy(LLVMContext &C);
417 static Type *getDoubleTy(LLVMContext &C);
418 static Type *getMetadataTy(LLVMContext &C);
419 static Type *getX86_FP80Ty(LLVMContext &C);
420 static Type *getFP128Ty(LLVMContext &C);
421 static Type *getPPC_FP128Ty(LLVMContext &C);
422 static Type *getX86_MMXTy(LLVMContext &C);
423 static Type *getX86_AMXTy(LLVMContext &C);
424 static Type *getTokenTy(LLVMContext &C);
425 static IntegerType *getIntNTy(LLVMContext &C, unsigned N);
426 static IntegerType *getInt1Ty(LLVMContext &C);
427 static IntegerType *getInt8Ty(LLVMContext &C);
428 static IntegerType *getInt16Ty(LLVMContext &C);
429 static IntegerType *getInt32Ty(LLVMContext &C);
430 static IntegerType *getInt64Ty(LLVMContext &C);
431 static IntegerType *getInt128Ty(LLVMContext &C);
432 template <typename ScalarTy> static Type *getScalarTy(LLVMContext &C) {
433 int noOfBits = sizeof(ScalarTy) * CHAR_BIT;
434 if (std::is_integral<ScalarTy>::value) {
435 return (Type*) Type::getIntNTy(C, noOfBits);
436 } else if (std::is_floating_point<ScalarTy>::value) {
437 switch (noOfBits) {
438 case 32:
439 return Type::getFloatTy(C);
440 case 64:
441 return Type::getDoubleTy(C);
442 }
443 }
444 llvm_unreachable("Unsupported type in Type::getScalarTy");
445 }
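For example (Ctx is an assumed llvm::LLVMContext), mapping host C++ scalar types onto IR types:

  llvm::Type *I64 = llvm::Type::getScalarTy<uint64_t>(Ctx); // integral path: i64
  llvm::Type *F32 = llvm::Type::getScalarTy<float>(Ctx);    // floating-point path: float
  llvm::Type *F64 = llvm::Type::getScalarTy<double>(Ctx);   // floating-point path: double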
446 static Type *getFloatingPointTy(LLVMContext &C, const fltSemantics &S) {
447 Type *Ty;
448 if (&S == &APFloat::IEEEhalf())
449 Ty = Type::getHalfTy(C);
450 else if (&S == &APFloat::BFloat())
451 Ty = Type::getBFloatTy(C);
452 else if (&S == &APFloat::IEEEsingle())
453 Ty = Type::getFloatTy(C);
454 else if (&S == &APFloat::IEEEdouble())
455 Ty = Type::getDoubleTy(C);
456 else if (&S == &APFloat::x87DoubleExtended())
457 Ty = Type::getX86_FP80Ty(C);
458 else if (&S == &APFloat::IEEEquad())
459 Ty = Type::getFP128Ty(C);
460 else {
461 assert(&S == &APFloat::PPCDoubleDouble() && "Unknown FP format");
462 Ty = Type::getPPC_FP128Ty(C);
463 }
464 return Ty;
465 }
466
467 //===--------------------------------------------------------------------===//
468 // Convenience methods for getting pointer types with one of the above builtin
469 // types as pointee.
470 //
471 static PointerType *getHalfPtrTy(LLVMContext &C, unsigned AS = 0);
472 static PointerType *getBFloatPtrTy(LLVMContext &C, unsigned AS = 0);
473 static PointerType *getFloatPtrTy(LLVMContext &C, unsigned AS = 0);
474 static PointerType *getDoublePtrTy(LLVMContext &C, unsigned AS = 0);
475 static PointerType *getX86_FP80PtrTy(LLVMContext &C, unsigned AS = 0);
476 static PointerType *getFP128PtrTy(LLVMContext &C, unsigned AS = 0);
477 static PointerType *getPPC_FP128PtrTy(LLVMContext &C, unsigned AS = 0);
478 static PointerType *getX86_MMXPtrTy(LLVMContext &C, unsigned AS = 0);
479 static PointerType *getX86_AMXPtrTy(LLVMContext &C, unsigned AS = 0);
480 static PointerType *getIntNPtrTy(LLVMContext &C, unsigned N, unsigned AS = 0);
481 static PointerType *getInt1PtrTy(LLVMContext &C, unsigned AS = 0);
482 static PointerType *getInt8PtrTy(LLVMContext &C, unsigned AS = 0);
483 static PointerType *getInt16PtrTy(LLVMContext &C, unsigned AS = 0);
484 static PointerType *getInt32PtrTy(LLVMContext &C, unsigned AS = 0);
485 static PointerType *getInt64PtrTy(LLVMContext &C, unsigned AS = 0);
486
487 /// Return a pointer to the current type. This is equivalent to
488 /// PointerType::get(Foo, AddrSpace).
489 /// TODO: Remove this after opaque pointer transition is complete.
490 PointerType *getPointerTo(unsigned AddrSpace = 0) const;
491
492private:
493 /// Derived types like structures and arrays are sized iff all of the members
494 /// of the type are sized as well. Since asking for their size is relatively
495 /// uncommon, move this operation out-of-line.
496 bool isSizedDerivedType(SmallPtrSetImpl<Type*> *Visited = nullptr) const;
497};
498
499// Printing of types.
500inline raw_ostream &operator<<(raw_ostream &OS, const Type &T) {
501 T.print(OS);
502 return OS;
503}
504
505// allow isa<PointerType>(x) to work without DerivedTypes.h included.
506template <> struct isa_impl<PointerType, Type> {
507 static inline bool doit(const Type &Ty) {
508 return Ty.getTypeID() == Type::PointerTyID;
509 }
510};
511
512// Create wrappers for C Binding types (see CBindingWrapping.h).
513DEFINE_ISA_CONVERSION_FUNCTIONS(Type, LLVMTypeRef)
514
515/* Specialized opaque type conversions.
516 */
517inline Type **unwrap(LLVMTypeRef* Tys) {
518 return reinterpret_cast<Type**>(Tys);
519}
520
521inline LLVMTypeRef *wrap(Type **Tys) {
522 return reinterpret_cast<LLVMTypeRef*>(const_cast<Type**>(Tys));
523}
524
525} // end namespace llvm
526
527#endif // LLVM_IR_TYPE_H