Bug Summary

File: llvm/include/llvm/CodeGen/BasicTTIImpl.h
Warning: line 1076, column 11
Called C++ object pointer is null
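
In short: the path below reaches ARMTTIImpl::getCmpSelInstrCost with a null CondTy (and a null I), none of the target-specific branches handles that case, and CondTy is forwarded as the third argument to BaseT::getCmpSelInstrCost, where the member call at BasicTTIImpl.h line 1076 is made on the null pointer. A reduced, hypothetical sketch of this defect shape, with all LLVM-specific types stripped out (names are illustrative, not LLVM's):

struct Base {
  // Stand-in for BasicTTIImplBase::getCmpSelInstrCost: uses CondTy unconditionally.
  int getCmpSelCost(int *CondTy) { return *CondTy; }
};

struct Derived : Base {
  // Stand-in for ARMTTIImpl::getCmpSelInstrCost: CondTy may legitimately be null.
  int getCmpSelCost(int *CondTy /* may be nullptr */) {
    // ...target-specific special cases, none of which fires on this path...
    return Base::getCmpSelCost(CondTy); // null can reach the dereference here
  }
};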

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include <algorithm>
37#include <cassert>
38#include <cstdint>
39#include <utility>
40
41using namespace llvm;
42
43#define DEBUG_TYPE "armtti"
44
45static cl::opt<bool> EnableMaskedLoadStores(
46 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47 cl::desc("Enable the generation of masked loads and stores"));
48
49static cl::opt<bool> DisableLowOverheadLoops(
50 "disable-arm-loloops", cl::Hidden, cl::init(false),
51 cl::desc("Disable the generation of low-overhead loops"));
52
53static cl::opt<bool>
54 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55 cl::desc("Enable the generation of WLS loops"));
56
57extern cl::opt<TailPredication::Mode> EnableTailPredication;
58
59extern cl::opt<bool> EnableMaskedGatherScatters;
60
61extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62
63/// Convert a vector load intrinsic into a simple llvm load instruction.
64/// This is beneficial when the underlying object being addressed comes
65/// from a constant, since we get constant-folding for free.
66static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67 InstCombiner::BuilderTy &Builder) {
68 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69
70 if (!IntrAlign)
71 return nullptr;
72
73 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74 ? MemAlign
75 : IntrAlign->getLimitedValue();
76
77 if (!isPowerOf2_32(Alignment))
78 return nullptr;
79
80 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81 PointerType::get(II.getType(), 0));
82 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83}
84
85bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86 const Function *Callee) const {
87 const TargetMachine &TM = getTLI()->getTargetMachine();
88 const FeatureBitset &CallerBits =
89 TM.getSubtargetImpl(*Caller)->getFeatureBits();
90 const FeatureBitset &CalleeBits =
91 TM.getSubtargetImpl(*Callee)->getFeatureBits();
92
93 // To inline a callee, all features not in the allowed list must match exactly.
94 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95 (CalleeBits & ~InlineFeaturesAllowed);
96 // For features in the allowed list, the callee's features must be a subset of
97 // the caller's.
98 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99 (CalleeBits & InlineFeaturesAllowed);
100 return MatchExact && MatchSubset;
101}
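
Read as a pure bitset exercise, the two checks above say: outside InlineFeaturesAllowed the caller and callee must agree exactly, and inside it the callee may only rely on features the caller also has. A minimal standalone sketch of the same logic, using a made-up 8-bit feature set instead of ARM's real FeatureBitset:

#include <bitset>

using FeatureBits = std::bitset<8>; // stand-in for llvm::FeatureBitset

bool areInlineCompatibleSketch(FeatureBits CallerBits, FeatureBits CalleeBits,
                               FeatureBits InlineFeaturesAllowed) {
  // Features outside the allowed list must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // Features inside the allowed list: the callee must use a subset of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}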
102
103TTI::AddressingModeKind
104ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 if (ST->hasMVEIntegerOps())
107 return TTI::AMK_PostIndexed;
108
109 if (L->getHeader()->getParent()->hasOptSize())
110 return TTI::AMK_None;
111
112 if (ST->isMClass() && ST->isThumb2() &&
113 L->getNumBlocks() == 1)
114 return TTI::AMK_PreIndexed;
115
116 return TTI::AMK_None;
117}
118
119Optional<Instruction *>
120ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121 using namespace PatternMatch;
122 Intrinsic::ID IID = II.getIntrinsicID();
123 switch (IID) {
124 default:
125 break;
126 case Intrinsic::arm_neon_vld1: {
127 Align MemAlign =
128 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129 &IC.getAssumptionCache(), &IC.getDominatorTree());
130 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131 return IC.replaceInstUsesWith(II, V);
132 }
133 break;
134 }
135
136 case Intrinsic::arm_neon_vld2:
137 case Intrinsic::arm_neon_vld3:
138 case Intrinsic::arm_neon_vld4:
139 case Intrinsic::arm_neon_vld2lane:
140 case Intrinsic::arm_neon_vld3lane:
141 case Intrinsic::arm_neon_vld4lane:
142 case Intrinsic::arm_neon_vst1:
143 case Intrinsic::arm_neon_vst2:
144 case Intrinsic::arm_neon_vst3:
145 case Intrinsic::arm_neon_vst4:
146 case Intrinsic::arm_neon_vst2lane:
147 case Intrinsic::arm_neon_vst3lane:
148 case Intrinsic::arm_neon_vst4lane: {
149 Align MemAlign =
150 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151 &IC.getAssumptionCache(), &IC.getDominatorTree());
152 unsigned AlignArg = II.getNumArgOperands() - 1;
153 Value *AlignArgOp = II.getArgOperand(AlignArg);
154 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155 if (Align && *Align < MemAlign) {
156 return IC.replaceOperand(
157 II, AlignArg,
158 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159 false));
160 }
161 break;
162 }
163
164 case Intrinsic::arm_mve_pred_i2v: {
165 Value *Arg = II.getArgOperand(0);
166 Value *ArgArg;
167 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168 PatternMatch::m_Value(ArgArg))) &&
169 II.getType() == ArgArg->getType()) {
170 return IC.replaceInstUsesWith(II, ArgArg);
171 }
172 Constant *XorMask;
173 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174 PatternMatch::m_Value(ArgArg)),
175 PatternMatch::m_Constant(XorMask))) &&
176 II.getType() == ArgArg->getType()) {
177 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178 if (CI->getValue().trunc(16).isAllOnesValue()) {
179 auto TrueVector = IC.Builder.CreateVectorSplat(
180 cast<FixedVectorType>(II.getType())->getNumElements(),
181 IC.Builder.getTrue());
182 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183 }
184 }
185 }
186 KnownBits ScalarKnown(32);
187 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188 ScalarKnown, 0)) {
189 return &II;
190 }
191 break;
192 }
193 case Intrinsic::arm_mve_pred_v2i: {
194 Value *Arg = II.getArgOperand(0);
195 Value *ArgArg;
196 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197 PatternMatch::m_Value(ArgArg)))) {
198 return IC.replaceInstUsesWith(II, ArgArg);
199 }
200 if (!II.getMetadata(LLVMContext::MD_range)) {
201 Type *IntTy32 = Type::getInt32Ty(II.getContext());
202 Metadata *M[] = {
203 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206 return &II;
207 }
208 break;
209 }
210 case Intrinsic::arm_mve_vadc:
211 case Intrinsic::arm_mve_vadc_predicated: {
212 unsigned CarryOp =
213 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215        "Bad type for intrinsic!");
216
217 KnownBits CarryKnown(32);
218 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219 CarryKnown)) {
220 return &II;
221 }
222 break;
223 }
224 case Intrinsic::arm_mve_vmldava: {
225 Instruction *I = cast<Instruction>(&II);
226 if (I->hasOneUse()) {
227 auto *User = cast<Instruction>(*I->user_begin());
228 Value *OpZ;
229 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230 match(I->getOperand(3), m_Zero())) {
231 Value *OpX = I->getOperand(4);
232 Value *OpY = I->getOperand(5);
233 Type *OpTy = OpX->getType();
234
235 IC.Builder.SetInsertPoint(User);
236 Value *V =
237 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238 {I->getOperand(0), I->getOperand(1),
239 I->getOperand(2), OpZ, OpX, OpY});
240
241 IC.replaceInstUsesWith(*User, V);
242 return IC.eraseInstFromFunction(*User);
243 }
244 }
245 return None;
246 }
247 }
248 return None;
249}
250
251InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252 TTI::TargetCostKind CostKind) {
253 assert(Ty->isIntegerTy());
254
255 unsigned Bits = Ty->getPrimitiveSizeInBits();
256 if (Bits == 0 || Imm.getActiveBits() >= 64)
257 return 4;
258
259 int64_t SImmVal = Imm.getSExtValue();
260 uint64_t ZImmVal = Imm.getZExtValue();
261 if (!ST->isThumb()) {
262 if ((SImmVal >= 0 && SImmVal < 65536) ||
263 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265 return 1;
266 return ST->hasV6T2Ops() ? 2 : 3;
267 }
268 if (ST->isThumb2()) {
269 if ((SImmVal >= 0 && SImmVal < 65536) ||
270 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272 return 1;
273 return ST->hasV6T2Ops() ? 2 : 3;
274 }
275 // Thumb1, any i8 imm cost 1.
276 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277 return 1;
278 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279 return 2;
280 // Load from constantpool.
281 return 3;
282}
283
284// Constants smaller than 256 fit in the immediate field of
285// Thumb1 instructions so we return a zero cost and 1 otherwise.
286InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287 const APInt &Imm, Type *Ty) {
288 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289 return 0;
290
291 return 1;
292}
293
294// Checks whether Inst is part of a min(max()) or max(min()) pattern
295// that will match to an SSAT instruction
296static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297 Value *LHS, *RHS;
298 ConstantInt *C;
299 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300
301 if (InstSPF == SPF_SMAX &&
302 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303 C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304
305 auto isSSatMin = [&](Value *MinInst) {
306 if (isa<SelectInst>(MinInst)) {
307 Value *MinLHS, *MinRHS;
308 ConstantInt *MinC;
309 SelectPatternFlavor MinSPF =
310 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311 if (MinSPF == SPF_SMIN &&
312 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313 MinC->getValue() == ((-Imm) - 1))
314 return true;
315 }
316 return false;
317 };
318
319 if (isSSatMin(Inst->getOperand(1)) ||
320 (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321 isSSatMin(*(++Inst->user_begin())))))
322 return true;
323 }
324 return false;
325}
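
For reference, a standalone example of the clamp this predicate is meant to recognise (constants chosen for illustration): with Imm = -128, Imm is negative, -Imm = 128 is a power of two, and the inner bound is (-Imm) - 1 = 127, i.e. the signed i8 range that a single SSAT #8 produces.

#include <algorithm>
#include <cstdint>

// Hypothetical source-level shape of the pattern: an smax against -128 wrapping
// an smin against 127. In compare+select form this is the min(max()) pair that
// isSSATMinMaxPattern inspects, letting the backend form a saturating SSAT.
int32_t clampToSignedByte(int32_t X) {
  return std::min(std::max(X, -128), 127);
}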
326
327InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328 const APInt &Imm, Type *Ty,
329 TTI::TargetCostKind CostKind,
330 Instruction *Inst) {
331 // Division by a constant can be turned into multiplication, but only if we
332 // know it's constant. So it's not so much that the immediate is cheap (it's
333 // not), but that the alternative is worse.
334 // FIXME: this is probably unneeded with GlobalISel.
335 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337 Idx == 1)
338 return 0;
339
340 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
341 // splitting any large offsets.
342 if (Opcode == Instruction::GetElementPtr && Idx != 0)
343 return 0;
344
345 if (Opcode == Instruction::And) {
346 // UXTB/UXTH
347 if (Imm == 255 || Imm == 65535)
348 return 0;
349 // Conversion to BIC is free, and means we can use ~Imm instead.
350 return std::min(getIntImmCost(Imm, Ty, CostKind),
351 getIntImmCost(~Imm, Ty, CostKind));
352 }
353
354 if (Opcode == Instruction::Add)
355 // Conversion to SUB is free, and means we can use -Imm instead.
356 return std::min(getIntImmCost(Imm, Ty, CostKind),
357 getIntImmCost(-Imm, Ty, CostKind));
358
359 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
360 Ty->getIntegerBitWidth() == 32) {
361 int64_t NegImm = -Imm.getSExtValue();
362 if (ST->isThumb2() && NegImm < 1<<12)
363 // icmp X, #-C -> cmn X, #C
364 return 0;
365 if (ST->isThumb() && NegImm < 1<<8)
366 // icmp X, #-C -> adds X, #C
367 return 0;
368 }
369
370 // xor a, -1 can always be folded to MVN
371 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
372 return 0;
373
374 // Ensures negative constant of min(max()) or max(min()) patterns that
375 // match to SSAT instructions don't get hoisted
376 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
377 Ty->getIntegerBitWidth() <= 32) {
378 if (isSSATMinMaxPattern(Inst, Imm) ||
379 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
380 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
381 return 0;
382 }
383
384 return getIntImmCost(Imm, Ty, CostKind);
385}
386
387InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
388 TTI::TargetCostKind CostKind,
389 const Instruction *I) {
390 if (CostKind == TTI::TCK_RecipThroughput &&
391 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
392 // FIXME: The vectorizer is highly sensitive to the cost of these
393 // instructions, which suggests that it may be using the costs incorrectly.
394 // But, for now, just make them free to avoid performance regressions for
395 // vector targets.
396 return 0;
397 }
398 return BaseT::getCFInstrCost(Opcode, CostKind, I);
399}
400
401InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
402 Type *Src,
403 TTI::CastContextHint CCH,
404 TTI::TargetCostKind CostKind,
405 const Instruction *I) {
406 int ISD = TLI->InstructionOpcodeToISD(Opcode);
407 assert(ISD && "Invalid opcode");
408
409 // TODO: Allow non-throughput costs that aren't binary.
410 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
411 if (CostKind != TTI::TCK_RecipThroughput)
412 return Cost == 0 ? 0 : 1;
413 return Cost;
414 };
415 auto IsLegalFPType = [this](EVT VT) {
416 EVT EltVT = VT.getScalarType();
417 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
418 (EltVT == MVT::f64 && ST->hasFP64()) ||
419 (EltVT == MVT::f16 && ST->hasFullFP16());
420 };
421
422 EVT SrcTy = TLI->getValueType(DL, Src);
423 EVT DstTy = TLI->getValueType(DL, Dst);
424
425 if (!SrcTy.isSimple() || !DstTy.isSimple())
426 return AdjustCost(
427 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
428
429 // Extending masked load/Truncating masked stores is expensive because we
430 // currently don't split them. This means that we'll likely end up
431 // loading/storing each element individually (hence the high cost).
432 if ((ST->hasMVEIntegerOps() &&
433 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
434 Opcode == Instruction::SExt)) ||
435 (ST->hasMVEFloatOps() &&
436 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
437 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
438 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
439 return 2 * DstTy.getVectorNumElements() *
440 ST->getMVEVectorCostFactor(CostKind);
441
442 // The extend of other kinds of load is free
443 if (CCH == TTI::CastContextHint::Normal ||
444 CCH == TTI::CastContextHint::Masked) {
445 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
446 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
447 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
448 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
449 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
450 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
451 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
452 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
453 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
454 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
455 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
456 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
457 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
458 };
459 if (const auto *Entry = ConvertCostTableLookup(
460 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
461 return AdjustCost(Entry->Cost);
462
463 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
464 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
465 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
466 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
467 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
468 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
469 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
470 // The following extend from a legal type to an illegal type, so we need to
471 // split the load. This introduces an extra load operation, but the
472 // extend is still "free".
473 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
474 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
475 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
476 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
477 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
478 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
479 };
480 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
481 if (const auto *Entry =
482 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
483 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
484 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
485 }
486
487 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
488 // FPExtends are similar but also require the VCVT instructions.
489 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
490 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
491 };
492 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
493 if (const auto *Entry =
494 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
495 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
496 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
497 }
498
499 // The truncate of a store is free. This is the mirror of extends above.
500 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
501 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
502 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
503 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
504 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
505 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
506 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
507 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
508 };
509 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
510 if (const auto *Entry =
511 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
512 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
513 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
514 }
515
516 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
517 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
518 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
519 };
520 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
521 if (const auto *Entry =
522 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
523 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
524 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
525 }
526 }
527
528 // NEON vector operations that can extend their inputs.
529 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
530 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
531 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
532 // vaddl
533 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
534 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
535 // vsubl
536 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
537 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
538 // vmull
539 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
540 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
541 // vshll
542 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
543 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
544 };
545
546 auto *User = cast<Instruction>(*I->user_begin());
547 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
548 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
549 DstTy.getSimpleVT(),
550 SrcTy.getSimpleVT())) {
551 return AdjustCost(Entry->Cost);
552 }
553 }
554
555 // Single to/from double precision conversions.
556 if (Src->isVectorTy() && ST->hasNEON() &&
557 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
558 DstTy.getScalarType() == MVT::f32) ||
559 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
560 DstTy.getScalarType() == MVT::f64))) {
561 static const CostTblEntry NEONFltDblTbl[] = {
562 // Vector fptrunc/fpext conversions.
563 {ISD::FP_ROUND, MVT::v2f64, 2},
564 {ISD::FP_EXTEND, MVT::v2f32, 2},
565 {ISD::FP_EXTEND, MVT::v4f32, 4}};
566
567 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
568 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
569 return AdjustCost(LT.first * Entry->Cost);
570 }
571
572 // Some arithmetic, load and store operations have specific instructions
573 // to cast up/down their types automatically at no extra cost.
574 // TODO: Get these tables to know at least what the related operations are.
575 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
576 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
577 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
578 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
579 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
580 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
581 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
582
583 // The number of vmovl instructions for the extension.
584 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
585 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
586 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
587 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
588 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
589 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
590 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
591 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
592 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
594 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
595 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
596 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
597 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
598 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
599 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
600 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
601 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
602
603 // Operations that we legalize using splitting.
604 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
605 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
606
607 // Vector float <-> i32 conversions.
608 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
609 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
610
611 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
612 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
613 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
614 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
615 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
616 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
617 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
618 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
619 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
620 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
621 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
622 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
623 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
624 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
625 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
626 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
627 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
628 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
629 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
630 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
631
632 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
633 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
634 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
635 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
636 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
637 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
638
639 // Vector double <-> i32 conversions.
640 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
641 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
642
643 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
644 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
645 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
646 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
647 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
648 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
649
650 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
651 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
652 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
653 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
654 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
655 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
656 };
657
658 if (SrcTy.isVector() && ST->hasNEON()) {
659 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
660 DstTy.getSimpleVT(),
661 SrcTy.getSimpleVT()))
662 return AdjustCost(Entry->Cost);
663 }
664
665 // Scalar float to integer conversions.
666 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
667 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
668 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
669 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
670 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
671 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
672 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
673 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
674 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
675 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
676 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
677 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
678 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
679 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
680 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
681 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
682 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
683 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
684 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
685 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
686 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
687 };
688 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
689 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
690 DstTy.getSimpleVT(),
691 SrcTy.getSimpleVT()))
692 return AdjustCost(Entry->Cost);
693 }
694
695 // Scalar integer to float conversions.
696 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
697 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
698 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
699 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
700 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
701 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
702 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
703 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
704 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
705 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
706 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
707 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
708 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
709 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
710 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
711 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
712 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
713 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
715 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
716 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
717 };
718
719 if (SrcTy.isInteger() && ST->hasNEON()) {
720 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
721 ISD, DstTy.getSimpleVT(),
722 SrcTy.getSimpleVT()))
723 return AdjustCost(Entry->Cost);
724 }
725
726 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
727 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
728 // are linearised so take more.
729 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
730 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
731 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
732 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
733 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
734 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
735 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
736 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
737 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
738 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
739 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
740 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
741 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
742 };
743
744 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
745 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
746 ISD, DstTy.getSimpleVT(),
747 SrcTy.getSimpleVT()))
748 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
749 }
750
751 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
752 // As a general rule, fp converts that were not matched above are scalarized
753 // and cost 1 vcvt for each lane, so long as the instruction is available.
754 // If not it will become a series of function calls.
755 const InstructionCost CallCost =
756 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
757 int Lanes = 1;
758 if (SrcTy.isFixedLengthVector())
759 Lanes = SrcTy.getVectorNumElements();
760
761 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
762 return Lanes;
763 else
764 return Lanes * CallCost;
765 }
766
767 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
768 SrcTy.isFixedLengthVector()) {
769 // Treat a truncate with a larger than legal source (128 bits for MVE) as
770 // expensive, 2 instructions per lane.
771 if ((SrcTy.getScalarType() == MVT::i8 ||
772 SrcTy.getScalarType() == MVT::i16 ||
773 SrcTy.getScalarType() == MVT::i32) &&
774 SrcTy.getSizeInBits() > 128 &&
775 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
776 return SrcTy.getVectorNumElements() * 2;
777 }
778
779 // Scalar integer conversion costs.
780 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
781 // i16 -> i64 requires two dependent operations.
782 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
783
784 // Truncates on i64 are assumed to be free.
785 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
786 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
787 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
788 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
789 };
790
791 if (SrcTy.isInteger()) {
792 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
793 DstTy.getSimpleVT(),
794 SrcTy.getSimpleVT()))
795 return AdjustCost(Entry->Cost);
796 }
797
798 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
799 ? ST->getMVEVectorCostFactor(CostKind)
800 : 1;
801 return AdjustCost(
802 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
803}
804
805InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
806 unsigned Index) {
807 // Penalize inserting into a D-subregister. We end up with a three times
808 // lower estimated throughput on swift.
809 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
810 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
811 return 3;
812
813 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
814 Opcode == Instruction::ExtractElement)) {
815 // Cross-class copies are expensive on many microarchitectures,
816 // so assume they are expensive by default.
817 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
818 return 3;
819
820 // Even if it's not a cross class copy, this likely leads to mixing
821 // of NEON and VFP code and should be therefore penalized.
822 if (ValTy->isVectorTy() &&
823 ValTy->getScalarSizeInBits() <= 32)
824 return std::max<InstructionCost>(
825 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
826 }
827
828 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
829 Opcode == Instruction::ExtractElement)) {
830 // Integer cross-lane moves are more expensive than float, which can
831 // sometimes just be vmovs. Integers involve being passed to GPR registers,
832 // causing more of a delay.
833 std::pair<InstructionCost, MVT> LT =
834 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
835 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
836 }
837
838 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
839}
840
841InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
842 Type *CondTy,
843 CmpInst::Predicate VecPred,
844 TTI::TargetCostKind CostKind,
845 const Instruction *I) {
846 int ISD = TLI->InstructionOpcodeToISD(Opcode);
847
848 // Thumb scalar code size cost for select.
849 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
40
Assuming 'CostKind' is not equal to TCK_CodeSize
850 ST->isThumb() && !ValTy->isVectorTy()) {
851 // Assume expensive structs.
852 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
853 return TTI::TCC_Expensive;
854
855 // Select costs can vary because they:
856 // - may require one or more conditional mov (including an IT),
857 // - can't operate directly on immediates,
858 // - require live flags, which we can't copy around easily.
859 InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
860
861 // Possible IT instruction for Thumb2, or more for Thumb1.
862 ++Cost;
863
864 // i1 values may need rematerialising by using mov immediates and/or
865 // flag setting instructions.
866 if (ValTy->isIntegerTy(1))
867 ++Cost;
868
869 return Cost;
870 }
871
872 // If this is a vector min/max/abs, use the cost of that intrinsic directly
873 // instead. Hopefully when min/max intrinsics are more prevalent this code
874 // will not be needed.
875 const Instruction *Sel = I;
876 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
40.1
'Opcode' is equal to ICmp
40.2
'Sel' is null
877 Sel->hasOneUse())
878 Sel = cast<Instruction>(Sel->user_back());
879 if (Sel && ValTy->isVectorTy() &&
40.3
'Sel' is null
880 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
881 const Value *LHS, *RHS;
882 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
883 unsigned IID = 0;
884 switch (SPF) {
885 case SPF_ABS:
886 IID = Intrinsic::abs;
887 break;
888 case SPF_SMIN:
889 IID = Intrinsic::smin;
890 break;
891 case SPF_SMAX:
892 IID = Intrinsic::smax;
893 break;
894 case SPF_UMIN:
895 IID = Intrinsic::umin;
896 break;
897 case SPF_UMAX:
898 IID = Intrinsic::umax;
899 break;
900 case SPF_FMINNUM:
901 IID = Intrinsic::minnum;
902 break;
903 case SPF_FMAXNUM:
904 IID = Intrinsic::maxnum;
905 break;
906 default:
907 break;
908 }
909 if (IID) {
910 // The ICmp is free, the select gets the cost of the min/max/etc
911 if (Sel != I)
912 return 0;
913 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
914 return getIntrinsicInstrCost(CostAttrs, CostKind);
915 }
916 }
917
918 // On NEON a vector select gets lowered to vbsl.
919 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
41
Assuming the condition is false
920 // Lowering of some vector selects is currently far from perfect.
921 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
922 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
923 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
924 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
925 };
926
927 EVT SelCondTy = TLI->getValueType(DL, CondTy);
928 EVT SelValTy = TLI->getValueType(DL, ValTy);
929 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
930 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
931 SelCondTy.getSimpleVT(),
932 SelValTy.getSimpleVT()))
933 return Entry->Cost;
934 }
935
936 std::pair<InstructionCost, MVT> LT =
937 TLI->getTypeLegalizationCost(DL, ValTy);
938 return LT.first;
939 }
940
941 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
42
Assuming the condition is true
43
Calling 'Type::isVectorTy'
47
Returning from 'Type::isVectorTy'
50
Taking true branch
942 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
47.1
'Opcode' is equal to ICmp
943 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
48
'ValTy' is a 'FixedVectorType'
49
Assuming the condition is true
944 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
51
'ValTy' is a 'FixedVectorType'
945 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
52
Assuming pointer value is null
53
Assuming null pointer is passed into cast
946 if (!VecCondTy)
53.1
'VecCondTy' is null
54
Taking true branch
947 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
55
The object is a 'FixedVectorType'
948
949 // If we don't have mve.fp any fp operations will need to be scalarized.
950 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
55.1
'Opcode' is not equal to FCmp
951 // One scalarization insert, one scalarization extract and the cost of the
952 // fcmps.
953 return BaseT::getScalarizationOverhead(VecValTy, false, true) +
954 BaseT::getScalarizationOverhead(VecCondTy, true, false) +
955 VecValTy->getNumElements() *
956 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
957 VecCondTy->getScalarType(), VecPred, CostKind,
958 I);
959 }
960
961 std::pair<InstructionCost, MVT> LT =
962 TLI->getTypeLegalizationCost(DL, ValTy);
963 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
964 // There are two types - the input that specifies the type of the compare
965 // and the output vXi1 type. Because we don't know how the output will be
966 // split, we may need an expensive shuffle to get two in sync. This has the
967 // effect of making larger than legal compares (v8i32 for example)
968 // expensive.
969 if (LT.second.getVectorNumElements() > 2) {
56
Assuming the condition is false
57
Taking false branch
970 if (LT.first > 1)
971 return LT.first * BaseCost +
972 BaseT::getScalarizationOverhead(VecCondTy, true, false);
973 return BaseCost;
974 }
975 }
976
977 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
978 // for "multiple beats" potentially needed by MVE instructions.
979 int BaseCost = 1;
980 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
58
Taking false branch
981 BaseCost = ST->getMVEVectorCostFactor(CostKind);
982
983 return BaseCost *
984 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
59
Passing null pointer value via 3rd parameter 'CondTy'
60
Calling 'BasicTTIImplBase::getCmpSelInstrCost'
985}
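
One way to keep a null CondTy from reaching the base implementation, shown here only as a hedged sketch and not necessarily the actual upstream fix, would be to synthesize the compare result type before the final fallback, mirroring what the MVE branch at line 947 already does for VecCondTy:

  // Sketch only: make CondTy non-null before handing it to the base class,
  // which the analyzer shows dereferencing it at BasicTTIImpl.h line 1076.
  if (!CondTy)
    CondTy = CmpInst::makeCmpResultType(ValTy);
  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);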
986
987InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
988 ScalarEvolution *SE,
989 const SCEV *Ptr) {
990 // Address computations in vectorized code with non-consecutive addresses will
991 // likely result in more instructions compared to scalar code where the
992 // computation can more often be merged into the index mode. The resulting
993 // extra micro-ops can significantly decrease throughput.
994 unsigned NumVectorInstToHideOverhead = 10;
995 int MaxMergeDistance = 64;
996
997 if (ST->hasNEON()) {
998 if (Ty->isVectorTy() && SE &&
999 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1000 return NumVectorInstToHideOverhead;
1001
1002 // In many cases the address computation is not merged into the instruction
1003 // addressing mode.
1004 return 1;
1005 }
1006 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1007}
1008
1009bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1010 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1011 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1012 // optimized, else LSR may block tail-predication.
1013 switch (II->getIntrinsicID()) {
1014 case Intrinsic::arm_mve_vctp8:
1015 case Intrinsic::arm_mve_vctp16:
1016 case Intrinsic::arm_mve_vctp32:
1017 case Intrinsic::arm_mve_vctp64:
1018 return true;
1019 default:
1020 break;
1021 }
1022 }
1023 return false;
1024}
1025
1026bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1027 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1028 return false;
1029
1030 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1031 // Don't support v2i1 yet.
1032 if (VecTy->getNumElements() == 2)
1033 return false;
1034
1035 // We don't support extending fp types.
1036 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1037 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1038 return false;
1039 }
1040
1041 unsigned EltWidth = DataTy->getScalarSizeInBits();
1042 return (EltWidth == 32 && Alignment >= 4) ||
1043 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1044}
1045
1046bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1047 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1048 return false;
1049
1050 // This method is called in 2 places:
1051 // - from the vectorizer with a scalar type, in which case we need to get
1052 // this as good as we can with the limited info we have (and rely on the cost
1053 // model for the rest).
1054 // - from the masked intrinsic lowering pass with the actual vector type.
1055 // For MVE, we have a custom lowering pass that will already have custom
1056 // legalised any gathers that we can to MVE intrinsics, and want to expand all
1057 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1058 // are here, we know we want to expand.
1059 if (isa<VectorType>(Ty))
1060 return false;
1061
1062 unsigned EltWidth = Ty->getScalarSizeInBits();
1063 return ((EltWidth == 32 && Alignment >= 4) ||
1064 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1065}
1066
1067/// Given a memcpy/memset/memmove instruction, return the number of memory
1068/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1069/// call is used.
1070int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1071 MemOp MOp;
1072 unsigned DstAddrSpace = ~0u;
1073 unsigned SrcAddrSpace = ~0u;
1074 const Function *F = I->getParent()->getParent();
1075
1076 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1077 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1078 // If 'size' is not a constant, a library call will be generated.
1079 if (!C)
1080 return -1;
1081
1082 const unsigned Size = C->getValue().getZExtValue();
1083 const Align DstAlign = *MC->getDestAlign();
1084 const Align SrcAlign = *MC->getSourceAlign();
1085
1086 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1087 /*IsVolatile*/ false);
1088 DstAddrSpace = MC->getDestAddressSpace();
1089 SrcAddrSpace = MC->getSourceAddressSpace();
1090 }
1091 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1092 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1093 // If 'size' is not a constant, a library call will be generated.
1094 if (!C)
1095 return -1;
1096
1097 const unsigned Size = C->getValue().getZExtValue();
1098 const Align DstAlign = *MS->getDestAlign();
1099
1100 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1101 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1102 DstAddrSpace = MS->getDestAddressSpace();
1103 }
1104 else
1105 llvm_unreachable("Expected a memcpy/move or memset!");
1106
1107 unsigned Limit, Factor = 2;
1108 switch(I->getIntrinsicID()) {
1109 case Intrinsic::memcpy:
1110 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1111 break;
1112 case Intrinsic::memmove:
1113 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1114 break;
1115 case Intrinsic::memset:
1116 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1117 Factor = 1;
1118 break;
1119 default:
1120 llvm_unreachable("Expected a memcpy/move or memset!");
1121 }
1122
1123 // MemOps will be populated with a list of data types that need to be
1124 // loaded and stored. That's why we multiply the number of elements by 2 to
1125 // get the cost for this memcpy.
1126 std::vector<EVT> MemOps;
1127 if (getTLI()->findOptimalMemOpLowering(
1128 MemOps, Limit, MOp, DstAddrSpace,
1129 SrcAddrSpace, F->getAttributes()))
1130 return MemOps.size() * Factor;
1131
1132 // If we can't find an optimal memop lowering, return the default cost
1133 return -1;
1134}
1135
1136InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1137 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1138
1139 // To model the cost of a library call, we assume 1 for the call, and
1140 // 3 for the argument setup.
1141 if (NumOps == -1)
1142 return 4;
1143 return NumOps;
1144}
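
As a worked illustration of the model above (the chunk count is invented; only the MemOps.size() * Factor arithmetic and the library-call cost of 4 come from the code):

// Hedged standalone sketch of the getNumMemOps / getMemcpyCost arithmetic.
int memcpyCostSketch(int NumChunks /* MemOps.size() */, bool LoweredToLibcall) {
  if (LoweredToLibcall)
    return 4;                // 1 for the call plus 3 for argument setup
  const int Factor = 2;      // memcpy/memmove: each chunk is one load + one store
  return NumChunks * Factor; // memset would use Factor = 1
}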
1145
1146InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1147 VectorType *Tp, ArrayRef<int> Mask,
1148 int Index, VectorType *SubTp) {
1149 Kind = improveShuffleKindFromMask(Kind, Mask);
1150 if (ST->hasNEON()) {
1151 if (Kind == TTI::SK_Broadcast) {
1152 static const CostTblEntry NEONDupTbl[] = {
1153 // VDUP handles these cases.
1154 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1155 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1156 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1157 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1158 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1159 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1160
1161 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1162 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1163 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1164 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1165
1166 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1167 if (const auto *Entry =
1168 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1169 return LT.first * Entry->Cost;
1170 }
1171 if (Kind == TTI::SK_Reverse) {
1172 static const CostTblEntry NEONShuffleTbl[] = {
1173 // Reverse shuffle cost one instruction if we are shuffling within a
1174 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1175 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1176 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1177 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1178 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1179 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1180 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1181
1182 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1183 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1184 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1185 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1186
1187 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1188 if (const auto *Entry =
1189 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1190 return LT.first * Entry->Cost;
1191 }
1192 if (Kind == TTI::SK_Select) {
1193 static const CostTblEntry NEONSelShuffleTbl[] = {
1194 // Select shuffle cost table for ARM. Cost is the number of
1195 // instructions
1196 // required to create the shuffled vector.
1197
1198 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1199 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1200 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1201 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1202
1203 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1204 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1205 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1206
1207 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1208
1209 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1210
1211 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1212 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1213 ISD::VECTOR_SHUFFLE, LT.second))
1214 return LT.first * Entry->Cost;
1215 }
1216 }
1217 if (ST->hasMVEIntegerOps()) {
1218 if (Kind == TTI::SK_Broadcast) {
1219 static const CostTblEntry MVEDupTbl[] = {
1220 // VDUP handles these cases.
1221 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1222 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1223 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1224 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1225 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1226
1227 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1228 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1229 LT.second))
1230 return LT.first * Entry->Cost *
1231 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1232 }
1233
1234 if (!Mask.empty()) {
1235 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1236 if (Mask.size() <= LT.second.getVectorNumElements() &&
1237 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1238 isVREVMask(Mask, LT.second, 64)))
1239 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1240 }
1241 }
1242
1243 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1244 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1245 : 1;
1246 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1247}
1248
1249InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1250 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1251 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1252 TTI::OperandValueProperties Opd1PropInfo,
1253 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1254 const Instruction *CxtI) {
1255 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1256 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1257 // Make operations on i1 relatively expensive as this often involves
1258 // combining predicates. AND and XOR should be easier to handle with IT
1259 // blocks.
1260 switch (ISDOpcode) {
1261 default:
1262 break;
1263 case ISD::AND:
1264 case ISD::XOR:
1265 return 2;
1266 case ISD::OR:
1267 return 3;
1268 }
1269 }
1270
1271 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1272
1273 if (ST->hasNEON()) {
1274 const unsigned FunctionCallDivCost = 20;
1275 const unsigned ReciprocalDivCost = 10;
1276 static const CostTblEntry CostTbl[] = {
1277 // Division.
1278 // These costs are somewhat random. Choose a cost of 20 to indicate that
1279 // vectorizing division (added function call) is going to be very expensive.
1280 // Double registers types.
1281 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1282 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1283 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1284 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1285 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1286 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1287 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1288 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1289 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1290 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1291 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1292 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1293 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1294 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1295 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1296 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1297 // Quad register types.
1298 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1299 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1300 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1301 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1302 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1303 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1304 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1305 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1306 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1307 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1308 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1309 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1310 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1311 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1312 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1313 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1314 // Multiplication.
1315 };
1316
1317 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1318 return LT.first * Entry->Cost;
1319
1320 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1321 Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1322
1323 // This is somewhat of a hack. The problem that we are facing is that SROA
1324 // creates a sequence of shift, and, or instructions to construct values.
1325 // These sequences are recognized by the ISel and have zero-cost. Not so for
1326 // the vectorized code. Because we have support for v2i64 but not i64 those
1327 // sequences look particularly beneficial to vectorize.
1328 // To work around this we increase the cost of v2i64 operations to make them
1329 // seem less beneficial.
1330 if (LT.second == MVT::v2i64 &&
1331 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1332 Cost += 4;
1333
1334 return Cost;
1335 }
1336
1337 // If this operation is a shift on arm/thumb2, it might well be folded into
1338 // the following instruction, hence having a cost of 0.
1339 auto LooksLikeAFreeShift = [&]() {
1340 if (ST->isThumb1Only() || Ty->isVectorTy())
1341 return false;
1342
1343 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1344 return false;
1345 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1346 return false;
1347
1348 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1349 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1350 case Instruction::Add:
1351 case Instruction::Sub:
1352 case Instruction::And:
1353 case Instruction::Xor:
1354 case Instruction::Or:
1355 case Instruction::ICmp:
1356 return true;
1357 default:
1358 return false;
1359 }
1360 };
1361 if (LooksLikeAFreeShift())
1362 return 0;
1363
1364 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1365 // for "multiple beats" potentially needed by MVE instructions.
1366 int BaseCost = 1;
1367 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1368 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1369
1370 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1371 // without treating floats as more expensive than scalars or increasing the
1372 // costs for custom operations. The result is also multiplied by the
1373 // MVEVectorCostFactor where appropriate.
1374 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1375 return LT.first * BaseCost;
1376
1377 // Else this is expand, assume that we need to scalarize this op.
1378 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1379 unsigned Num = VTy->getNumElements();
1380 InstructionCost Cost =
1381 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1382 // Return the cost of multiple scalar invocations plus the cost of
1383 // inserting and extracting the values.
1384 SmallVector<Type *> Tys(Args.size(), Ty);
1385 return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1386 }
1387
1388 return BaseCost;
1389}
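
When the operation has to be scalarized, the cost above is the scalarization overhead plus NumElements times the scalar cost. A minimal standalone sketch of that sum, with made-up per-element numbers standing in for the real TTI hooks (ScalarizationModel and its fields are illustrative helpers, not LLVM APIs):

#include <cstdio>

// Hypothetical per-element numbers; the real values come from the TTI hooks.
struct ScalarizationModel {
  unsigned ScalarOpCost;   // cost of one scalar div/rem/etc.
  unsigned InsertCost;     // cost of inserting one result element
  unsigned ExtractCost;    // cost of extracting one operand element
};

// Mirrors "BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost":
// every element of every operand is extracted, the scalar op runs Num times,
// and every result element is inserted back.
unsigned scalarizedVectorOpCost(unsigned NumElts, unsigned NumOperands,
                                const ScalarizationModel &M) {
  unsigned Overhead =
      NumElts * M.InsertCost + NumElts * NumOperands * M.ExtractCost;
  return Overhead + NumElts * M.ScalarOpCost;
}

int main() {
  ScalarizationModel M{/*ScalarOpCost=*/20, /*InsertCost=*/1, /*ExtractCost=*/1};
  // e.g. a <4 x i32> sdiv with two vector operands.
  std::printf("v4i32 sdiv ~ %u\n", scalarizedVectorOpCost(4, 2, M));
  return 0;
}
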
1390
1391InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1392 MaybeAlign Alignment,
1393 unsigned AddressSpace,
1394 TTI::TargetCostKind CostKind,
1395 const Instruction *I) {
1396 // TODO: Handle other cost kinds.
1397 if (CostKind != TTI::TCK_RecipThroughput)
1398 return 1;
1399
1400 // Type legalization can't handle structs
1401 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1402 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1403 CostKind);
1404
1405 if (ST->hasNEON() && Src->isVectorTy() &&
1406 (Alignment && *Alignment != Align(16)) &&
1407 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1408 // Unaligned loads/stores are extremely inefficient.
1409 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1410 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1411 return LT.first * 4;
1412 }
1413
1414 // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1415 // Same for stores.
1416 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1417 ((Opcode == Instruction::Load && I->hasOneUse() &&
1418 isa<FPExtInst>(*I->user_begin())) ||
1419 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1420 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1421 Type *DstTy =
1422 Opcode == Instruction::Load
1423 ? (*I->user_begin())->getType()
1424 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1425 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1426 DstTy->getScalarType()->isFloatTy())
1427 return ST->getMVEVectorCostFactor(CostKind);
1428 }
1429
1430 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1431 ? ST->getMVEVectorCostFactor(CostKind)
1432 : 1;
1433 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1434 CostKind, I);
1435}
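
The final return value above is just the base memory-op cost scaled by the MVE "beats" factor for vector accesses. A small standalone sketch of that scaling, where MVEVectorCostFactor = 2 and HasMVEIntegerOps are assumed example values rather than real subtarget queries:

#include <cstdio>

// Hypothetical stand-ins for the subtarget queries used above.
constexpr unsigned MVEVectorCostFactor = 2; // assumed "beats" per MVE op
constexpr bool HasMVEIntegerOps = true;

// Mirrors the tail of getMemoryOpCost: scalar accesses keep a cost of 1,
// vector accesses are scaled by the MVE factor.
unsigned memoryOpCost(bool SrcIsVector, unsigned BaseMemCost) {
  unsigned Scale = (HasMVEIntegerOps && SrcIsVector) ? MVEVectorCostFactor : 1;
  return Scale * BaseMemCost;
}

int main() {
  std::printf("scalar load: %u\n", memoryOpCost(false, 1));
  std::printf("vector load: %u\n", memoryOpCost(true, 1));
  return 0;
}
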
1436
1437InstructionCost
1438ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1439 unsigned AddressSpace,
1440 TTI::TargetCostKind CostKind) {
1441 if (ST->hasMVEIntegerOps()) {
1442 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1443 return ST->getMVEVectorCostFactor(CostKind);
1444 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1445 return ST->getMVEVectorCostFactor(CostKind);
1446 }
1447 if (!isa<FixedVectorType>(Src))
1448 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1449 CostKind);
1450 // Scalar cost, which is currently very high due to the inefficiency of the
1451 // generated code.
1452 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1453}
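
The masked-memory decision above reduces to: a legal MVE masked load/store costs one factor-scaled operation, anything else pays 8 per element. A standalone sketch with the legality test folded into a boolean parameter (maskedMemoryOpCost is an illustrative helper, not the LLVM API):

#include <cstdio>

// Simplified stand-in for ARMTTIImpl::getMaskedMemoryOpCost. "LegalMVEMasked"
// replaces the isLegalMaskedLoad/Store checks; "MVEFactor" replaces
// ST->getMVEVectorCostFactor(CostKind).
unsigned maskedMemoryOpCost(bool LegalMVEMasked, unsigned NumElements,
                            unsigned MVEFactor) {
  if (LegalMVEMasked)
    return MVEFactor;          // a single predicated vector access
  return NumElements * 8;      // scalarized fallback is deliberately expensive
}

int main() {
  std::printf("legal v4i32 masked load: %u\n", maskedMemoryOpCost(true, 4, 2));
  std::printf("scalarized v4i32:        %u\n", maskedMemoryOpCost(false, 4, 2));
  return 0;
}
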
1454
1455InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1456 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1457 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1458 bool UseMaskForCond, bool UseMaskForGaps) {
1459 assert(Factor >= 2 && "Invalid interleave factor")(static_cast<void> (0));
1460 assert(isa<VectorType>(VecTy) && "Expect a vector type")(static_cast<void> (0));
1461
1462 // vldN/vstN doesn't support vector types of i64/f64 element.
1463 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1464
1465 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1466 !UseMaskForCond && !UseMaskForGaps) {
1467 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1468 auto *SubVecTy =
1469 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1470
1471 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1472 // Accesses having vector types that are a multiple of 128 bits can be
1473 // matched to more than one vldN/vstN instruction.
1474 int BaseCost =
1475 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1476 if (NumElts % Factor == 0 &&
1477 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1478 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1479
1480 // Some smaller than legal interleaved patterns are cheap as we can make
1481 // use of the vmovn or vrev patterns to interleave a standard load. This is
1482 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1483 // promoted differently). The cost of 2 here is then a load and vrev or
1484 // vmovn.
1485 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1486 VecTy->isIntOrIntVectorTy() &&
1487 DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1488 return 2 * BaseCost;
1489 }
1490
1491 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1492 Alignment, AddressSpace, CostKind,
1493 UseMaskForCond, UseMaskForGaps);
1494}
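
On the vldN/vstN path above the result is Factor * BaseCost * number-of-128-bit accesses. A standalone sketch of that arithmetic; numInterleavedAccesses is a rough stand-in for TLI->getNumInterleavedAccesses and assumes the type already passed the legality check:

#include <algorithm>
#include <cstdio>

// Rough stand-in for TLI->getNumInterleavedAccesses: how many 128-bit vldN/vstN
// operations are needed to cover one de-interleaved sub-vector.
unsigned numInterleavedAccesses(unsigned SubVecBits) {
  return std::max(1u, (SubVecBits + 127) / 128);
}

// Mirrors "Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL)".
unsigned interleavedCost(unsigned Factor, unsigned BaseCost,
                         unsigned SubVecBits) {
  return Factor * BaseCost * numInterleavedAccesses(SubVecBits);
}

int main() {
  // A vld2 of <8 x i32>: two <4 x i32> sub-vectors of 128 bits each.
  std::printf("vld2 of v8i32: %u\n", interleavedCost(2, 1, 128));
  return 0;
}
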
1495
1496InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1497 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1498 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1499 using namespace PatternMatch;
1500 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1501 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1502 Alignment, CostKind, I);
1503
1504 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!")(static_cast<void> (0));
1505 auto *VTy = cast<FixedVectorType>(DataTy);
1506
1507 // TODO: Splitting, once we do that.
1508
1509 unsigned NumElems = VTy->getNumElements();
1510 unsigned EltSize = VTy->getScalarSizeInBits();
1511 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1512
1513 // For now, it is assumed that for the MVE gather instructions the loads are
1514 // all effectively serialised. This means the cost is the scalar cost
1515 // multiplied by the number of elements being loaded. This is possibly very
1516 // conservative, but even so we still end up vectorising loops because the
1517 // cost per iteration for many loops is lower than for scalar loops.
1518 InstructionCost VectorCost =
1519 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1520 // The scalarization cost should be a lot higher. We use the number of vector
1521 // elements plus the scalarization overhead.
1522 InstructionCost ScalarCost =
1523 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1524 BaseT::getScalarizationOverhead(VTy, false, true);
1525
1526 if (EltSize < 8 || Alignment < EltSize / 8)
1527 return ScalarCost;
1528
1529 unsigned ExtSize = EltSize;
1530 // Check whether there's a single user that asks for an extended type
1531 if (I != nullptr) {
1532 // Depending on the caller of this function, a gather instruction will
1533 // either have opcode Instruction::Load or be a call to the masked_gather
1534 // intrinsic
1535 if ((I->getOpcode() == Instruction::Load ||
1536 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1537 I->hasOneUse()) {
1538 const User *Us = *I->users().begin();
1539 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1540 // only allow valid type combinations
1541 unsigned TypeSize =
1542 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1543 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1544 (TypeSize == 16 && EltSize == 8)) &&
1545 TypeSize * NumElems == 128) {
1546 ExtSize = TypeSize;
1547 }
1548 }
1549 }
1550 // Check whether the input data needs to be truncated
1551 TruncInst *T;
1552 if ((I->getOpcode() == Instruction::Store ||
1553 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1554 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1555 // Only allow valid type combinations
1556 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1557 if (((EltSize == 16 && TypeSize == 32) ||
1558 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1559 TypeSize * NumElems == 128)
1560 ExtSize = TypeSize;
1561 }
1562 }
1563
1564 if (ExtSize * NumElems != 128 || NumElems < 4)
1565 return ScalarCost;
1566
1567 // Any (aligned) i32 gather will not need to be scalarised.
1568 if (ExtSize == 32)
1569 return VectorCost;
1570 // For smaller types, we need to ensure that the gep's inputs are correctly
1571 // extended from a small enough value. Other sizes (including i64) are
1572 // scalarized for now.
1573 if (ExtSize != 8 && ExtSize != 16)
1574 return ScalarCost;
1575
1576 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1577 Ptr = BC->getOperand(0);
1578 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1579 if (GEP->getNumOperands() != 2)
1580 return ScalarCost;
1581 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1582 // Scale needs to be correct (which is only relevant for i16s).
1583 if (Scale != 1 && Scale * 8 != ExtSize)
1584 return ScalarCost;
1585 // And we need to zext (not sext) the indexes from a small enough type.
1586 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1587 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1588 return VectorCost;
1589 }
1590 return ScalarCost;
1591 }
1592 return ScalarCost;
1593}
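
The tail of the gather/scatter cost above is a small legality predicate: the extended element size times the lane count must be exactly 128 bits, at least four lanes are needed, and only 8/16/32-bit elements avoid scalarization (8- and 16-bit only with suitably zero-extended GEP indices). A sketch of just that predicate (gatherStaysVector is an illustrative helper, not the LLVM API):

#include <cstdio>

// Returns true when an MVE gather/scatter of NumElems elements, each ExtSize
// bits after any extension/truncation, can be emitted directly rather than
// scalarized. Mirrors the "ExtSize * NumElems == 128" checks above.
bool gatherStaysVector(unsigned ExtSize, unsigned NumElems,
                       bool IndexZExtSmallEnough) {
  if (ExtSize * NumElems != 128 || NumElems < 4)
    return false;
  if (ExtSize == 32)
    return true;                 // aligned i32 gathers are always fine
  if (ExtSize != 8 && ExtSize != 16)
    return false;                // i64 etc. are scalarized for now
  return IndexZExtSmallEnough;   // small types need suitably zext'd GEP indices
}

int main() {
  std::printf("v4i32 gather:        %d\n", gatherStaysVector(32, 4, false));
  std::printf("v8i16 gather (zext): %d\n", gatherStaysVector(16, 8, true));
  std::printf("v2i64 gather:        %d\n", gatherStaysVector(64, 2, false));
  return 0;
}
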
1594
1595InstructionCost
1596ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1597 Optional<FastMathFlags> FMF,
1598 TTI::TargetCostKind CostKind) {
1599 if (TTI::requiresOrderedReduction(FMF))
1600 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1601
1602 EVT ValVT = TLI->getValueType(DL, ValTy);
1603 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1604 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1605 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1606
1607 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1608
1609 static const CostTblEntry CostTblAdd[]{
1610 {ISD::ADD, MVT::v16i8, 1},
1611 {ISD::ADD, MVT::v8i16, 1},
1612 {ISD::ADD, MVT::v4i32, 1},
1613 };
1614 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1615 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1616
1617 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1618}
1619
1620InstructionCost
1621ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1622 Type *ResTy, VectorType *ValTy,
1623 TTI::TargetCostKind CostKind) {
1624 EVT ValVT = TLI->getValueType(DL, ValTy);
1625 EVT ResVT = TLI->getValueType(DL, ResTy);
1626
1627 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1628 std::pair<InstructionCost, MVT> LT =
1629 TLI->getTypeLegalizationCost(DL, ValTy);
1630
1631 // The legal cases are:
1632 // VADDV u/s 8/16/32
1633 // VMLAV u/s 8/16/32
1634 // VADDLV u/s 32
1635 // VMLALV u/s 16/32
1636 // Codegen currently cannot always handle larger than legal vectors very
1637 // well, especially for predicated reductions where the mask needs to be
1638 // split, so restrict to 128bit or smaller input types.
1639 unsigned RevVTSize = ResVT.getSizeInBits();
1640 if (ValVT.getSizeInBits() <= 128 &&
1641 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1642 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1643 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1644 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1645 }
1646
1647 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1648 CostKind);
1649}
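
The condition above can be read as a small table keyed on the legalized input type. A standalone sketch with types reduced to element-width/lane-count pairs (extendedAddReductionIsLegal is an illustrative helper, not the LLVM API):

#include <cstdio>

// Mirrors the VADDV/VMLAV/VADDLV/VMLALV legality check: the legalized input
// must be a 128-bit MVE vector and the accumulator must not be wider than the
// instruction can produce.
bool extendedAddReductionIsLegal(unsigned EltBits, unsigned NumElts,
                                 unsigned ResultBits, bool IsMLA) {
  if (EltBits * NumElts > 128)
    return false;
  if (EltBits == 8 && NumElts == 16)
    return ResultBits <= 32;                  // VADDV.u8 / VMLAV.u8
  if (EltBits == 16 && NumElts == 8)
    return ResultBits <= (IsMLA ? 64u : 32u); // VMLALV allows a 64-bit accumulator
  if (EltBits == 32 && NumElts == 4)
    return ResultBits <= 64;                  // VADDLV.u32 / VMLALV.u32
  return false;
}

int main() {
  std::printf("v8i16 -> i64 MLA: %d\n", extendedAddReductionIsLegal(16, 8, 64, true));
  std::printf("v8i16 -> i64 add: %d\n", extendedAddReductionIsLegal(16, 8, 64, false));
  return 0;
}
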
1650
1651InstructionCost
1652ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1653 TTI::TargetCostKind CostKind) {
1654 switch (ICA.getID()) {
1655 case Intrinsic::get_active_lane_mask:
1656 // Currently we make a somewhat optimistic assumption that
1657 // active_lane_masks are always free. In reality it may be freely folded
1658 // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1659 // of add/icmp code. We may need to improve this in the future, but being
1660 // able to detect if it is free or not involves looking at a lot of other
1661 // code. We currently assume that the vectorizer inserted these, and knew
1662 // what it was doing in adding one.
1663 if (ST->hasMVEIntegerOps())
1664 return 0;
1665 break;
1666 case Intrinsic::sadd_sat:
1667 case Intrinsic::ssub_sat:
1668 case Intrinsic::uadd_sat:
1669 case Intrinsic::usub_sat: {
1670 if (!ST->hasMVEIntegerOps())
1671 break;
1672 Type *VT = ICA.getReturnType();
1673
1674 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1675 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1676 LT.second == MVT::v16i8) {
1677 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1678 // need to extend the type, as it uses shr(qadd(shl, shl)).
1679 unsigned Instrs =
1680 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1681 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1682 }
1683 break;
1684 }
1685 case Intrinsic::abs:
1686 case Intrinsic::smin:
1687 case Intrinsic::smax:
1688 case Intrinsic::umin:
1689 case Intrinsic::umax: {
1690 if (!ST->hasMVEIntegerOps())
1691 break;
1692 Type *VT = ICA.getReturnType();
1693
1694 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1695 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1696 LT.second == MVT::v16i8)
1697 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1698 break;
1699 }
1700 case Intrinsic::minnum:
1701 case Intrinsic::maxnum: {
1702 if (!ST->hasMVEFloatOps())
1703 break;
1704 Type *VT = ICA.getReturnType();
1705 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1706 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1707 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1708 break;
1709 }
1710 }
1711
1712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1. 'Default' branch taken. Execution continues on line 1712
2. Calling 'BasicTTIImplBase::getIntrinsicInstrCost'
1713}
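
For the saturating add/sub intrinsics handled above, the instruction count is 1 when the legalized element width matches the original type (a plain VQADD/VQSUB) and 4 when the value must first be shifted into a wider lane (shr(qadd(shl, shl))). A tiny sketch of that choice; the MVE factor of 2 used in the example is an assumption:

#include <cstdio>

// Mirrors the sadd_sat/usub_sat handling: Instrs is 1 for a native-width
// VQADD/VQSUB and 4 when shifts are needed to position a narrower value.
unsigned saturatingAddCost(unsigned LegalEltBits, unsigned OrigEltBits,
                           unsigned LegalizationFactor, unsigned MVEFactor) {
  unsigned Instrs = (LegalEltBits == OrigEltBits) ? 1 : 4;
  return LegalizationFactor * MVEFactor * Instrs;
}

int main() {
  // <8 x i16> sadd.sat maps directly onto VQADD.s16.
  std::printf("v8i16 sadd.sat: %u\n", saturatingAddCost(16, 16, 1, 2));
  // An i8 element promoted to a wider legal lane needs the shl/qadd/shr dance.
  std::printf("promoted i8:    %u\n", saturatingAddCost(16, 8, 1, 2));
  return 0;
}
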
1714
1715bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1716 if (!F->isIntrinsic())
1717 return BaseT::isLoweredToCall(F);
1718
1719 // Assume all Arm-specific intrinsics map to an instruction.
1720 if (F->getName().startswith("llvm.arm"))
1721 return false;
1722
1723 switch (F->getIntrinsicID()) {
1724 default: break;
1725 case Intrinsic::powi:
1726 case Intrinsic::sin:
1727 case Intrinsic::cos:
1728 case Intrinsic::pow:
1729 case Intrinsic::log:
1730 case Intrinsic::log10:
1731 case Intrinsic::log2:
1732 case Intrinsic::exp:
1733 case Intrinsic::exp2:
1734 return true;
1735 case Intrinsic::sqrt:
1736 case Intrinsic::fabs:
1737 case Intrinsic::copysign:
1738 case Intrinsic::floor:
1739 case Intrinsic::ceil:
1740 case Intrinsic::trunc:
1741 case Intrinsic::rint:
1742 case Intrinsic::nearbyint:
1743 case Intrinsic::round:
1744 case Intrinsic::canonicalize:
1745 case Intrinsic::lround:
1746 case Intrinsic::llround:
1747 case Intrinsic::lrint:
1748 case Intrinsic::llrint:
1749 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1750 return true;
1751 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1752 return true;
1753 // Some operations can be handled by vector instructions and assume
1754 // unsupported vectors will be expanded into supported scalar ones.
1755 // TODO Handle scalar operations properly.
1756 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1757 case Intrinsic::masked_store:
1758 case Intrinsic::masked_load:
1759 case Intrinsic::masked_gather:
1760 case Intrinsic::masked_scatter:
1761 return !ST->hasMVEIntegerOps();
1762 case Intrinsic::sadd_with_overflow:
1763 case Intrinsic::uadd_with_overflow:
1764 case Intrinsic::ssub_with_overflow:
1765 case Intrinsic::usub_with_overflow:
1766 case Intrinsic::sadd_sat:
1767 case Intrinsic::uadd_sat:
1768 case Intrinsic::ssub_sat:
1769 case Intrinsic::usub_sat:
1770 return false;
1771 }
1772
1773 return BaseT::isLoweredToCall(F);
1774}
1775
1776bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1777 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1778 EVT VT = TLI->getValueType(DL, I.getType(), true);
1779 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1780 return true;
1781
1782 // Check if an intrinsic will be lowered to a call and assume that any
1783 // other CallInst will generate a bl.
1784 if (auto *Call = dyn_cast<CallInst>(&I)) {
1785 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1786 switch(II->getIntrinsicID()) {
1787 case Intrinsic::memcpy:
1788 case Intrinsic::memset:
1789 case Intrinsic::memmove:
1790 return getNumMemOps(II) == -1;
1791 default:
1792 if (const Function *F = Call->getCalledFunction())
1793 return isLoweredToCall(F);
1794 }
1795 }
1796 return true;
1797 }
1798
1799 // FPv5 provides conversions between integer, double-precision,
1800 // single-precision, and half-precision formats.
1801 switch (I.getOpcode()) {
1802 default:
1803 break;
1804 case Instruction::FPToSI:
1805 case Instruction::FPToUI:
1806 case Instruction::SIToFP:
1807 case Instruction::UIToFP:
1808 case Instruction::FPTrunc:
1809 case Instruction::FPExt:
1810 return !ST->hasFPARMv8Base();
1811 }
1812
1813 // FIXME: Unfortunately the approach of checking the Operation Action does
1814 // not catch all cases of Legalization that use library calls. Our
1815 // Legalization step categorizes some transformations into library calls as
1816 // Custom, Expand or even Legal when doing type legalization. So for now
1817 // we have to special case for instance the SDIV of 64bit integers and the
1818 // use of floating point emulation.
1819 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1820 switch (ISD) {
1821 default:
1822 break;
1823 case ISD::SDIV:
1824 case ISD::UDIV:
1825 case ISD::SREM:
1826 case ISD::UREM:
1827 case ISD::SDIVREM:
1828 case ISD::UDIVREM:
1829 return true;
1830 }
1831 }
1832
1833 // Assume all other non-float operations are supported.
1834 if (!VT.isFloatingPoint())
1835 return false;
1836
1837 // We'll need a library call to handle most floats when using soft.
1838 if (TLI->useSoftFloat()) {
1839 switch (I.getOpcode()) {
1840 default:
1841 return true;
1842 case Instruction::Alloca:
1843 case Instruction::Load:
1844 case Instruction::Store:
1845 case Instruction::Select:
1846 case Instruction::PHI:
1847 return false;
1848 }
1849 }
1850
1851 // We'll need a libcall to perform double precision operations on a single
1852 // precision only FPU.
1853 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1854 return true;
1855
1856 // Likewise for half precision arithmetic.
1857 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1858 return true;
1859
1860 return false;
1861}
1862
1863bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1864 AssumptionCache &AC,
1865 TargetLibraryInfo *LibInfo,
1866 HardwareLoopInfo &HWLoopInfo) {
1867 // Low-overhead branches are only supported in the 'low-overhead branch'
1868 // extension of v8.1-m.
1869 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1870 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n")do { } while (false);
1871 return false;
1872 }
1873
1874 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1875 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n")do { } while (false);
1876 return false;
1877 }
1878
1879 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1880 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1881 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n")do { } while (false);
1882 return false;
1883 }
1884
1885 const SCEV *TripCountSCEV =
1886 SE.getAddExpr(BackedgeTakenCount,
1887 SE.getOne(BackedgeTakenCount->getType()));
1888
1889 // We need to store the trip count in LR, a 32-bit register.
1890 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1891 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n")do { } while (false);
1892 return false;
1893 }
1894
1895 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1896 // point in generating a hardware loop if that's going to happen.
1897
1898 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1899 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1900 switch (Call->getIntrinsicID()) {
1901 default:
1902 break;
1903 case Intrinsic::start_loop_iterations:
1904 case Intrinsic::test_start_loop_iterations:
1905 case Intrinsic::loop_decrement:
1906 case Intrinsic::loop_decrement_reg:
1907 return true;
1908 }
1909 }
1910 return false;
1911 };
1912
1913 // Scan the instructions to see if there are any that we know will turn into a
1914 // call or if this loop is already a low-overhead loop or will become a tail
1915 // predicated loop.
1916 bool IsTailPredLoop = false;
1917 auto ScanLoop = [&](Loop *L) {
1918 for (auto *BB : L->getBlocks()) {
1919 for (auto &I : *BB) {
1920 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1921 isa<InlineAsm>(I)) {
1922 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n")do { } while (false);
1923 return false;
1924 }
1925 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1926 IsTailPredLoop |=
1927 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1928 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1929 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1930 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1931 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1932 }
1933 }
1934 return true;
1935 };
1936
1937 // Visit inner loops.
1938 for (auto Inner : *L)
1939 if (!ScanLoop(Inner))
1940 return false;
1941
1942 if (!ScanLoop(L))
1943 return false;
1944
1945 // TODO: Check whether the trip count calculation is expensive. If L is the
1946 // inner loop but we know it has a low trip count, calculating that trip
1947 // count (in the parent loop) may be detrimental.
1948
1949 LLVMContext &C = L->getHeader()->getContext();
1950 HWLoopInfo.CounterInReg = true;
1951 HWLoopInfo.IsNestingLegal = false;
1952 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1953 HWLoopInfo.CountType = Type::getInt32Ty(C);
1954 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1955 return true;
1956}
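
The only arithmetic in the profitability check above is that the trip count, the backedge-taken count plus one, must fit in the 32-bit LR register. A standalone sketch of that bound (tripCountFitsInLR is an illustrative helper; the real code checks the bit width of the SCEV's unsigned range):

#include <cstdint>
#include <cstdio>

// Mirrors the check on SE.getUnsignedRangeMax(TripCountSCEV): the hardware
// loop counter lives in LR, so trip counts wider than 32 bits are rejected.
bool tripCountFitsInLR(uint64_t MaxBackedgeTakenCount) {
  uint64_t TripCount = MaxBackedgeTakenCount + 1; // BTC + 1 iterations
  return TripCount != 0 && TripCount <= UINT32_MAX; // also guards the +1 wrap
}

int main() {
  std::printf("BTC = 1000:       %d\n", tripCountFitsInLR(1000));
  std::printf("BTC = UINT32_MAX: %d\n", tripCountFitsInLR(UINT32_MAX));
  return 0;
}
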
1957
1958static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1959 // We don't allow icmps, and because we only look at single-block loops,
1960 // we simply count the icmps, i.e. there should only be 1 for the backedge.
1961 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1962 return false;
1963 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
1964 // not currently canonical, but soon will be. Code without them uses icmp, and
1965 // so is not tail predicated as per the condition above. In order to get the
1966 // same performance we treat min and max the same as an icmp for tailpred
1967 // purposes for the moment (we often rely on non-tailpred and higher VF's to
1968 // pick more optimal instructions like VQDMULH. They need to be recognized
1969 // directly by the vectorizer).
1970 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1971 if ((II->getIntrinsicID() == Intrinsic::smin ||
1972 II->getIntrinsicID() == Intrinsic::smax ||
1973 II->getIntrinsicID() == Intrinsic::umin ||
1974 II->getIntrinsicID() == Intrinsic::umax) &&
1975 ++ICmpCount > 1)
1976 return false;
1977
1978 if (isa<FCmpInst>(&I))
1979 return false;
1980
1981 // We could allow extending/narrowing FP loads/stores, but codegen is
1982 // too inefficient so reject this for now.
1983 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1984 return false;
1985
1986 // Extends have to be extending-loads
1987 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1988 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1989 return false;
1990
1991 // Truncs have to be narrowing-stores
1992 if (isa<TruncInst>(&I) )
1993 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1994 return false;
1995
1996 return true;
1997}
1998
1999// To set up a tail-predicated loop, we need to know the total number of
2000// elements processed by that loop. Thus, we need to determine the element
2001// size and:
2002// 1) it should be uniform for all operations in the vector loop, so we
2003// e.g. don't want any widening/narrowing operations.
2004// 2) it should be smaller than i64s because we don't have vector operations
2005// that work on i64s.
2006// 3) we don't want elements to be reversed or shuffled, to make sure the
2007// tail-predication masks/predicates the right lanes.
2008//
2009static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2010 const DataLayout &DL,
2011 const LoopAccessInfo *LAI) {
2012 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n")do { } while (false);
2013
2014 // If there are live-out values, it is probably a reduction. We can predicate
2015 // most reduction operations freely under MVE using a combination of
2016 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2017 // floating point and integer reductions, but don't check for operators
2018 // specifically here. If the value ends up not being a reduction (and so the
2019 // vectorizer cannot tailfold the loop), we should fall back to standard
2020 // vectorization automatically.
2021 SmallVector< Instruction *, 8 > LiveOuts;
2022 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2023 bool ReductionsDisabled =
2024 EnableTailPredication == TailPredication::EnabledNoReductions ||
2025 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2026
2027 for (auto *I : LiveOuts) {
2028 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2029 !I->getType()->isHalfTy()) {
2030 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "do { } while (false)
2031 "live-out value\n")do { } while (false);
2032 return false;
2033 }
2034 if (ReductionsDisabled) {
2035 LLVM_DEBUG(dbgs() << "Reductions not enabled\n")do { } while (false);
2036 return false;
2037 }
2038 }
2039
2040 // Next, check that all instructions can be tail-predicated.
2041 PredicatedScalarEvolution PSE = LAI->getPSE();
2042 SmallVector<Instruction *, 16> LoadStores;
2043 int ICmpCount = 0;
2044
2045 for (BasicBlock *BB : L->blocks()) {
2046 for (Instruction &I : BB->instructionsWithoutDebug()) {
2047 if (isa<PHINode>(&I))
2048 continue;
2049 if (!canTailPredicateInstruction(I, ICmpCount)) {
2050 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump())do { } while (false);
2051 return false;
2052 }
2053
2054 Type *T = I.getType();
2055 if (T->isPointerTy())
2056 T = T->getPointerElementType();
2057
2058 if (T->getScalarSizeInBits() > 32) {
2059 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump())do { } while (false);
2060 return false;
2061 }
2062 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2063 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2064 int64_t NextStride = getPtrStride(PSE, Ptr, L);
2065 if (NextStride == 1) {
2066 // TODO: for now only allow consecutive strides of 1. We could support
2067 // other strides as long as they are uniform, but let's keep it simple
2068 // for now.
2069 continue;
2070 } else if (NextStride == -1 ||
2071 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2072 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2073 LLVM_DEBUG(dbgs()do { } while (false)
2074 << "Consecutive strides of 2 found, vld2/vst2 can't "do { } while (false)
2075 "be tail-predicated.\n")do { } while (false);
2076 return false;
2077 // TODO: don't tail predicate if there is a reversed load?
2078 } else if (EnableMaskedGatherScatters) {
2079 // Gather/scatters do allow loading from arbitrary strides, at
2080 // least if they are loop invariant.
2081 // TODO: Loop variant strides should in theory work, too, but
2082 // this requires further testing.
2083 const SCEV *PtrScev =
2084 replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2085 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2086 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2087 if (PSE.getSE()->isLoopInvariant(Step, L))
2088 continue;
2089 }
2090 }
2091 LLVM_DEBUG(dbgs() << "Bad stride found, can't "do { } while (false)
2092 "tail-predicate.\n")do { } while (false);
2093 return false;
2094 }
2095 }
2096 }
2097
2098 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n")do { } while (false);
2099 return true;
2100}
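
The stride handling above roughly says: a stride of +1 is fine, strides of -1, 2 and 4 would need reversal or vld2/vld4 and so block tail-predication (assuming the default MVE interleave factor of 4), and other strides are only tolerated when masked gathers/scatters are enabled and the stride is loop-invariant. A sketch of just that decision, with those capabilities reduced to booleans (strideAllowsTailPredication is an illustrative helper):

#include <cstdio>

// Simplified stand-in for the per-access stride check in canTailPredicateLoop.
// Returns true if the access does not prevent tail-predication.
bool strideAllowsTailPredication(long Stride, bool GathersEnabled,
                                 bool StrideIsLoopInvariant) {
  if (Stride == 1)
    return true;                      // plain consecutive access
  if (Stride == -1 || Stride == 2 || Stride == 4)
    return false;                     // reversed / interleaved accesses
  // Arbitrary strides can become gathers/scatters if that is enabled and the
  // stride does not vary inside the loop.
  return GathersEnabled && StrideIsLoopInvariant;
}

int main() {
  std::printf("stride 1: %d\n", strideAllowsTailPredication(1, false, false));
  std::printf("stride 2: %d\n", strideAllowsTailPredication(2, true, true));
  std::printf("stride 7: %d\n", strideAllowsTailPredication(7, true, true));
  return 0;
}
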
2101
2102bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2103 ScalarEvolution &SE,
2104 AssumptionCache &AC,
2105 TargetLibraryInfo *TLI,
2106 DominatorTree *DT,
2107 const LoopAccessInfo *LAI) {
2108 if (!EnableTailPredication) {
2109 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n")do { } while (false);
2110 return false;
2111 }
2112
2113 // Creating a predicated vector loop is the first step for generating a
2114 // tail-predicated hardware loop, for which we need the MVE masked
2115 // load/stores instructions:
2116 if (!ST->hasMVEIntegerOps())
2117 return false;
2118
2119 // For now, restrict this to single block loops.
2120 if (L->getNumBlocks() > 1) {
2121 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "do { } while (false)
2122 "loop.\n")do { } while (false);
2123 return false;
2124 }
2125
2126 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected")(static_cast<void> (0));
2127
2128 HardwareLoopInfo HWLoopInfo(L);
2129 if (!HWLoopInfo.canAnalyze(*LI)) {
2130 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2131 "analyzable.\n")do { } while (false);
2132 return false;
2133 }
2134
2135 // This checks if we have the low-overhead branch architecture
2136 // extension, and if we will create a hardware-loop:
2137 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2138 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2139 "profitable.\n")do { } while (false);
2140 return false;
2141 }
2142
2143 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2144 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2145 "a candidate.\n")do { } while (false);
2146 return false;
2147 }
2148
2149 return canTailPredicateLoop(L, LI, SE, DL, LAI);
2150}
2151
2152bool ARMTTIImpl::emitGetActiveLaneMask() const {
2153 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2154 return false;
2155
2156 // Intrinsic @llvm.get.active.lane.mask is supported.
2157 // It is used in the MVETailPredication pass, which requires the number of
2158 // elements processed by this vector loop to setup the tail-predicated
2159 // loop.
2160 return true;
2161}
2162void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2163 TTI::UnrollingPreferences &UP,
2164 OptimizationRemarkEmitter *ORE) {
2165 // Enable upper-bound unrolling universally, not dependent upon the conditions
2166 // below.
2167 UP.UpperBound = true;
2168
2169 // Only currently enable these preferences for M-Class cores.
2170 if (!ST->isMClass())
2171 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2172
2173 // Disable loop unrolling for Oz and Os.
2174 UP.OptSizeThreshold = 0;
2175 UP.PartialOptSizeThreshold = 0;
2176 if (L->getHeader()->getParent()->hasOptSize())
2177 return;
2178
2179 SmallVector<BasicBlock*, 4> ExitingBlocks;
2180 L->getExitingBlocks(ExitingBlocks);
2181 LLVM_DEBUG(dbgs() << "Loop has:\n"do { } while (false)
2182 << "Blocks: " << L->getNumBlocks() << "\n"do { } while (false)
2183 << "Exit blocks: " << ExitingBlocks.size() << "\n")do { } while (false);
2184
2185 // Only allow one exit other than the latch. This acts as an early exit
2186 // as it mirrors the profitability calculation of the runtime unroller.
2187 if (ExitingBlocks.size() > 2)
2188 return;
2189
2190 // Limit the CFG of the loop body for targets with a branch predictor.
2191 // Allowing 4 blocks permits if-then-else diamonds in the body.
2192 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2193 return;
2194
2195 // Don't unroll vectorized loops, including the remainder loop
2196 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2197 return;
2198
2199 // Scan the loop: don't unroll loops with calls as this could prevent
2200 // inlining.
2201 InstructionCost Cost = 0;
2202 for (auto *BB : L->getBlocks()) {
2203 for (auto &I : *BB) {
2204 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2205 // scalar code.
2206 if (I.getType()->isVectorTy())
2207 return;
2208
2209 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2210 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2211 if (!isLoweredToCall(F))
2212 continue;
2213 }
2214 return;
2215 }
2216
2217 SmallVector<const Value*, 4> Operands(I.operand_values());
2218 Cost +=
2219 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2220 }
2221 }
2222
2223 // On v6m cores, there are very few registers available. We can easily end up
2224 // spilling and reloading more registers in an unrolled loop. Look at the
2225 // number of LCSSA phis as a rough measure of how many registers will need to
2226 // be live out of the loop, reducing the default unroll count if more than 1
2227 // value is needed. In the long run, all of this should be being learnt by a
2228 // machine.
2229 unsigned UnrollCount = 4;
2230 if (ST->isThumb1Only()) {
2231 unsigned ExitingValues = 0;
2232 SmallVector<BasicBlock *, 4> ExitBlocks;
2233 L->getExitBlocks(ExitBlocks);
2234 for (auto *Exit : ExitBlocks) {
2235 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2236 // only the last is expected to be needed for address operands.
2237 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2238 return PH.getNumOperands() != 1 ||
2239 !isa<GetElementPtrInst>(PH.getOperand(0));
2240 });
2241 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2242 }
2243 if (ExitingValues)
2244 UnrollCount /= ExitingValues;
2245 if (UnrollCount <= 1)
2246 return;
2247 }
2248
2249 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n")do { } while (false);
2250 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n")do { } while (false);
2251
2252 UP.Partial = true;
2253 UP.Runtime = true;
2254 UP.UnrollRemainder = true;
2255 UP.DefaultUnrollRuntimeCount = UnrollCount;
2256 UP.UnrollAndJam = true;
2257 UP.UnrollAndJamInnerLoopThreshold = 60;
2258
2259 // Force unrolling of small loops can be very useful because of the
2260 // branch-taken cost of the backedge.
2261 if (Cost < 12)
2262 UP.Force = true;
2263}
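
On Thumb1 the unroll count above starts at 4 and is divided by the number of values live out of the loop, giving up once it reaches 1. A compact sketch of that adjustment (thumb1UnrollCount is an illustrative helper; the live-out count stands in for the LCSSA-phi scan):

#include <cstdio>

// Mirrors the Thumb1 adjustment in getUnrollingPreferences: start from an
// unroll count of 4 and divide it by the number of values live out of the
// loop, giving up once the count drops to 1 or below.
unsigned thumb1UnrollCount(unsigned ExitingValues) {
  unsigned UnrollCount = 4;
  if (ExitingValues)
    UnrollCount /= ExitingValues;
  return UnrollCount <= 1 ? 0 : UnrollCount; // 0 here means "do not unroll"
}

int main() {
  std::printf("no live-outs:   %u\n", thumb1UnrollCount(0));
  std::printf("two live-outs:  %u\n", thumb1UnrollCount(2));
  std::printf("four live-outs: %u\n", thumb1UnrollCount(4));
  return 0;
}
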
2264
2265void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2266 TTI::PeelingPreferences &PP) {
2267 BaseT::getPeelingPreferences(L, SE, PP);
2268}
2269
2270bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2271 TTI::ReductionFlags Flags) const {
2272 if (!ST->hasMVEIntegerOps())
2273 return false;
2274
2275 unsigned ScalarBits = Ty->getScalarSizeInBits();
2276 switch (Opcode) {
2277 case Instruction::Add:
2278 return ScalarBits <= 64;
2279 default:
2280 return false;
2281 }
2282}
2283
2284bool ARMTTIImpl::preferPredicatedReductionSelect(
2285 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2286 if (!ST->hasMVEIntegerOps())
2287 return false;
2288 return true;
2289}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/TargetTransformInfo.h"
27#include "llvm/Analysis/TargetTransformInfoImpl.h"
28#include "llvm/CodeGen/ISDOpcodes.h"
29#include "llvm/CodeGen/TargetLowering.h"
30#include "llvm/CodeGen/TargetSubtargetInfo.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DataLayout.h"
36#include "llvm/IR/DerivedTypes.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/Operator.h"
42#include "llvm/IR/Type.h"
43#include "llvm/IR/Value.h"
44#include "llvm/Support/Casting.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/ErrorHandling.h"
47#include "llvm/Support/MachineValueType.h"
48#include "llvm/Support/MathExtras.h"
49#include "llvm/Target/TargetMachine.h"
50#include <algorithm>
51#include <cassert>
52#include <cstdint>
53#include <limits>
54#include <utility>
55
56namespace llvm {
57
58class Function;
59class GlobalValue;
60class LLVMContext;
61class ScalarEvolution;
62class SCEV;
63class TargetMachine;
64
65extern cl::opt<unsigned> PartialUnrollingThreshold;
66
67/// Base class which can be used to help build a TTI implementation.
68///
69/// This class provides as much implementation of the TTI interface as is
70/// possible using the target independent parts of the code generator.
71///
72/// In order to subclass it, your class must implement a getST() method to
73/// return the subtarget, and a getTLI() method to return the target lowering.
74/// We need these methods implemented in the derived class so that this class
75/// doesn't have to duplicate storage for them.
76template <typename T>
77class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78private:
79 using BaseT = TargetTransformInfoImplCRTPBase<T>;
80 using TTI = TargetTransformInfo;
81
82 /// Helper function to access this as a T.
83 T *thisT() { return static_cast<T *>(this); }
84
85 /// Estimate a cost of Broadcast as an extract and sequence of insert
86 /// operations.
87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88 InstructionCost Cost = 0;
89 // Broadcast cost is equal to the cost of extracting the zero'th element
90 // plus the cost of inserting it into every element of the result vector.
91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
92
93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
95 }
96 return Cost;
97 }
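
The estimate above is simply one extract of lane 0 plus one insert per result lane. A trivial sketch of that sum, with unit costs passed in instead of calling getVectorInstrCost:

#include <cstdio>

// Mirrors getBroadcastShuffleOverhead: extract element 0 once, then insert it
// into each of the NumElts lanes of the result.
unsigned broadcastShuffleOverhead(unsigned NumElts, unsigned ExtractCost,
                                  unsigned InsertCost) {
  return ExtractCost + NumElts * InsertCost;
}

int main() {
  std::printf("broadcast to 4 lanes: %u\n", broadcastShuffleOverhead(4, 1, 1));
  return 0;
}
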
98
99 /// Estimate a cost of shuffle as a sequence of extract and insert
100 /// operations.
101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102 InstructionCost Cost = 0;
103 // Shuffle cost is equal to the cost of extracting elements from its arguments
104 // plus the cost of inserting them into the result vector.
105
106 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
107 // index 0 of first vector, index 1 of second vector, index 2 of first
108 // vector and finally index 3 of second vector and insert them at index
109 // <0,1,2,3> of result vector.
110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
113 }
114 return Cost;
115 }
116
117 /// Estimate a cost of subvector extraction as a sequence of extract and
118 /// insert operations.
119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120 FixedVectorType *SubVTy) {
121 assert(VTy && SubVTy &&(static_cast<void> (0))
122 "Can only extract subvectors from vectors")(static_cast<void> (0));
123 int NumSubElts = SubVTy->getNumElements();
124 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
125 (Index + NumSubElts) <=(static_cast<void> (0))
126 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
127 "SK_ExtractSubvector index out of range")(static_cast<void> (0));
128
129 InstructionCost Cost = 0;
130 // Subvector extraction cost is equal to the cost of extracting elements from
131 // the source type plus the cost of inserting them into the result vector
132 // type.
133 for (int i = 0; i != NumSubElts; ++i) {
134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
135 i + Index);
136 Cost +=
137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
138 }
139 return Cost;
140 }
141
142 /// Estimate a cost of subvector insertion as a sequence of extract and
143 /// insert operations.
144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145 FixedVectorType *SubVTy) {
146 assert(VTy && SubVTy &&(static_cast<void> (0))
147 "Can only insert subvectors into vectors")(static_cast<void> (0));
148 int NumSubElts = SubVTy->getNumElements();
149 assert((!isa<FixedVectorType>(VTy) ||(static_cast<void> (0))
150 (Index + NumSubElts) <=(static_cast<void> (0))
151 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&(static_cast<void> (0))
152 "SK_InsertSubvector index out of range")(static_cast<void> (0));
153
154 InstructionCost Cost = 0;
155 // Subvector insertion cost is equal to the cost of extracting elements from
156 // the subvector type plus the cost of inserting them into the result vector
157 // type.
158 for (int i = 0; i != NumSubElts; ++i) {
159 Cost +=
160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
162 i + Index);
163 }
164 return Cost;
165 }
166
167 /// Local query method delegates up to T which *must* implement this!
168 const TargetSubtargetInfo *getST() const {
169 return static_cast<const T *>(this)->getST();
170 }
171
172 /// Local query method delegates up to T which *must* implement this!
173 const TargetLoweringBase *getTLI() const {
174 return static_cast<const T *>(this)->getTLI();
175 }
176
177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
178 switch (M) {
179 case TTI::MIM_Unindexed:
180 return ISD::UNINDEXED;
181 case TTI::MIM_PreInc:
182 return ISD::PRE_INC;
183 case TTI::MIM_PreDec:
184 return ISD::PRE_DEC;
185 case TTI::MIM_PostInc:
186 return ISD::POST_INC;
187 case TTI::MIM_PostDec:
188 return ISD::POST_DEC;
189 }
190 llvm_unreachable("Unexpected MemIndexedMode")__builtin_unreachable();
191 }
192
193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
194 Align Alignment,
195 bool VariableMask,
196 bool IsGatherScatter,
197 TTI::TargetCostKind CostKind) {
198 auto *VT = cast<FixedVectorType>(DataTy);
199 // Assume the target does not have support for gather/scatter operations
200 // and provide a rough estimate.
201 //
202 // First, compute the cost of the individual memory operations.
203 InstructionCost AddrExtractCost =
204 IsGatherScatter
205 ? getVectorInstrCost(Instruction::ExtractElement,
206 FixedVectorType::get(
207 PointerType::get(VT->getElementType(), 0),
208 VT->getNumElements()),
209 -1)
210 : 0;
211 InstructionCost LoadCost =
212 VT->getNumElements() *
213 (AddrExtractCost +
214 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
215
216 // Next, compute the cost of packing the result in a vector.
217 InstructionCost PackingCost = getScalarizationOverhead(
218 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
219
220 InstructionCost ConditionalCost = 0;
221 if (VariableMask) {
222 // Compute the cost of conditionally executing the memory operations with
223 // variable masks. This includes extracting the individual conditions, the
224 // branches and the PHIs to combine the results.
225 // NOTE: Estimating the cost of conditionally executing the memory
226 // operations accurately is quite difficult and the current solution
227 // provides a very rough estimate only.
228 ConditionalCost =
229 VT->getNumElements() *
230 (getVectorInstrCost(
231 Instruction::ExtractElement,
232 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
233 VT->getNumElements()),
234 -1) +
235 getCFInstrCost(Instruction::Br, CostKind) +
236 getCFInstrCost(Instruction::PHI, CostKind));
237 }
238
239 return LoadCost + PackingCost + ConditionalCost;
240 }
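
The helper above adds three components: the per-element memory operations (plus an address extraction for gathers/scatters), the cost of packing the result vector, and per-element control flow when the mask is variable. A standalone sketch with every hook replaced by a plain number (MaskedMemModel and its fields are illustrative, not LLVM APIs):

#include <cstdio>

struct MaskedMemModel {
  unsigned MemOpCost;      // one scalar load/store
  unsigned ExtractCost;    // extracting one address or mask bit
  unsigned InsertCost;     // inserting one loaded element into the result
  unsigned BranchCost;     // one conditional branch
  unsigned PhiCost;        // one PHI to merge the conditional result
};

// Mirrors LoadCost + PackingCost + ConditionalCost in the helper above
// (packing simplified to per-element inserts, as for a masked load).
unsigned commonMaskedMemoryOpCost(unsigned NumElts, bool IsGatherScatter,
                                  bool VariableMask, const MaskedMemModel &M) {
  unsigned AddrExtract = IsGatherScatter ? M.ExtractCost : 0;
  unsigned LoadCost = NumElts * (AddrExtract + M.MemOpCost);
  unsigned PackingCost = NumElts * M.InsertCost;
  unsigned ConditionalCost =
      VariableMask ? NumElts * (M.ExtractCost + M.BranchCost + M.PhiCost) : 0;
  return LoadCost + PackingCost + ConditionalCost;
}

int main() {
  MaskedMemModel M{1, 1, 1, 1, 1};
  std::printf("v4 masked gather, variable mask: %u\n",
              commonMaskedMemoryOpCost(4, true, true, M));
  return 0;
}
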
241
242protected:
243 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
244 : BaseT(DL) {}
245 virtual ~BasicTTIImplBase() = default;
246
247 using TargetTransformInfoImplBase::DL;
248
249public:
250 /// \name Scalar TTI Implementations
251 /// @{
252 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
253 unsigned AddressSpace, Align Alignment,
254 bool *Fast) const {
255 EVT E = EVT::getIntegerVT(Context, BitWidth);
256 return getTLI()->allowsMisalignedMemoryAccesses(
257 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
258 }
259
260 bool hasBranchDivergence() { return false; }
261
262 bool useGPUDivergenceAnalysis() { return false; }
263
264 bool isSourceOfDivergence(const Value *V) { return false; }
265
266 bool isAlwaysUniform(const Value *V) { return false; }
267
268 unsigned getFlatAddressSpace() {
269 // Return an invalid address space.
270 return -1;
271 }
272
273 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
274 Intrinsic::ID IID) const {
275 return false;
276 }
277
278 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
279 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
280 }
281
282 unsigned getAssumedAddrSpace(const Value *V) const {
283 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
284 }
285
286 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
287 Value *NewV) const {
288 return nullptr;
289 }
290
291 bool isLegalAddImmediate(int64_t imm) {
292 return getTLI()->isLegalAddImmediate(imm);
293 }
294
295 bool isLegalICmpImmediate(int64_t imm) {
296 return getTLI()->isLegalICmpImmediate(imm);
297 }
298
299 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
300 bool HasBaseReg, int64_t Scale,
301 unsigned AddrSpace, Instruction *I = nullptr) {
302 TargetLoweringBase::AddrMode AM;
303 AM.BaseGV = BaseGV;
304 AM.BaseOffs = BaseOffset;
305 AM.HasBaseReg = HasBaseReg;
306 AM.Scale = Scale;
307 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
308 }
309
310 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
311 const DataLayout &DL) const {
312 EVT VT = getTLI()->getValueType(DL, Ty);
313 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
314 }
315
316 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
317 const DataLayout &DL) const {
318 EVT VT = getTLI()->getValueType(DL, Ty);
319 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
320 }
321
322 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
323 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
324 }
325
326 bool isNumRegsMajorCostOfLSR() {
327 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
328 }
329
330 bool isProfitableLSRChainElement(Instruction *I) {
331 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
332 }
333
334 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
335 int64_t BaseOffset, bool HasBaseReg,
336 int64_t Scale, unsigned AddrSpace) {
337 TargetLoweringBase::AddrMode AM;
338 AM.BaseGV = BaseGV;
339 AM.BaseOffs = BaseOffset;
340 AM.HasBaseReg = HasBaseReg;
341 AM.Scale = Scale;
342 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
343 }
344
345 bool isTruncateFree(Type *Ty1, Type *Ty2) {
346 return getTLI()->isTruncateFree(Ty1, Ty2);
347 }
348
349 bool isProfitableToHoist(Instruction *I) {
350 return getTLI()->isProfitableToHoist(I);
351 }
352
353 bool useAA() const { return getST()->useAA(); }
354
355 bool isTypeLegal(Type *Ty) {
356 EVT VT = getTLI()->getValueType(DL, Ty);
357 return getTLI()->isTypeLegal(VT);
358 }
359
360 InstructionCost getRegUsageForType(Type *Ty) {
361 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
362 assert(Val >= 0 && "Negative cost!")(static_cast<void> (0));
363 return Val;
364 }
365
366 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
367 ArrayRef<const Value *> Operands) {
368 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
369 }
370
371 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
372 unsigned &JumpTableSize,
373 ProfileSummaryInfo *PSI,
374 BlockFrequencyInfo *BFI) {
375 /// Try to find the estimated number of clusters. Note that the number of
376 /// clusters identified in this function could be different from the actual
377 /// numbers found in lowering. This function ignores switches that are
378 /// lowered with a mix of jump table / bit test / BTree. This function was
379 /// initially intended to be used when estimating the cost of a switch in
380 /// the inline cost heuristic, but it's a generic cost model to be used in other
381 /// places (e.g., in loop unrolling).
382 unsigned N = SI.getNumCases();
383 const TargetLoweringBase *TLI = getTLI();
384 const DataLayout &DL = this->getDataLayout();
385
386 JumpTableSize = 0;
387 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
388
389 // Early exit if both a jump table and bit test are not allowed.
390 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
391 return N;
392
393 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
394 APInt MinCaseVal = MaxCaseVal;
395 for (auto CI : SI.cases()) {
396 const APInt &CaseVal = CI.getCaseValue()->getValue();
397 if (CaseVal.sgt(MaxCaseVal))
398 MaxCaseVal = CaseVal;
399 if (CaseVal.slt(MinCaseVal))
400 MinCaseVal = CaseVal;
401 }
402
403 // Check if suitable for a bit test
404 if (N <= DL.getIndexSizeInBits(0u)) {
405 SmallPtrSet<const BasicBlock *, 4> Dests;
406 for (auto I : SI.cases())
407 Dests.insert(I.getCaseSuccessor());
408
409 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
410 DL))
411 return 1;
412 }
413
414 // Check if suitable for a jump table.
415 if (IsJTAllowed) {
416 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
417 return N;
418 uint64_t Range =
419 (MaxCaseVal - MinCaseVal)
420 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
421 // Check whether a range of clusters is dense enough for a jump table
422 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
423 JumpTableSize = Range;
424 return 1;
425 }
426 }
427 return N;
428 }
429
430 bool shouldBuildLookupTables() {
431 const TargetLoweringBase *TLI = getTLI();
432 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
433 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
434 }
435
436 bool shouldBuildRelLookupTables() const {
437 const TargetMachine &TM = getTLI()->getTargetMachine();
438 // If non-PIC mode, do not generate a relative lookup table.
439 if (!TM.isPositionIndependent())
440 return false;
441
442 /// Relative lookup table entries consist of 32-bit offsets.
443 /// Do not generate relative lookup tables for large code models
444 /// in 64-bit architectures where 32-bit offsets might not be enough.
445 if (TM.getCodeModel() == CodeModel::Medium ||
446 TM.getCodeModel() == CodeModel::Large)
447 return false;
448
449 Triple TargetTriple = TM.getTargetTriple();
450 if (!TargetTriple.isArch64Bit())
451 return false;
452
453 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
454 // there.
455 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
456 return false;
457
458 return true;
459 }
460
461 bool haveFastSqrt(Type *Ty) {
462 const TargetLoweringBase *TLI = getTLI();
463 EVT VT = TLI->getValueType(DL, Ty);
464 return TLI->isTypeLegal(VT) &&
465 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
466 }
467
468 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
469 return true;
470 }
471
472 InstructionCost getFPOpCost(Type *Ty) {
473 // Check whether FADD is available, as a proxy for floating-point in
474 // general.
475 const TargetLoweringBase *TLI = getTLI();
476 EVT VT = TLI->getValueType(DL, Ty);
477 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
478 return TargetTransformInfo::TCC_Basic;
479 return TargetTransformInfo::TCC_Expensive;
480 }
481
482 unsigned getInliningThresholdMultiplier() { return 1; }
483 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
484
485 int getInlinerVectorBonusPercent() { return 150; }
486
487 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
488 TTI::UnrollingPreferences &UP,
489 OptimizationRemarkEmitter *ORE) {
490 // This unrolling functionality is target independent, but to provide some
491 // motivation for its intended use, for x86:
492
493 // According to the Intel 64 and IA-32 Architectures Optimization Reference
494 // Manual, Intel Core models and later have a loop stream detector (and
495 // associated uop queue) that can benefit from partial unrolling.
496 // The relevant requirements are:
497 // - The loop must have no more than 4 (8 for Nehalem and later) branches
498 // taken, and none of them may be calls.
499 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
500
501 // According to the Software Optimization Guide for AMD Family 15h
502 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
503 // and loop buffer which can benefit from partial unrolling.
504 // The relevant requirements are:
505 // - The loop must have fewer than 16 branches
506 // - The loop must have less than 40 uops in all executed loop branches
507
508 // The number of taken branches in a loop is hard to estimate here, and
509 // benchmarking has revealed that it is better not to be conservative when
510 // estimating the branch count. As a result, we'll ignore the branch limits
511 // until someone finds a case where it matters in practice.
512
513 unsigned MaxOps;
514 const TargetSubtargetInfo *ST = getST();
515 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
516 MaxOps = PartialUnrollingThreshold;
517 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
518 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
519 else
520 return;
521
522 // Scan the loop: don't unroll loops with calls.
523 for (BasicBlock *BB : L->blocks()) {
524 for (Instruction &I : *BB) {
525 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
526 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
527 if (!thisT()->isLoweredToCall(F))
528 continue;
529 }
530
531 if (ORE) {
532 ORE->emit([&]() {
533 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
534 L->getHeader())
535 << "advising against unrolling the loop because it "
536 "contains a "
537 << ore::NV("Call", &I);
538 });
539 }
540 return;
541 }
542 }
543 }
544
545 // Enable runtime and partial unrolling up to the specified size.
546 // Enable using trip count upper bound to unroll loops.
547 UP.Partial = UP.Runtime = UP.UpperBound = true;
548 UP.PartialThreshold = MaxOps;
549
550 // Avoid unrolling when optimizing for size.
551 UP.OptSizeThreshold = 0;
552 UP.PartialOptSizeThreshold = 0;
553
554     // Set the number of instructions optimized away when the "back edge"
555     // becomes a "fall through" to the default value of 2.
556 UP.BEInsns = 2;
557 }
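A standalone sketch of the MaxOps selection above (lines 513-520): an explicit PartialUnrollingThreshold override wins, otherwise the subtarget's LoopMicroOpBufferSize is used, otherwise the preferences are left untouched. Illustration only; the struct and function names are stand-ins, not LLVM API.

    #include <optional>

    struct SchedModelSketch { unsigned LoopMicroOpBufferSize = 0; };

    // Mirrors the threshold selection at lines 513-520.
    std::optional<unsigned> pickMaxOps(std::optional<unsigned> CmdLineThreshold,
                                       const SchedModelSketch &SM) {
      if (CmdLineThreshold)              // an explicit threshold was given
        return *CmdLineThreshold;
      if (SM.LoopMicroOpBufferSize > 0)  // size partial unrolling to the uop buffer
        return SM.LoopMicroOpBufferSize;
      return std::nullopt;               // no buffer info: leave preferences untouched
    }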
558
559 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
560 TTI::PeelingPreferences &PP) {
561 PP.PeelCount = 0;
562 PP.AllowPeeling = true;
563 PP.AllowLoopNestsPeeling = false;
564 PP.PeelProfiledIterations = true;
565 }
566
567 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
568 AssumptionCache &AC,
569 TargetLibraryInfo *LibInfo,
570 HardwareLoopInfo &HWLoopInfo) {
571 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
572 }
573
574 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
575 AssumptionCache &AC, TargetLibraryInfo *TLI,
576 DominatorTree *DT,
577 const LoopAccessInfo *LAI) {
578 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
579 }
580
581 bool emitGetActiveLaneMask() {
582 return BaseT::emitGetActiveLaneMask();
583 }
584
585 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
586 IntrinsicInst &II) {
587 return BaseT::instCombineIntrinsic(IC, II);
588 }
589
590 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
591 IntrinsicInst &II,
592 APInt DemandedMask,
593 KnownBits &Known,
594 bool &KnownBitsComputed) {
595 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
596 KnownBitsComputed);
597 }
598
599 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
600 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
601 APInt &UndefElts2, APInt &UndefElts3,
602 std::function<void(Instruction *, unsigned, APInt, APInt &)>
603 SimplifyAndSetOp) {
604 return BaseT::simplifyDemandedVectorEltsIntrinsic(
605 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
606 SimplifyAndSetOp);
607 }
608
609 InstructionCost getInstructionLatency(const Instruction *I) {
610 if (isa<LoadInst>(I))
611 return getST()->getSchedModel().DefaultLoadLatency;
612
613 return BaseT::getInstructionLatency(I);
614 }
615
616 virtual Optional<unsigned>
617 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
618 return Optional<unsigned>(
619 getST()->getCacheSize(static_cast<unsigned>(Level)));
620 }
621
622 virtual Optional<unsigned>
623 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
624 Optional<unsigned> TargetResult =
625 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
626
627 if (TargetResult)
628 return TargetResult;
629
630 return BaseT::getCacheAssociativity(Level);
631 }
632
633 virtual unsigned getCacheLineSize() const {
634 return getST()->getCacheLineSize();
635 }
636
637 virtual unsigned getPrefetchDistance() const {
638 return getST()->getPrefetchDistance();
639 }
640
641 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
642 unsigned NumStridedMemAccesses,
643 unsigned NumPrefetches,
644 bool HasCall) const {
645 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
646 NumPrefetches, HasCall);
647 }
648
649 virtual unsigned getMaxPrefetchIterationsAhead() const {
650 return getST()->getMaxPrefetchIterationsAhead();
651 }
652
653 virtual bool enableWritePrefetching() const {
654 return getST()->enableWritePrefetching();
655 }
656
657 /// @}
658
659 /// \name Vector TTI Implementations
660 /// @{
661
662 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
663 return TypeSize::getFixed(32);
664 }
665
666 Optional<unsigned> getMaxVScale() const { return None; }
667
668 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
669 /// are set if the demanded result elements need to be inserted and/or
670 /// extracted from vectors.
671 InstructionCost getScalarizationOverhead(VectorType *InTy,
672 const APInt &DemandedElts,
673 bool Insert, bool Extract) {
674 /// FIXME: a bitfield is not a reasonable abstraction for talking about
675 /// which elements are needed from a scalable vector
676 auto *Ty = cast<FixedVectorType>(InTy);
677
678 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&(static_cast<void> (0))
679 "Vector size mismatch")(static_cast<void> (0));
680
681 InstructionCost Cost = 0;
682
683 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
684 if (!DemandedElts[i])
685 continue;
686 if (Insert)
687 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
688 if (Extract)
689 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
690 }
691
692 return Cost;
693 }
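A standalone sketch of the per-lane counting performed by the loop above (lines 683-690), assuming a flat cost of 1 per insertelement/extractelement; the real code queries getVectorInstrCost for each lane. Illustration only, not LLVM API.

    #include <cstdint>

    // Count insert/extract operations for the demanded lanes of a fixed-width
    // vector, with a stand-in cost of 1 per element operation.
    unsigned scalarizationOverheadSketch(uint64_t DemandedElts, unsigned NumElts,
                                         bool Insert, bool Extract) {
      unsigned Cost = 0;
      for (unsigned I = 0; I < NumElts; ++I) {
        if (!(DemandedElts & (uint64_t(1) << I)))
          continue;                      // lane not demanded: free
        if (Insert)
          Cost += 1;                     // one insertelement per demanded lane
        if (Extract)
          Cost += 1;                     // one extractelement per demanded lane
      }
      return Cost;
    }
    // Example: scalarizationOverheadSketch(0b1010, 4, true, false) == 2.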
694
695 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
696 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
697 bool Extract) {
698 auto *Ty = cast<FixedVectorType>(InTy);
699
700 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
701 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
702 }
703
704   /// Estimate the overhead of scalarizing an instruction's unique
705   /// non-constant operands. The (potentially vector) types to use for each
706   /// argument are passed via Tys.
707 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
708 ArrayRef<Type *> Tys) {
709 assert(Args.size() == Tys.size() && "Expected matching Args and Tys")(static_cast<void> (0));
710
711 InstructionCost Cost = 0;
712 SmallPtrSet<const Value*, 4> UniqueOperands;
713 for (int I = 0, E = Args.size(); I != E; I++) {
714 // Disregard things like metadata arguments.
715 const Value *A = Args[I];
716 Type *Ty = Tys[I];
717 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
718 !Ty->isPtrOrPtrVectorTy())
719 continue;
720
721 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
722 if (auto *VecTy = dyn_cast<VectorType>(Ty))
723 Cost += getScalarizationOverhead(VecTy, false, true);
724 }
725 }
726
727 return Cost;
728 }
729
730 /// Estimate the overhead of scalarizing the inputs and outputs of an
731 /// instruction, with return type RetTy and arguments Args of type Tys. If
732 /// Args are unknown (empty), then the cost associated with one argument is
733 /// added as a heuristic.
734 InstructionCost getScalarizationOverhead(VectorType *RetTy,
735 ArrayRef<const Value *> Args,
736 ArrayRef<Type *> Tys) {
737 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
738 if (!Args.empty())
739 Cost += getOperandsScalarizationOverhead(Args, Tys);
740 else
741 // When no information on arguments is provided, we add the cost
742 // associated with one argument as a heuristic.
743 Cost += getScalarizationOverhead(RetTy, false, true);
744
745 return Cost;
746 }
747
748 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
749
750 InstructionCost getArithmeticInstrCost(
751 unsigned Opcode, Type *Ty,
752 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
753 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
754 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
755 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
756 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
757 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
758 const Instruction *CxtI = nullptr) {
759 // Check if any of the operands are vector operands.
760 const TargetLoweringBase *TLI = getTLI();
761 int ISD = TLI->InstructionOpcodeToISD(Opcode);
762 assert(ISD && "Invalid opcode")(static_cast<void> (0));
763
764 // TODO: Handle more cost kinds.
765 if (CostKind != TTI::TCK_RecipThroughput)
766 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
767 Opd1Info, Opd2Info,
768 Opd1PropInfo, Opd2PropInfo,
769 Args, CxtI);
770
771 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
772
773 bool IsFloat = Ty->isFPOrFPVectorTy();
774 // Assume that floating point arithmetic operations cost twice as much as
775 // integer operations.
776 InstructionCost OpCost = (IsFloat ? 2 : 1);
777
778 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
779 // The operation is legal. Assume it costs 1.
780 // TODO: Once we have extract/insert subvector cost we need to use them.
781 return LT.first * OpCost;
782 }
783
784 if (!TLI->isOperationExpand(ISD, LT.second)) {
785 // If the operation is custom lowered, then assume that the code is twice
786 // as expensive.
787 return LT.first * 2 * OpCost;
788 }
789
790 // An 'Expand' of URem and SRem is special because it may default
791 // to expanding the operation into a sequence of sub-operations
792 // i.e. X % Y -> X-(X/Y)*Y.
793 if (ISD == ISD::UREM || ISD == ISD::SREM) {
794 bool IsSigned = ISD == ISD::SREM;
795 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
796 LT.second) ||
797 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
798 LT.second)) {
799 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
800 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
801 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
802 Opd2PropInfo);
803 InstructionCost MulCost =
804 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
805 InstructionCost SubCost =
806 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
807 return DivCost + MulCost + SubCost;
808 }
809 }
810
811 // We cannot scalarize scalable vectors, so return Invalid.
812 if (isa<ScalableVectorType>(Ty))
813 return InstructionCost::getInvalid();
814
815 // Else, assume that we need to scalarize this op.
816     // TODO: If one of the types gets legalized by splitting, handle this
817 // similarly to what getCastInstrCost() does.
818 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
819 InstructionCost Cost = thisT()->getArithmeticInstrCost(
820 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
821 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
822       // Return the cost of multiple scalar invocations plus the cost of
823 // inserting and extracting the values.
824 SmallVector<Type *> Tys(Args.size(), Ty);
825 return getScalarizationOverhead(VTy, Args, Tys) +
826 VTy->getNumElements() * Cost;
827 }
828
829 // We don't know anything about this scalar instruction.
830 return OpCost;
831 }
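The URem/SRem branch above (lines 793-808) prices the remainder via the identity X % Y == X - (X / Y) * Y, i.e. one division, one multiplication and one subtraction, but only when the target has a legal or custom divide. A minimal sketch of that pricing, with each per-operation cost passed in as a plain number standing in for the corresponding getArithmeticInstrCost query.

    // Sketch of the srem/urem expansion pricing at lines 793-808:
    //   X % Y  ->  X - (X / Y) * Y
    unsigned remExpansionCostSketch(bool HasLegalDiv, unsigned DivCost,
                                    unsigned MulCost, unsigned SubCost,
                                    unsigned FallbackCost) {
      if (!HasLegalDiv)
        return FallbackCost;              // e.g. the scalarization path below
      return DivCost + MulCost + SubCost; // one div + one mul + one sub
    }
    // Example: DivCost = 20, MulCost = 3, SubCost = 1 gives a remainder
    // estimate of 24 reciprocal-throughput units.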
832
833 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
834 ArrayRef<int> Mask) const {
835 int Limit = Mask.size() * 2;
836 if (Mask.empty() ||
837 // Extra check required by isSingleSourceMaskImpl function (called by
838 // ShuffleVectorInst::isSingleSourceMask).
839 any_of(Mask, [Limit](int I) { return I >= Limit; }))
840 return Kind;
841 switch (Kind) {
842 case TTI::SK_PermuteSingleSrc:
843 if (ShuffleVectorInst::isReverseMask(Mask))
844 return TTI::SK_Reverse;
845 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
846 return TTI::SK_Broadcast;
847 break;
848 case TTI::SK_PermuteTwoSrc:
849 if (ShuffleVectorInst::isSelectMask(Mask))
850 return TTI::SK_Select;
851 if (ShuffleVectorInst::isTransposeMask(Mask))
852 return TTI::SK_Transpose;
853 break;
854 case TTI::SK_Select:
855 case TTI::SK_Reverse:
856 case TTI::SK_Broadcast:
857 case TTI::SK_Transpose:
858 case TTI::SK_InsertSubvector:
859 case TTI::SK_ExtractSubvector:
860 case TTI::SK_Splice:
861 break;
862 }
863 return Kind;
864 }
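A couple of concrete masks that the refinement above recognises, plus a simplified reverse-mask test. Illustration only; the real checks live in ShuffleVectorInst and also tolerate undef (-1) lanes, which this sketch does not.

    #include <vector>

    // For 4 lanes: <3,2,1,0> turns SK_PermuteSingleSrc into SK_Reverse above,
    // while <0,0,0,0> (a zero-element splat) becomes SK_Broadcast.
    bool isReverseMaskSketch(const std::vector<int> &Mask) {
      for (size_t I = 0, E = Mask.size(); I != E; ++I)
        if (Mask[I] != static_cast<int>(E - 1 - I))
          return false;
      return true;
    }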
865
866 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
867 ArrayRef<int> Mask, int Index,
868 VectorType *SubTp) {
869
870 switch (improveShuffleKindFromMask(Kind, Mask)) {
871 case TTI::SK_Broadcast:
872 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
873 case TTI::SK_Select:
874 case TTI::SK_Splice:
875 case TTI::SK_Reverse:
876 case TTI::SK_Transpose:
877 case TTI::SK_PermuteSingleSrc:
878 case TTI::SK_PermuteTwoSrc:
879 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
880 case TTI::SK_ExtractSubvector:
881 return getExtractSubvectorOverhead(Tp, Index,
882 cast<FixedVectorType>(SubTp));
883 case TTI::SK_InsertSubvector:
884 return getInsertSubvectorOverhead(Tp, Index,
885 cast<FixedVectorType>(SubTp));
886 }
887 llvm_unreachable("Unknown TTI::ShuffleKind")__builtin_unreachable();
888 }
889
890 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
891 TTI::CastContextHint CCH,
892 TTI::TargetCostKind CostKind,
893 const Instruction *I = nullptr) {
894 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
895 return 0;
896
897 const TargetLoweringBase *TLI = getTLI();
898 int ISD = TLI->InstructionOpcodeToISD(Opcode);
899 assert(ISD && "Invalid opcode")(static_cast<void> (0));
900 std::pair<InstructionCost, MVT> SrcLT =
901 TLI->getTypeLegalizationCost(DL, Src);
902 std::pair<InstructionCost, MVT> DstLT =
903 TLI->getTypeLegalizationCost(DL, Dst);
904
905 TypeSize SrcSize = SrcLT.second.getSizeInBits();
906 TypeSize DstSize = DstLT.second.getSizeInBits();
907 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
908 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
909
910 switch (Opcode) {
911 default:
912 break;
913 case Instruction::Trunc:
914 // Check for NOOP conversions.
915 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
916 return 0;
917 LLVM_FALLTHROUGH[[gnu::fallthrough]];
918 case Instruction::BitCast:
919       // Bitcasts between types that are legalized to the same type are free;
920       // assume int to/from ptr of the same size is also free.
921 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
922 SrcSize == DstSize)
923 return 0;
924 break;
925 case Instruction::FPExt:
926 if (I && getTLI()->isExtFree(I))
927 return 0;
928 break;
929 case Instruction::ZExt:
930 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
931 return 0;
932 LLVM_FALLTHROUGH[[gnu::fallthrough]];
933 case Instruction::SExt:
934 if (I && getTLI()->isExtFree(I))
935 return 0;
936
937 // If this is a zext/sext of a load, return 0 if the corresponding
938 // extending load exists on target and the result type is legal.
939 if (CCH == TTI::CastContextHint::Normal) {
940 EVT ExtVT = EVT::getEVT(Dst);
941 EVT LoadVT = EVT::getEVT(Src);
942 unsigned LType =
943 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
944 if (DstLT.first == SrcLT.first &&
945 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
946 return 0;
947 }
948 break;
949 case Instruction::AddrSpaceCast:
950 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
951 Dst->getPointerAddressSpace()))
952 return 0;
953 break;
954 }
955
956 auto *SrcVTy = dyn_cast<VectorType>(Src);
957 auto *DstVTy = dyn_cast<VectorType>(Dst);
958
959 // If the cast is marked as legal (or promote) then assume low cost.
960 if (SrcLT.first == DstLT.first &&
961 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
962 return SrcLT.first;
963
964 // Handle scalar conversions.
965 if (!SrcVTy && !DstVTy) {
966       // Just check the op cost. If the operation is legal then assume it
967       // costs 1.
968 if (!TLI->isOperationExpand(ISD, DstLT.second))
969 return 1;
970
971       // Assume that illegal scalar instructions are expensive.
972 return 4;
973 }
974
975 // Check vector-to-vector casts.
976 if (DstVTy && SrcVTy) {
977 // If the cast is between same-sized registers, then the check is simple.
978 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
979
980 // Assume that Zext is done using AND.
981 if (Opcode == Instruction::ZExt)
982 return SrcLT.first;
983
984 // Assume that sext is done using SHL and SRA.
985 if (Opcode == Instruction::SExt)
986 return SrcLT.first * 2;
987
988         // Just check the op cost. If the operation is legal then assume it
989         // costs 1 and multiply by the
990         // type-legalization overhead.
991 if (!TLI->isOperationExpand(ISD, DstLT.second))
992 return SrcLT.first * 1;
993 }
994
995 // If we are legalizing by splitting, query the concrete TTI for the cost
996 // of casting the original vector twice. We also need to factor in the
997 // cost of the split itself. Count that as 1, to be consistent with
998 // TLI->getTypeLegalizationCost().
999 bool SplitSrc =
1000 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1001 TargetLowering::TypeSplitVector;
1002 bool SplitDst =
1003 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1004 TargetLowering::TypeSplitVector;
1005 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1006 DstVTy->getElementCount().isVector()) {
1007 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1008 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1009 T *TTI = static_cast<T *>(this);
1010 // If both types need to be split then the split is free.
1011 InstructionCost SplitCost =
1012 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1013 return SplitCost +
1014 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1015 CostKind, I));
1016 }
1017
1018      // The scalarization cost is Invalid; we can't assume any number of elements.
1019 if (isa<ScalableVectorType>(DstVTy))
1020 return InstructionCost::getInvalid();
1021
1022 // In other cases where the source or destination are illegal, assume
1023 // the operation will get scalarized.
1024 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1025 InstructionCost Cost = thisT()->getCastInstrCost(
1026 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1027
1028      // Return the cost of multiple scalar invocations plus the cost of
1029 // inserting and extracting the values.
1030 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1031 }
1032
1033    // We already handled vector-to-vector and scalar-to-scalar conversions.
1034    // This is where we handle bitcast between vectors and scalars.
1035    // We need to assume that the conversion is scalarized
1036    // in one way or another.
1037 if (Opcode == Instruction::BitCast) {
1038 // Illegal bitcasts are done by storing and loading from a stack slot.
1039 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1040 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1041 }
1042
1043 llvm_unreachable("Unhandled cast")__builtin_unreachable();
1044 }
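The split-vector branch above (lines 995-1016) charges twice the cost of casting the half-width vectors, plus one split when only one side needs splitting. A standalone sketch of that arithmetic; SplitCost and HalfCastCost stand in for getVectorSplitCost and the recursive getCastInstrCost call.

    // Sketch of the split-vector cast pricing at lines 1010-1015.  The real
    // code only reaches this when SplitSrc || SplitDst holds.
    unsigned splitCastCostSketch(bool SplitSrc, bool SplitDst, unsigned SplitCost,
                                 unsigned HalfCastCost) {
      // If both sides are split, the split itself is considered free.
      unsigned Split = (SplitSrc && SplitDst) ? 0 : SplitCost;
      return Split + 2 * HalfCastCost;   // cast each half once
    }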
1045
1046 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1047 VectorType *VecTy, unsigned Index) {
1048 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1049 Index) +
1050 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1051 TTI::CastContextHint::None,
1052 TTI::TCK_RecipThroughput);
1053 }
1054
1055 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1056 const Instruction *I = nullptr) {
1057 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1058 }
1059
1060 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1061 CmpInst::Predicate VecPred,
1062 TTI::TargetCostKind CostKind,
1063 const Instruction *I = nullptr) {
1064 const TargetLoweringBase *TLI = getTLI();
1065 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1066 assert(ISD && "Invalid opcode")(static_cast<void> (0));
1067
1068 // TODO: Handle other cost kinds.
1069 if (CostKind != TTI::TCK_RecipThroughput)
61: Assuming 'CostKind' is equal to TCK_RecipThroughput
62: Taking false branch
1070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1071 I);
1072
1073 // Selects on vectors are actually vector selects.
1074 if (ISD == ISD::SELECT) {
63: Assuming 'ISD' is equal to SELECT
64: Taking true branch
1075 assert(CondTy && "CondTy must exist")(static_cast<void> (0));
1076 if (CondTy->isVectorTy())
65: Called C++ object pointer is null
1077 ISD = ISD::VSELECT;
1078 }
1079 std::pair<InstructionCost, MVT> LT =
1080 TLI->getTypeLegalizationCost(DL, ValTy);
1081
1082 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1083 !TLI->isOperationExpand(ISD, LT.second)) {
1084 // The operation is legal. Assume it costs 1. Multiply
1085 // by the type-legalization overhead.
1086 return LT.first * 1;
1087 }
1088
1089 // Otherwise, assume that the cast is scalarized.
1090    // TODO: If one of the types gets legalized by splitting, handle this
1091 // similarly to what getCastInstrCost() does.
1092 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1093 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1094 if (CondTy)
1095 CondTy = CondTy->getScalarType();
1096 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1097 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1098
1099      // Return the cost of multiple scalar invocations plus the cost of
1100 // inserting and extracting the values.
1101 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1102 }
1103
1104 // Unknown scalar opcode.
1105 return 1;
1106 }
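With NDEBUG the assert at line 1075 is compiled away, so the path reported above reaches the CondTy->isVectorTy() call at line 1076 with CondTy possibly null (the value initialised at step 37 and passed at step 38). A minimal guarded form of that check, shown only to illustrate the analyzer's complaint, not as an upstream patch:

    // Null-safe variant of lines 1074-1078: only consult CondTy when present.
    if (ISD == ISD::SELECT && CondTy && CondTy->isVectorTy())
      ISD = ISD::VSELECT;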
1107
1108 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1109 unsigned Index) {
1110 std::pair<InstructionCost, MVT> LT =
1111 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1112
1113 return LT.first;
1114 }
1115
1116 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1117 MaybeAlign Alignment, unsigned AddressSpace,
1118 TTI::TargetCostKind CostKind,
1119 const Instruction *I = nullptr) {
1120 assert(!Src->isVoidTy() && "Invalid type")(static_cast<void> (0));
1121 // Assume types, such as structs, are expensive.
1122 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1123 return 4;
1124 std::pair<InstructionCost, MVT> LT =
1125 getTLI()->getTypeLegalizationCost(DL, Src);
1126
1127 // Assuming that all loads of legal types cost 1.
1128 InstructionCost Cost = LT.first;
1129 if (CostKind != TTI::TCK_RecipThroughput)
1130 return Cost;
1131
1132 if (Src->isVectorTy() &&
1133 // In practice it's not currently possible to have a change in lane
1134 // length for extending loads or truncating stores so both types should
1135 // have the same scalable property.
1136 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1137 LT.second.getSizeInBits())) {
1138 // This is a vector load that legalizes to a larger type than the vector
1139 // itself. Unless the corresponding extending load or truncating store is
1140 // legal, then this will scalarize.
1141 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1142 EVT MemVT = getTLI()->getValueType(DL, Src);
1143 if (Opcode == Instruction::Store)
1144 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1145 else
1146 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1147
1148 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1149 // This is a vector load/store for some illegal type that is scalarized.
1150 // We must account for the cost of building or decomposing the vector.
1151 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1152 Opcode != Instruction::Store,
1153 Opcode == Instruction::Store);
1154 }
1155 }
1156
1157 return Cost;
1158 }
1159
1160 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1161 Align Alignment, unsigned AddressSpace,
1162 TTI::TargetCostKind CostKind) {
1163 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1164 CostKind);
1165 }
1166
1167 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1168 const Value *Ptr, bool VariableMask,
1169 Align Alignment,
1170 TTI::TargetCostKind CostKind,
1171 const Instruction *I = nullptr) {
1172 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1173 true, CostKind);
1174 }
1175
1176 InstructionCost getInterleavedMemoryOpCost(
1177 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1178 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1179 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1180 auto *VT = cast<FixedVectorType>(VecTy);
1181
1182 unsigned NumElts = VT->getNumElements();
1183 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")(static_cast<void> (0));
1184
1185 unsigned NumSubElts = NumElts / Factor;
1186 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1187
1188 // Firstly, the cost of load/store operation.
1189 InstructionCost Cost;
1190 if (UseMaskForCond || UseMaskForGaps)
1191 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1192 AddressSpace, CostKind);
1193 else
1194 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1195 CostKind);
1196
1197 // Legalize the vector type, and get the legalized and unlegalized type
1198 // sizes.
1199 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1200 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1201 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1202
1203 // Scale the cost of the memory operation by the fraction of legalized
1204 // instructions that will actually be used. We shouldn't account for the
1205 // cost of dead instructions since they will be removed.
1206 //
1207 // E.g., An interleaved load of factor 8:
1208 // %vec = load <16 x i64>, <16 x i64>* %ptr
1209 // %v0 = shufflevector %vec, undef, <0, 8>
1210 //
1211 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1212 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1213 // type). The other loads are unused.
1214 //
1215 // TODO: Note that legalization can turn masked loads/stores into unmasked
1216 // (legalized) loads/stores. This can be reflected in the cost.
1217 if (VecTySize > VecTyLTSize) {
1218 // The number of loads of a legal type it will take to represent a load
1219 // of the unlegalized vector type.
1220 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1221
1222 // The number of elements of the unlegalized type that correspond to a
1223 // single legal instruction.
1224 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1225
1226 // Determine which legal instructions will be used.
1227 BitVector UsedInsts(NumLegalInsts, false);
1228 for (unsigned Index : Indices)
1229 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1230 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1231
1232 // Scale the cost of the load by the fraction of legal instructions that
1233 // will be used.
1234 Cost *= UsedInsts.count() / NumLegalInsts;
1235 }
1236
1237    // Then add the cost of the interleave operation.
1238 assert(Indices.size() <= Factor &&(static_cast<void> (0))
1239 "Interleaved memory op has too many members")(static_cast<void> (0));
1240 if (Opcode == Instruction::Load) {
1241      // The interleave cost is similar to extracting the sub vectors' elements
1242      // from the wide vector and inserting them into the sub vectors.
1243 //
1244 // E.g. An interleaved load of factor 2 (with one member of index 0):
1245 // %vec = load <8 x i32>, <8 x i32>* %ptr
1246 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1247 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1248 // <8 x i32> vector and insert them into a <4 x i32> vector.
1249 for (unsigned Index : Indices) {
1250 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1251
1252 // Extract elements from loaded vector for each sub vector.
1253 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1254 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1255 Index + Elm * Factor);
1256 }
1257
1258 InstructionCost InsSubCost = 0;
1259 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1260 InsSubCost +=
1261 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);
1262
1263 Cost += Indices.size() * InsSubCost;
1264 } else {
1265      // The interleave cost is to extract elements from the sub vectors and
1266      // insert them into the wide vector.
1267 //
1268 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1269 // (using VF=4):
1270 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1271 // %gaps.mask = <true, true, false, true, true, false,
1272 // true, true, false, true, true, false>
1273 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1274 // i32 Align, <12 x i1> %gaps.mask
1275 // The cost is estimated as extract all elements (of actual members,
1276 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1277 // i32> vector.
1278 InstructionCost ExtSubCost = 0;
1279 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1280 ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
1281 SubVT, Elm);
1282 Cost += ExtSubCost * Indices.size();
1283
1284 for (unsigned Index : Indices) {
1285 assert(Index < Factor && "Invalid index for interleaved memory op")(static_cast<void> (0));
1286
1287        // Insert the elements of each sub vector into the wide vector.
1288 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1289 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
1290 Index + Elm * Factor);
1291 }
1292 }
1293
1294 if (!UseMaskForCond)
1295 return Cost;
1296
1297 Type *I8Type = Type::getInt8Ty(VT->getContext());
1298 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1299 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1300
1301    // The Mask shuffling cost is to extract all the elements of the Mask
1302    // and insert each of them Factor times into the wide vector:
1303 //
1304 // E.g. an interleaved group with factor 3:
1305 // %mask = icmp ult <8 x i32> %vec1, %vec2
1306 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1307 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1308 // The cost is estimated as extract all mask elements from the <8xi1> mask
1309 // vector and insert them factor times into the <24xi1> shuffled mask
1310 // vector.
1311 for (unsigned i = 0; i < NumSubElts; i++)
1312 Cost +=
1313 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1314
1315 for (unsigned i = 0; i < NumElts; i++)
1316 Cost +=
1317 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1318
1319 // The Gaps mask is invariant and created outside the loop, therefore the
1320 // cost of creating it is not accounted for here. However if we have both
1321 // a MaskForGaps and some other mask that guards the execution of the
1322 // memory access, we need to account for the cost of And-ing the two masks
1323 // inside the loop.
1324 if (UseMaskForGaps)
1325 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1326 CostKind);
1327
1328 return Cost;
1329 }
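The factor-8 example in the comments above (a <16 x i64> load legalized to eight v2i64 loads, with one member at index 0) can be checked with a small standalone sketch of the UsedInsts scaling at lines 1217-1235. Illustration only; it uses plain integers in place of InstructionCost and multiplies before dividing so the 2/8 fraction is not truncated.

    #include <vector>

    // Mark the legal loads touched by each requested member and scale the
    // memory-op cost by the fraction of legal loads actually used.
    unsigned scaleByUsedInstsSketch(unsigned Cost, unsigned NumElts,
                                    unsigned Factor, unsigned NumLegalInsts,
                                    const std::vector<unsigned> &Indices) {
      unsigned NumSubElts = NumElts / Factor;
      unsigned NumEltsPerLegalInst = (NumElts + NumLegalInsts - 1) / NumLegalInsts;
      std::vector<bool> Used(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          Used[(Index + Elt * Factor) / NumEltsPerLegalInst] = true;
      unsigned UsedCount = 0;
      for (bool B : Used)
        UsedCount += B;
      return Cost * UsedCount / NumLegalInsts;
    }
    // Example: NumElts=16, Factor=8, NumLegalInsts=8, Indices={0} touches legal
    // loads 0 and 4 (elements [0:1] and [8:9]), so only 2 of the 8 loads count.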
1330
1331 /// Get intrinsic cost based on arguments.
1332 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1333 TTI::TargetCostKind CostKind) {
1334 // Check for generically free intrinsics.
1335 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
3: Calling 'InstructionCost::operator=='
8: Returning from 'InstructionCost::operator=='
9: Taking false branch
1336 return 0;
1337
1338 // Assume that target intrinsics are cheap.
1339 Intrinsic::ID IID = ICA.getID();
1340 if (Function::isTargetIntrinsic(IID))
10: Assuming the condition is false
11: Taking false branch
1341 return TargetTransformInfo::TCC_Basic;
1342
1343 if (ICA.isTypeBasedOnly())
12: Calling 'IntrinsicCostAttributes::isTypeBasedOnly'
18: Returning from 'IntrinsicCostAttributes::isTypeBasedOnly'
19: Taking false branch
1344 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1345
1346 Type *RetTy = ICA.getReturnType();
1347
1348 ElementCount RetVF =
1349 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
20: '?' condition is false
1350 : ElementCount::getFixed(1));
1351 const IntrinsicInst *I = ICA.getInst();
1352 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1353 FastMathFlags FMF = ICA.getFlags();
1354 switch (IID) {
21: Control jumps to 'case fshl:' at line 1445
1355 default:
1356 break;
1357
1358 case Intrinsic::cttz:
1359 // FIXME: If necessary, this should go in target-specific overrides.
1360 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1361 return TargetTransformInfo::TCC_Basic;
1362 break;
1363
1364 case Intrinsic::ctlz:
1365 // FIXME: If necessary, this should go in target-specific overrides.
1366 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1367 return TargetTransformInfo::TCC_Basic;
1368 break;
1369
1370 case Intrinsic::memcpy:
1371 return thisT()->getMemcpyCost(ICA.getInst());
1372
1373 case Intrinsic::masked_scatter: {
1374 const Value *Mask = Args[3];
1375 bool VarMask = !isa<Constant>(Mask);
1376 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1377 return thisT()->getGatherScatterOpCost(Instruction::Store,
1378 ICA.getArgTypes()[0], Args[1],
1379 VarMask, Alignment, CostKind, I);
1380 }
1381 case Intrinsic::masked_gather: {
1382 const Value *Mask = Args[2];
1383 bool VarMask = !isa<Constant>(Mask);
1384 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1385 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1386 VarMask, Alignment, CostKind, I);
1387 }
1388 case Intrinsic::experimental_stepvector: {
1389 if (isa<ScalableVectorType>(RetTy))
1390 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1391 // The cost of materialising a constant integer vector.
1392 return TargetTransformInfo::TCC_Basic;
1393 }
1394 case Intrinsic::experimental_vector_extract: {
1395 // FIXME: Handle case where a scalable vector is extracted from a scalable
1396 // vector
1397 if (isa<ScalableVectorType>(RetTy))
1398 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1399 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1400 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1401 cast<VectorType>(Args[0]->getType()), None,
1402 Index, cast<VectorType>(RetTy));
1403 }
1404 case Intrinsic::experimental_vector_insert: {
1405 // FIXME: Handle case where a scalable vector is inserted into a scalable
1406 // vector
1407 if (isa<ScalableVectorType>(Args[1]->getType()))
1408 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1409 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1410 return thisT()->getShuffleCost(
1411 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1412 Index, cast<VectorType>(Args[1]->getType()));
1413 }
1414 case Intrinsic::experimental_vector_reverse: {
1415 return thisT()->getShuffleCost(TTI::SK_Reverse,
1416 cast<VectorType>(Args[0]->getType()), None,
1417 0, cast<VectorType>(RetTy));
1418 }
1419 case Intrinsic::experimental_vector_splice: {
1420 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1421 return thisT()->getShuffleCost(TTI::SK_Splice,
1422 cast<VectorType>(Args[0]->getType()), None,
1423 Index, cast<VectorType>(RetTy));
1424 }
1425 case Intrinsic::vector_reduce_add:
1426 case Intrinsic::vector_reduce_mul:
1427 case Intrinsic::vector_reduce_and:
1428 case Intrinsic::vector_reduce_or:
1429 case Intrinsic::vector_reduce_xor:
1430 case Intrinsic::vector_reduce_smax:
1431 case Intrinsic::vector_reduce_smin:
1432 case Intrinsic::vector_reduce_fmax:
1433 case Intrinsic::vector_reduce_fmin:
1434 case Intrinsic::vector_reduce_umax:
1435 case Intrinsic::vector_reduce_umin: {
1436 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1437 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1438 }
1439 case Intrinsic::vector_reduce_fadd:
1440 case Intrinsic::vector_reduce_fmul: {
1441 IntrinsicCostAttributes Attrs(
1442 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1443 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1444 }
1445 case Intrinsic::fshl:
1446 case Intrinsic::fshr: {
1447 if (isa<ScalableVectorType>(RetTy))
22: Assuming 'RetTy' is not a 'ScalableVectorType'
23: Taking false branch
1448 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1449 const Value *X = Args[0];
1450 const Value *Y = Args[1];
1451 const Value *Z = Args[2];
1452 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1453 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1454 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1455 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1456 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1457 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
24: '?' condition is false
1458 : TTI::OP_None;
1459 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1460 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1461 InstructionCost Cost = 0;
1462 Cost +=
1463 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1464 Cost +=
1465 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1466 Cost += thisT()->getArithmeticInstrCost(
1467 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1468 Cost += thisT()->getArithmeticInstrCost(
1469 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1470      // Non-constant shift amounts require a modulo.
1471 if (OpKindZ != TTI::OK_UniformConstantValue &&
25: Assuming 'OpKindZ' is equal to OK_UniformConstantValue
1472 OpKindZ != TTI::OK_NonUniformConstantValue)
1473 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1474 CostKind, OpKindZ, OpKindBW,
1475 OpPropsZ, OpPropsBW);
1476 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1477 if (X != Y) {
26: Assuming 'X' is not equal to 'Y'
27: Taking true branch
1478 Type *CondTy = RetTy->getWithNewBitWidth(1);
28: Calling 'Type::getWithNewBitWidth'
36: Returning from 'Type::getWithNewBitWidth'
37: 'CondTy' initialized here
1479 Cost +=
1480 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
38: Passing 'CondTy' via 3rd parameter 'CondTy'
39: Calling 'ARMTTIImpl::getCmpSelInstrCost'
1481 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1482 Cost +=
1483 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1484 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1485 }
1486 return Cost;
1487 }
1488 }
1489
1490 // Assume that we need to scalarize this intrinsic.
1491 // Compute the scalarization overhead based on Args for a vector
1492 // intrinsic.
1493 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1494 if (RetVF.isVector() && !RetVF.isScalable()) {
1495 ScalarizationCost = 0;
1496 if (!RetTy->isVoidTy())
1497 ScalarizationCost +=
1498 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1499 ScalarizationCost +=
1500 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1501 }
1502
1503 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1504 ScalarizationCost);
1505 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1506 }
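The fshl/fshr case above (lines 1459-1486) sums the costs of the or, sub, shl and lshr pieces, adds a urem for variable shift amounts, and adds a cmp/select pair when the two inputs differ (i.e. the funnel shift is not a rotate). A compact sketch of that sum, with every per-operation cost passed in as a stand-in for the corresponding cost query.

    // Sketch of the funnel-shift pricing at lines 1459-1486.
    unsigned funnelShiftCostSketch(unsigned OrCost, unsigned SubCost,
                                   unsigned ShlCost, unsigned LShrCost,
                                   bool ConstantShiftAmount, unsigned URemCost,
                                   bool IsRotate, unsigned CmpCost,
                                   unsigned SelCost) {
      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      unsigned Cost = OrCost + SubCost + ShlCost + LShrCost;
      if (!ConstantShiftAmount)
        Cost += URemCost;              // Z % BW for non-constant shift amounts
      if (!IsRotate)
        Cost += CmpCost + SelCost;     // shift-by-zero handling when X != Y
      return Cost;
    }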
1507
1508 /// Get intrinsic cost based on argument types.
1509 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1510 /// cost of scalarizing the arguments and the return value will be computed
1511 /// based on types.
1512 InstructionCost
1513 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1514 TTI::TargetCostKind CostKind) {
1515 Intrinsic::ID IID = ICA.getID();
1516 Type *RetTy = ICA.getReturnType();
1517 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1518 FastMathFlags FMF = ICA.getFlags();
1519 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1520 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1521
1522 VectorType *VecOpTy = nullptr;
1523 if (!Tys.empty()) {
1524 // The vector reduction operand is operand 0 except for fadd/fmul.
1525 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1526 unsigned VecTyIndex = 0;
1527 if (IID == Intrinsic::vector_reduce_fadd ||
1528 IID == Intrinsic::vector_reduce_fmul)
1529 VecTyIndex = 1;
1530 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")(static_cast<void> (0));
1531 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1532 }
1533
1534 // Library call cost - other than size, make it expensive.
1535 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1536 SmallVector<unsigned, 2> ISDs;
1537 switch (IID) {
1538 default: {
1539 // Scalable vectors cannot be scalarized, so return Invalid.
1540 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1541 return isa<ScalableVectorType>(Ty);
1542 }))
1543 return InstructionCost::getInvalid();
1544
1545 // Assume that we need to scalarize this intrinsic.
1546 InstructionCost ScalarizationCost =
1547 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1548 unsigned ScalarCalls = 1;
1549 Type *ScalarRetTy = RetTy;
1550 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1551 if (!SkipScalarizationCost)
1552 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1553 ScalarCalls = std::max(ScalarCalls,
1554 cast<FixedVectorType>(RetVTy)->getNumElements());
1555 ScalarRetTy = RetTy->getScalarType();
1556 }
1557 SmallVector<Type *, 4> ScalarTys;
1558 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1559 Type *Ty = Tys[i];
1560 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1561 if (!SkipScalarizationCost)
1562 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1563 ScalarCalls = std::max(ScalarCalls,
1564 cast<FixedVectorType>(VTy)->getNumElements());
1565 Ty = Ty->getScalarType();
1566 }
1567 ScalarTys.push_back(Ty);
1568 }
1569 if (ScalarCalls == 1)
1570 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1571
1572 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1573 InstructionCost ScalarCost =
1574 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1575
1576 return ScalarCalls * ScalarCost + ScalarizationCost;
1577 }
1578 // Look for intrinsics that can be lowered directly or turned into a scalar
1579 // intrinsic call.
1580 case Intrinsic::sqrt:
1581 ISDs.push_back(ISD::FSQRT);
1582 break;
1583 case Intrinsic::sin:
1584 ISDs.push_back(ISD::FSIN);
1585 break;
1586 case Intrinsic::cos:
1587 ISDs.push_back(ISD::FCOS);
1588 break;
1589 case Intrinsic::exp:
1590 ISDs.push_back(ISD::FEXP);
1591 break;
1592 case Intrinsic::exp2:
1593 ISDs.push_back(ISD::FEXP2);
1594 break;
1595 case Intrinsic::log:
1596 ISDs.push_back(ISD::FLOG);
1597 break;
1598 case Intrinsic::log10:
1599 ISDs.push_back(ISD::FLOG10);
1600 break;
1601 case Intrinsic::log2:
1602 ISDs.push_back(ISD::FLOG2);
1603 break;
1604 case Intrinsic::fabs:
1605 ISDs.push_back(ISD::FABS);
1606 break;
1607 case Intrinsic::canonicalize:
1608 ISDs.push_back(ISD::FCANONICALIZE);
1609 break;
1610 case Intrinsic::minnum:
1611 ISDs.push_back(ISD::FMINNUM);
1612 break;
1613 case Intrinsic::maxnum:
1614 ISDs.push_back(ISD::FMAXNUM);
1615 break;
1616 case Intrinsic::minimum:
1617 ISDs.push_back(ISD::FMINIMUM);
1618 break;
1619 case Intrinsic::maximum:
1620 ISDs.push_back(ISD::FMAXIMUM);
1621 break;
1622 case Intrinsic::copysign:
1623 ISDs.push_back(ISD::FCOPYSIGN);
1624 break;
1625 case Intrinsic::floor:
1626 ISDs.push_back(ISD::FFLOOR);
1627 break;
1628 case Intrinsic::ceil:
1629 ISDs.push_back(ISD::FCEIL);
1630 break;
1631 case Intrinsic::trunc:
1632 ISDs.push_back(ISD::FTRUNC);
1633 break;
1634 case Intrinsic::nearbyint:
1635 ISDs.push_back(ISD::FNEARBYINT);
1636 break;
1637 case Intrinsic::rint:
1638 ISDs.push_back(ISD::FRINT);
1639 break;
1640 case Intrinsic::round:
1641 ISDs.push_back(ISD::FROUND);
1642 break;
1643 case Intrinsic::roundeven:
1644 ISDs.push_back(ISD::FROUNDEVEN);
1645 break;
1646 case Intrinsic::pow:
1647 ISDs.push_back(ISD::FPOW);
1648 break;
1649 case Intrinsic::fma:
1650 ISDs.push_back(ISD::FMA);
1651 break;
1652 case Intrinsic::fmuladd:
1653 ISDs.push_back(ISD::FMA);
1654 break;
1655 case Intrinsic::experimental_constrained_fmuladd:
1656 ISDs.push_back(ISD::STRICT_FMA);
1657 break;
1658 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1659 case Intrinsic::lifetime_start:
1660 case Intrinsic::lifetime_end:
1661 case Intrinsic::sideeffect:
1662 case Intrinsic::pseudoprobe:
1663 case Intrinsic::arithmetic_fence:
1664 return 0;
1665 case Intrinsic::masked_store: {
1666 Type *Ty = Tys[0];
1667 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1668 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1669 CostKind);
1670 }
1671 case Intrinsic::masked_load: {
1672 Type *Ty = RetTy;
1673 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1674 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1675 CostKind);
1676 }
1677 case Intrinsic::vector_reduce_add:
1678 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1679 None, CostKind);
1680 case Intrinsic::vector_reduce_mul:
1681 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1682 None, CostKind);
1683 case Intrinsic::vector_reduce_and:
1684 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1685 None, CostKind);
1686 case Intrinsic::vector_reduce_or:
1687 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1688 CostKind);
1689 case Intrinsic::vector_reduce_xor:
1690 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1691 None, CostKind);
1692 case Intrinsic::vector_reduce_fadd:
1693 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1694 FMF, CostKind);
1695 case Intrinsic::vector_reduce_fmul:
1696 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1697 FMF, CostKind);
1698 case Intrinsic::vector_reduce_smax:
1699 case Intrinsic::vector_reduce_smin:
1700 case Intrinsic::vector_reduce_fmax:
1701 case Intrinsic::vector_reduce_fmin:
1702 return thisT()->getMinMaxReductionCost(
1703 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1704 /*IsUnsigned=*/false, CostKind);
1705 case Intrinsic::vector_reduce_umax:
1706 case Intrinsic::vector_reduce_umin:
1707 return thisT()->getMinMaxReductionCost(
1708 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1709 /*IsUnsigned=*/true, CostKind);
1710 case Intrinsic::abs:
1711 case Intrinsic::smax:
1712 case Intrinsic::smin:
1713 case Intrinsic::umax:
1714 case Intrinsic::umin: {
1715 // abs(X) = select(icmp(X,0),X,sub(0,X))
1716 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1717 Type *CondTy = RetTy->getWithNewBitWidth(1);
1718 InstructionCost Cost = 0;
1719 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1720 Cost +=
1721 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1722 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1723 Cost +=
1724 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1725 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1726 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1727 if (IID == Intrinsic::abs)
1728 Cost += thisT()->getArithmeticInstrCost(
1729 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1730 return Cost;
1731 }
1732 case Intrinsic::sadd_sat:
1733 case Intrinsic::ssub_sat: {
1734 Type *CondTy = RetTy->getWithNewBitWidth(1);
1735
1736 Type *OpTy = StructType::create({RetTy, CondTy});
1737 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1738 ? Intrinsic::sadd_with_overflow
1739 : Intrinsic::ssub_with_overflow;
1740
1741 // SatMax -> Overflow && SumDiff < 0
1742 // SatMin -> Overflow && SumDiff >= 0
1743 InstructionCost Cost = 0;
1744 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1745 nullptr, ScalarizationCostPassed);
1746 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1747 Cost +=
1748 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1749 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1750 Cost += 2 * thisT()->getCmpSelInstrCost(
1751 BinaryOperator::Select, RetTy, CondTy,
1752 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1753 return Cost;
1754 }
1755 case Intrinsic::uadd_sat:
1756 case Intrinsic::usub_sat: {
1757 Type *CondTy = RetTy->getWithNewBitWidth(1);
1758
1759 Type *OpTy = StructType::create({RetTy, CondTy});
1760 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1761 ? Intrinsic::uadd_with_overflow
1762 : Intrinsic::usub_with_overflow;
1763
1764 InstructionCost Cost = 0;
1765 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1766 nullptr, ScalarizationCostPassed);
1767 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1768 Cost +=
1769 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1770 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1771 return Cost;
1772 }
1773 case Intrinsic::smul_fix:
1774 case Intrinsic::umul_fix: {
1775 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1776 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1777
1778 unsigned ExtOp =
1779 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1780 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1781
1782 InstructionCost Cost = 0;
1783 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1784 Cost +=
1785 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1786 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1787 CCH, CostKind);
1788 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1789 CostKind, TTI::OK_AnyValue,
1790 TTI::OK_UniformConstantValue);
1791 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1792 TTI::OK_AnyValue,
1793 TTI::OK_UniformConstantValue);
1794 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1795 return Cost;
1796 }
1797 case Intrinsic::sadd_with_overflow:
1798 case Intrinsic::ssub_with_overflow: {
1799 Type *SumTy = RetTy->getContainedType(0);
1800 Type *OverflowTy = RetTy->getContainedType(1);
1801 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1802 ? BinaryOperator::Add
1803 : BinaryOperator::Sub;
1804
1805 // LHSSign -> LHS >= 0
1806 // RHSSign -> RHS >= 0
1807 // SumSign -> Sum >= 0
1808 //
1809 // Add:
1810 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1811 // Sub:
1812 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1813 InstructionCost Cost = 0;
1814 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1815 Cost += 3 * thisT()->getCmpSelInstrCost(
1816 Instruction::ICmp, SumTy, OverflowTy,
1817 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1818 Cost += 2 * thisT()->getCmpSelInstrCost(
1819 Instruction::Select, OverflowTy, OverflowTy,
1820 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1821 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1822 CostKind);
1823 return Cost;
1824 }
1825 case Intrinsic::uadd_with_overflow:
1826 case Intrinsic::usub_with_overflow: {
1827 Type *SumTy = RetTy->getContainedType(0);
1828 Type *OverflowTy = RetTy->getContainedType(1);
1829 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1830 ? BinaryOperator::Add
1831 : BinaryOperator::Sub;
1832
1833 InstructionCost Cost = 0;
1834 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1835 Cost +=
1836 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1837 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1838 return Cost;
1839 }
1840 case Intrinsic::smul_with_overflow:
1841 case Intrinsic::umul_with_overflow: {
1842 Type *MulTy = RetTy->getContainedType(0);
1843 Type *OverflowTy = RetTy->getContainedType(1);
1844 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1845 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1846
1847 unsigned ExtOp =
1848 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1849 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1850
1851 InstructionCost Cost = 0;
1852 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1853 Cost +=
1854 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1855 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1856 CCH, CostKind);
1857 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1858 CostKind, TTI::OK_AnyValue,
1859 TTI::OK_UniformConstantValue);
1860
1861 if (IID == Intrinsic::smul_with_overflow)
1862 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1863 CostKind, TTI::OK_AnyValue,
1864 TTI::OK_UniformConstantValue);
1865
1866 Cost +=
1867 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1868 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1869 return Cost;
1870 }
1871 case Intrinsic::ctpop:
1872 ISDs.push_back(ISD::CTPOP);
1873 // In case of legalization use TCC_Expensive. This is cheaper than a
1874 // library call but still not a cheap instruction.
1875 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1876 break;
1877 case Intrinsic::ctlz:
1878 ISDs.push_back(ISD::CTLZ);
1879 break;
1880 case Intrinsic::cttz:
1881 ISDs.push_back(ISD::CTTZ);
1882 break;
1883 case Intrinsic::bswap:
1884 ISDs.push_back(ISD::BSWAP);
1885 break;
1886 case Intrinsic::bitreverse:
1887 ISDs.push_back(ISD::BITREVERSE);
1888 break;
1889 }
1890
1891 const TargetLoweringBase *TLI = getTLI();
1892 std::pair<InstructionCost, MVT> LT =
1893 TLI->getTypeLegalizationCost(DL, RetTy);
1894
1895 SmallVector<InstructionCost, 2> LegalCost;
1896 SmallVector<InstructionCost, 2> CustomCost;
1897 for (unsigned ISD : ISDs) {
1898 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1899 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1900 TLI->isFAbsFree(LT.second)) {
1901 return 0;
1902 }
1903
1904 // The operation is legal. Assume it costs 1.
1905 // If the type is split to multiple registers, assume that there is some
1906 // overhead to this.
1907 // TODO: Once we have extract/insert subvector cost we need to use them.
1908 if (LT.first > 1)
1909 LegalCost.push_back(LT.first * 2);
1910 else
1911 LegalCost.push_back(LT.first * 1);
1912 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1913 // If the operation is custom lowered then assume
1914 // that the code is twice as expensive.
1915 CustomCost.push_back(LT.first * 2);
1916 }
1917 }
1918
1919 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1920 if (MinLegalCostI != LegalCost.end())
1921 return *MinLegalCostI;
1922
1923 auto MinCustomCostI =
1924 std::min_element(CustomCost.begin(), CustomCost.end());
1925 if (MinCustomCostI != CustomCost.end())
1926 return *MinCustomCostI;
1927
1928 // If we can't lower fmuladd into an FMA estimate the cost as a floating
1929 // point mul followed by an add.
1930 if (IID == Intrinsic::fmuladd)
1931 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1932 CostKind) +
1933 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1934 CostKind);
1935 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1936 IntrinsicCostAttributes FMulAttrs(
1937 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1938 IntrinsicCostAttributes FAddAttrs(
1939 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1940 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1941 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1942 }
1943
1944 // Else, assume that we need to scalarize this intrinsic. For math builtins
1945 // this will emit a costly libcall, adding call overhead and spills. Make it
1946 // very expensive.
1947 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1948 // Scalable vectors cannot be scalarized, so return Invalid.
1949 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1950 return isa<ScalableVectorType>(Ty);
1951 }))
1952 return InstructionCost::getInvalid();
1953
1954 InstructionCost ScalarizationCost =
1955 SkipScalarizationCost ? ScalarizationCostPassed
1956 : getScalarizationOverhead(RetVTy, true, false);
1957
1958 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1959 SmallVector<Type *, 4> ScalarTys;
1960 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1961 Type *Ty = Tys[i];
1962 if (Ty->isVectorTy())
1963 Ty = Ty->getScalarType();
1964 ScalarTys.push_back(Ty);
1965 }
1966 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1967 InstructionCost ScalarCost =
1968 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1969 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1970 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1971 if (!ICA.skipScalarizationCost())
1972 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1973 ScalarCalls = std::max(ScalarCalls,
1974 cast<FixedVectorType>(VTy)->getNumElements());
1975 }
1976 }
1977 return ScalarCalls * ScalarCost + ScalarizationCost;
1978 }
1979
1980 // This is going to be turned into a library call, make it expensive.
1981 return SingleCallCost;
1982 }
1983
1984 /// Compute a cost of the given call instruction.
1985 ///
1986 /// Compute the cost of calling function F with return type RetTy and
1987 /// argument types Tys. F might be nullptr, in this case the cost of an
1988 /// arbitrary call with the specified signature will be returned.
1989 /// This is used, for instance, when we estimate call of a vector
1990 /// counterpart of the given function.
1991 /// \param F Called function, might be nullptr.
1992 /// \param RetTy Return value types.
1993 /// \param Tys Argument types.
1994 /// \returns The cost of Call instruction.
1995 InstructionCost
1996 getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1997 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1998 return 10;
1999 }
2000
2001 unsigned getNumberOfParts(Type *Tp) {
2002 std::pair<InstructionCost, MVT> LT =
2003 getTLI()->getTypeLegalizationCost(DL, Tp);
2004 return *LT.first.getValue();
2005 }
2006
2007 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2008 const SCEV *) {
2009 return 0;
2010 }
2011
2012 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2013 /// We're assuming that reduction operations are performed in the following way:
2014 ///
2015 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2016 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2017 /// \----------------v-------------/ \----------v------------/
2018 /// n/2 elements n/2 elements
2019 /// %red1 = op <n x t> %val, <n x t> val1
2020 /// After this operation we have a vector %red1 where only the first n/2
2021 /// elements are meaningful, the second n/2 elements are undefined and can be
2022 /// dropped. All other operations are actually working with the vector of
2023 /// length n/2, not n, though the real vector length is still n.
2024 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2025 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2026 /// \----------------v-------------/ \----------v------------/
2027 /// n/4 elements 3*n/4 elements
2028 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2029 /// length n/2, the resulting vector has length n/4 etc.
2030 ///
2031 /// The cost model should take into account that the actual length of the
2032 /// vector is reduced on each iteration.
2033 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2034 TTI::TargetCostKind CostKind) {
2035 Type *ScalarTy = Ty->getElementType();
2036 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2037 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2038 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2039 NumVecElts >= 2) {
2040 // Or reduction for i1 is represented as:
2041 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2042 // %res = cmp ne iReduxWidth %val, 0
2043 // And reduction for i1 is represented as:
2044 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2045 // %res = cmp eq iReduxWidth %val, 11111
2046 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2047 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2048 TTI::CastContextHint::None, CostKind) +
2049 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2050 CmpInst::makeCmpResultType(ValTy),
2051 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2052 }
2053 unsigned NumReduxLevels = Log2_32(NumVecElts);
2054 InstructionCost ArithCost = 0;
2055 InstructionCost ShuffleCost = 0;
2056 std::pair<InstructionCost, MVT> LT =
2057 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2058 unsigned LongVectorCount = 0;
2059 unsigned MVTLen =
2060 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2061 while (NumVecElts > MVTLen) {
2062 NumVecElts /= 2;
2063 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2064 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2065 NumVecElts, SubTy);
2066 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2067 Ty = SubTy;
2068 ++LongVectorCount;
2069 }
2070
2071 NumReduxLevels -= LongVectorCount;
2072
2073 // The minimal length of the vector is limited by the real length of vector
2074 // operations performed on the current platform. That's why several final
2075 // reduction operations are performed on the vectors with the same
2076 // architecture-dependent length.
2077
2078 // By default reductions need one shuffle per reduction level.
2079 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2080 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2081 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2082 return ShuffleCost + ArithCost +
2083 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2084 }
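
The halving loop above can be traced with a standalone sketch (plain C++, not LLVM code) that reproduces the per-level shuffle and arithmetic accounting for a <16 x i32> add reduction legalized to 4-element registers; the unit costs are assumptions chosen only to make the arithmetic visible.

#include <cstdio>

int main() {
  // <16 x i32> add reduction, legalized to 4-element registers (assumed).
  unsigned NumVecElts = 16;
  unsigned MVTLen = 4;
  unsigned ShuffleUnit = 1, ArithUnit = 1, ExtractUnit = 1; // assumed costs

  unsigned NumReduxLevels = 4; // Log2_32(16)
  unsigned ShuffleCost = 0, ArithCost = 0, LongVectorCount = 0;

  // While the vector is wider than one register, each halving step pays for
  // an extract-subvector shuffle plus an arithmetic op on the narrower type.
  while (NumVecElts > MVTLen) {
    NumVecElts /= 2;
    ShuffleCost += ShuffleUnit;
    ArithCost += ArithUnit;
    ++LongVectorCount;
  }

  // The remaining levels fit in one register: one permute + one op per level.
  NumReduxLevels -= LongVectorCount;
  ShuffleCost += NumReduxLevels * ShuffleUnit;
  ArithCost += NumReduxLevels * ArithUnit;

  // Plus a final extractelement of lane 0.
  unsigned Total = ShuffleCost + ArithCost + ExtractUnit;
  std::printf("tree reduction cost: %u\n", Total); // 4 + 4 + 1 = 9
  return 0;
}
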
2085
2086 /// Try to calculate the cost of performing strict (in-order) reductions,
2087 /// which involves doing a sequence of floating point additions in lane
2088 /// order, starting with an initial value. For example, consider a scalar
2089 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2090 ///
2091 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2092 ///
2093 /// %add1 = %InitVal + %v0
2094 /// %add2 = %add1 + %v1
2095 /// %add3 = %add2 + %v2
2096 /// %add4 = %add3 + %v3
2097 ///
2098 /// As a simple estimate we can say the cost of such a reduction is 4 times
2099 /// the cost of a scalar FP addition. We can only estimate the costs for
2100 /// fixed-width vectors here because for scalable vectors we do not know the
2101 /// runtime number of operations.
2102 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2103 TTI::TargetCostKind CostKind) {
2104 // Targets must implement a default value for the scalable case, since
2105 // we don't know how many lanes the vector has.
2106 if (isa<ScalableVectorType>(Ty))
2107 return InstructionCost::getInvalid();
2108
2109 auto *VTy = cast<FixedVectorType>(Ty);
2110 InstructionCost ExtractCost =
2111 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2112 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2113 Opcode, VTy->getElementType(), CostKind);
2114 ArithCost *= VTy->getNumElements();
2115
2116 return ExtractCost + ArithCost;
2117 }
2118
2119 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2120 Optional<FastMathFlags> FMF,
2121 TTI::TargetCostKind CostKind) {
2122 if (TTI::requiresOrderedReduction(FMF))
2123 return getOrderedReductionCost(Opcode, Ty, CostKind);
2124 return getTreeReductionCost(Opcode, Ty, CostKind);
2125 }
2126
2127 /// Try to calculate op costs for min/max reduction operations.
2128 /// \param CondTy Conditional type for the Select instruction.
2129 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2130 bool IsUnsigned,
2131 TTI::TargetCostKind CostKind) {
2132 Type *ScalarTy = Ty->getElementType();
2133 Type *ScalarCondTy = CondTy->getElementType();
2134 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2135 unsigned NumReduxLevels = Log2_32(NumVecElts);
2136 unsigned CmpOpcode;
2137 if (Ty->isFPOrFPVectorTy()) {
2138 CmpOpcode = Instruction::FCmp;
2139 } else {
2140 assert(Ty->isIntOrIntVectorTy() &&
2141 "expecting floating point or integer type for min/max reduction");
2142 CmpOpcode = Instruction::ICmp;
2143 }
2144 InstructionCost MinMaxCost = 0;
2145 InstructionCost ShuffleCost = 0;
2146 std::pair<InstructionCost, MVT> LT =
2147 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2148 unsigned LongVectorCount = 0;
2149 unsigned MVTLen =
2150 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2151 while (NumVecElts > MVTLen) {
2152 NumVecElts /= 2;
2153 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2154 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2155
2156 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2157 NumVecElts, SubTy);
2158 MinMaxCost +=
2159 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2160 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2161 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2162 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2163 Ty = SubTy;
2164 ++LongVectorCount;
2165 }
2166
2167 NumReduxLevels -= LongVectorCount;
2168
2169 // The minimal length of the vector is limited by the real length of vector
2170 // operations performed on the current platform. That's why several final
2171 // reduction operations are performed on the vectors with the same
2172 // architecture-dependent length.
2173 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2174 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2175 MinMaxCost +=
2176 NumReduxLevels *
2177 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2178 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2179 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2180 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2181 // The last min/max should be in vector registers and we counted it above.
2182 // So just need a single extractelement.
2183 return ShuffleCost + MinMaxCost +
2184 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2185 }
2186
2187 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2188 Type *ResTy, VectorType *Ty,
2189 TTI::TargetCostKind CostKind) {
2190 // Without any native support, this is equivalent to the cost of
2191 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2192 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2193 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2194 Instruction::Add, ExtTy, None, CostKind);
2195 InstructionCost MulCost = 0;
2196 InstructionCost ExtCost = thisT()->getCastInstrCost(
2197 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2198 TTI::CastContextHint::None, CostKind);
2199 if (IsMLA) {
2200 MulCost =
2201 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2202 ExtCost *= 2;
2203 }
2204
2205 return RedCost + MulCost + ExtCost;
2206 }
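
As a small worked example of the decomposition above, the following standalone sketch (plain C++, not LLVM code) evaluates RedCost + MulCost + ExtCost with the MLA doubling of the extension cost; all numbers are assumptions for illustration only.

#include <cstdio>

int main() {
  // vecreduce.add(mul(ext(a), ext(b))) decomposition, MLA form.
  unsigned RedCost = 4;  // assumed vecreduce.add cost on the widened type
  unsigned ExtCost = 1;  // assumed cost of one zext/sext of an input vector
  unsigned MulCost = 1;  // assumed widened multiply cost
  bool IsMLA = true;

  // Mirrors RedCost + MulCost + ExtCost, with ExtCost doubled for MLA
  // because both multiplicands are extended.
  unsigned Total = RedCost + (IsMLA ? MulCost + 2 * ExtCost : ExtCost);
  std::printf("extended add reduction cost: %u\n", Total); // 4 + 1 + 2 = 7
  return 0;
}
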
2207
2208 InstructionCost getVectorSplitCost() { return 1; }
2209
2210 /// @}
2211};
2212
2213/// Concrete BasicTTIImpl that can be used if no further customization
2214/// is needed.
2215class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2216 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2217
2218 friend class BasicTTIImplBase<BasicTTIImpl>;
2219
2220 const TargetSubtargetInfo *ST;
2221 const TargetLoweringBase *TLI;
2222
2223 const TargetSubtargetInfo *getST() const { return ST; }
2224 const TargetLoweringBase *getTLI() const { return TLI; }
2225
2226public:
2227 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2228};
2229
2230} // end namespace llvm
2231
2232#endif // LLVM_CODEGEN_BASICTTIIMPL_H

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/InstructionCost.h

1//===- InstructionCost.h ----------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file defines an InstructionCost class that is used when calculating
10/// the cost of an instruction, or a group of instructions. In addition to a
11/// numeric value representing the cost the class also contains a state that
12/// can be used to encode particular properties, such as a cost being invalid.
13/// Operations on InstructionCost implement saturation arithmetic, so that
14/// accumulating costs on large cost-values don't overflow.
15///
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_SUPPORT_INSTRUCTIONCOST_H
19#define LLVM_SUPPORT_INSTRUCTIONCOST_H
20
21#include "llvm/ADT/Optional.h"
22#include "llvm/Support/MathExtras.h"
23#include <limits>
24
25namespace llvm {
26
27class raw_ostream;
28
29class InstructionCost {
30public:
31 using CostType = int64_t;
32
33 /// CostState describes the state of a cost.
34 enum CostState {
35 Valid, /// < The cost value represents a valid cost, even when the
36 /// cost-value is large.
37 Invalid /// < Invalid indicates there is no way to represent the cost as a
38 /// numeric value. This state exists to represent a possible issue,
39 /// e.g. if the cost-model knows the operation cannot be expanded
40 /// into a valid code-sequence by the code-generator. While some
41 /// passes may assert that the calculated cost must be valid, it is
42 /// up to individual passes how to interpret an Invalid cost. For
43 /// example, a transformation pass could choose not to perform a
44 /// transformation if the resulting cost would end up Invalid.
45 /// Because some passes may assert a cost is Valid, it is not
46 /// recommended to use Invalid costs to model 'Unknown'.
47 /// Note that Invalid is semantically different from a (very) high,
48 /// but valid cost, which intentionally indicates no issue, but
49 /// rather a strong preference not to select a certain operation.
50 };
51
52private:
53 CostType Value = 0;
54 CostState State = Valid;
55
56 void propagateState(const InstructionCost &RHS) {
57 if (RHS.State == Invalid)
58 State = Invalid;
59 }
60
61 static CostType getMaxValue() { return std::numeric_limits<CostType>::max(); }
62 static CostType getMinValue() { return std::numeric_limits<CostType>::min(); }
63
64public:
65 // A default constructed InstructionCost is a valid zero cost
66 InstructionCost() = default;
67
68 InstructionCost(CostState) = delete;
69 InstructionCost(CostType Val) : Value(Val), State(Valid) {}
70
71 static InstructionCost getMax() { return getMaxValue(); }
72 static InstructionCost getMin() { return getMinValue(); }
73 static InstructionCost getInvalid(CostType Val = 0) {
74 InstructionCost Tmp(Val);
75 Tmp.setInvalid();
76 return Tmp;
77 }
78
79 bool isValid() const { return State == Valid; }
80 void setValid() { State = Valid; }
81 void setInvalid() { State = Invalid; }
82 CostState getState() const { return State; }
83
84 /// This function is intended to be used as sparingly as possible, since the
85 /// class provides the full range of operator support required for arithmetic
86 /// and comparisons.
87 Optional<CostType> getValue() const {
88 if (isValid())
89 return Value;
90 return None;
91 }
92
93 /// For all of the arithmetic operators provided here any invalid state is
94 /// perpetuated and cannot be removed. Once a cost becomes invalid it stays
95 /// invalid, and it also inherits any invalid state from the RHS.
96 /// Arithmetic work on the actual values is implemented with saturation,
97 /// to avoid overflow when using more extreme cost values.
98
99 InstructionCost &operator+=(const InstructionCost &RHS) {
100 propagateState(RHS);
101
102 // Saturating addition.
103 InstructionCost::CostType Result;
104 if (AddOverflow(Value, RHS.Value, Result))
105 Result = RHS.Value > 0 ? getMaxValue() : getMinValue();
106
107 Value = Result;
108 return *this;
109 }
110
111 InstructionCost &operator+=(const CostType RHS) {
112 InstructionCost RHS2(RHS);
113 *this += RHS2;
114 return *this;
115 }
116
117 InstructionCost &operator-=(const InstructionCost &RHS) {
118 propagateState(RHS);
119
120 // Saturating subtract.
121 InstructionCost::CostType Result;
122 if (SubOverflow(Value, RHS.Value, Result))
123 Result = RHS.Value > 0 ? getMinValue() : getMaxValue();
124 Value = Result;
125 return *this;
126 }
127
128 InstructionCost &operator-=(const CostType RHS) {
129 InstructionCost RHS2(RHS);
130 *this -= RHS2;
131 return *this;
132 }
133
134 InstructionCost &operator*=(const InstructionCost &RHS) {
135 propagateState(RHS);
136
137 // Saturating multiply.
138 InstructionCost::CostType Result;
139 if (MulOverflow(Value, RHS.Value, Result)) {
140 if ((Value > 0 && RHS.Value > 0) || (Value < 0 && RHS.Value < 0))
141 Result = getMaxValue();
142 else
143 Result = getMinValue();
144 }
145
146 Value = Result;
147 return *this;
148 }
149
150 InstructionCost &operator*=(const CostType RHS) {
151 InstructionCost RHS2(RHS);
152 *this *= RHS2;
153 return *this;
154 }
155
156 InstructionCost &operator/=(const InstructionCost &RHS) {
157 propagateState(RHS);
158 Value /= RHS.Value;
159 return *this;
160 }
161
162 InstructionCost &operator/=(const CostType RHS) {
163 InstructionCost RHS2(RHS);
164 *this /= RHS2;
165 return *this;
166 }
167
168 InstructionCost &operator++() {
169 *this += 1;
170 return *this;
171 }
172
173 InstructionCost operator++(int) {
174 InstructionCost Copy = *this;
175 ++*this;
176 return Copy;
177 }
178
179 InstructionCost &operator--() {
180 *this -= 1;
181 return *this;
182 }
183
184 InstructionCost operator--(int) {
185 InstructionCost Copy = *this;
186 --*this;
187 return Copy;
188 }
189
190 /// For the comparison operators we have chosen to use lexicographical
191 /// ordering where valid costs are always considered to be less than invalid
192 /// costs. This avoids having to add asserts to the comparison operators that
193 /// the states are valid and users can test for validity of the cost
194 /// explicitly.
195 bool operator<(const InstructionCost &RHS) const {
196 if (State != RHS.State)
197 return State < RHS.State;
198 return Value < RHS.Value;
199 }
200
201 // Implement in terms of operator< to ensure that the two comparisons stay in
202 // sync
203 bool operator==(const InstructionCost &RHS) const {
204 return !(*this < RHS) && !(RHS < *this);
5
Returning zero, which participates in a condition later
205 }
206
207 bool operator!=(const InstructionCost &RHS) const { return !(*this == RHS); }
208
209 bool operator==(const CostType RHS) const {
210 InstructionCost RHS2(RHS);
211 return *this == RHS2;
4
Calling 'InstructionCost::operator=='
6
Returning from 'InstructionCost::operator=='
7
Returning zero, which participates in a condition later
212 }
213
214 bool operator!=(const CostType RHS) const { return !(*this == RHS); }
215
216 bool operator>(const InstructionCost &RHS) const { return RHS < *this; }
217
218 bool operator<=(const InstructionCost &RHS) const { return !(RHS < *this); }
219
220 bool operator>=(const InstructionCost &RHS) const { return !(*this < RHS); }
221
222 bool operator<(const CostType RHS) const {
223 InstructionCost RHS2(RHS);
224 return *this < RHS2;
225 }
226
227 bool operator>(const CostType RHS) const {
228 InstructionCost RHS2(RHS);
229 return *this > RHS2;
230 }
231
232 bool operator<=(const CostType RHS) const {
233 InstructionCost RHS2(RHS);
234 return *this <= RHS2;
235 }
236
237 bool operator>=(const CostType RHS) const {
238 InstructionCost RHS2(RHS);
239 return *this >= RHS2;
240 }
241
242 void print(raw_ostream &OS) const;
243
244 template <class Function>
245 auto map(const Function &F) const -> InstructionCost {
246 if (isValid())
247 return F(*getValue());
248 return getInvalid();
249 }
250};
251
252inline InstructionCost operator+(const InstructionCost &LHS,
253 const InstructionCost &RHS) {
254 InstructionCost LHS2(LHS);
255 LHS2 += RHS;
256 return LHS2;
257}
258
259inline InstructionCost operator-(const InstructionCost &LHS,
260 const InstructionCost &RHS) {
261 InstructionCost LHS2(LHS);
262 LHS2 -= RHS;
263 return LHS2;
264}
265
266inline InstructionCost operator*(const InstructionCost &LHS,
267 const InstructionCost &RHS) {
268 InstructionCost LHS2(LHS);
269 LHS2 *= RHS;
270 return LHS2;
271}
272
273inline InstructionCost operator/(const InstructionCost &LHS,
274 const InstructionCost &RHS) {
275 InstructionCost LHS2(LHS);
276 LHS2 /= RHS;
277 return LHS2;
278}
279
280inline raw_ostream &operator<<(raw_ostream &OS, const InstructionCost &V) {
281 V.print(OS);
282 return OS;
283}
284
285} // namespace llvm
286
287#endif
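
The saturation and Invalid-propagation behaviour documented in this header can be exercised with a minimal usage sketch, assuming the LLVM headers and libraries are available when compiling (this is editorial example code, not part of the analyzed file). It also shows the lexicographic comparison rule: any valid cost compares less than an invalid one.

#include "llvm/Support/InstructionCost.h"
#include <cassert>
using llvm::InstructionCost;

void example() {
  InstructionCost A = 5, B = 7;
  InstructionCost Sum = A + B;              // Valid, value 12
  assert(*Sum.getValue() == 12);

  InstructionCost Big = InstructionCost::getMax();
  Big += 1;                                 // saturates instead of overflowing
  assert(Big == InstructionCost::getMax());

  InstructionCost Bad = InstructionCost::getInvalid();
  InstructionCost C = Sum + Bad;            // Invalid state propagates
  assert(!C.isValid() && !C.getValue().hasValue());

  // Lexicographic comparison: any valid cost is less than an invalid one.
  assert(Sum < Bad);
}
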

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Analysis/TargetTransformInfo.h

1//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This pass exposes codegen information to IR-level passes. Every
10/// transformation that uses codegen information is broken into three parts:
11/// 1. The IR-level analysis pass.
12/// 2. The IR-level transformation interface which provides the needed
13/// information.
14/// 3. Codegen-level implementation which uses target-specific hooks.
15///
16/// This file defines #2, which is the interface that IR-level transformations
17/// use for querying the codegen.
18///
19//===----------------------------------------------------------------------===//
20
21#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
22#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23
24#include "llvm/IR/InstrTypes.h"
25#include "llvm/IR/Operator.h"
26#include "llvm/IR/PassManager.h"
27#include "llvm/Pass.h"
28#include "llvm/Support/AtomicOrdering.h"
29#include "llvm/Support/BranchProbability.h"
30#include "llvm/Support/DataTypes.h"
31#include "llvm/Support/InstructionCost.h"
32#include <functional>
33
34namespace llvm {
35
36namespace Intrinsic {
37typedef unsigned ID;
38}
39
40class AssumptionCache;
41class BlockFrequencyInfo;
42class DominatorTree;
43class BranchInst;
44class CallBase;
45class ExtractElementInst;
46class Function;
47class GlobalValue;
48class InstCombiner;
49class OptimizationRemarkEmitter;
50class IntrinsicInst;
51class LoadInst;
52class LoopAccessInfo;
53class Loop;
54class LoopInfo;
55class ProfileSummaryInfo;
56class RecurrenceDescriptor;
57class SCEV;
58class ScalarEvolution;
59class StoreInst;
60class SwitchInst;
61class TargetLibraryInfo;
62class Type;
63class User;
64class Value;
65class VPIntrinsic;
66struct KnownBits;
67template <typename T> class Optional;
68
69/// Information about a load/store intrinsic defined by the target.
70struct MemIntrinsicInfo {
71 /// This is the pointer that the intrinsic is loading from or storing to.
72 /// If this is non-null, then analysis/optimization passes can assume that
73 /// this intrinsic is functionally equivalent to a load/store from this
74 /// pointer.
75 Value *PtrVal = nullptr;
76
77 // Ordering for atomic operations.
78 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
79
80 // Same Id is set by the target for corresponding load/store intrinsics.
81 unsigned short MatchingId = 0;
82
83 bool ReadMem = false;
84 bool WriteMem = false;
85 bool IsVolatile = false;
86
87 bool isUnordered() const {
88 return (Ordering == AtomicOrdering::NotAtomic ||
89 Ordering == AtomicOrdering::Unordered) &&
90 !IsVolatile;
91 }
92};
93
94/// Attributes of a target dependent hardware loop.
95struct HardwareLoopInfo {
96 HardwareLoopInfo() = delete;
97 HardwareLoopInfo(Loop *L) : L(L) {}
98 Loop *L = nullptr;
99 BasicBlock *ExitBlock = nullptr;
100 BranchInst *ExitBranch = nullptr;
101 const SCEV *ExitCount = nullptr;
102 IntegerType *CountType = nullptr;
103 Value *LoopDecrement = nullptr; // Decrement the loop counter by this
104 // value in every iteration.
105 bool IsNestingLegal = false; // Can a hardware loop be a parent to
106 // another hardware loop?
107 bool CounterInReg = false; // Should loop counter be updated in
108 // the loop via a phi?
109 bool PerformEntryTest = false; // Generate the intrinsic which also performs
110 // icmp ne zero on the loop counter value and
111 // produces an i1 to guard the loop entry.
112 bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
113 DominatorTree &DT, bool ForceNestedLoop = false,
114 bool ForceHardwareLoopPHI = false);
115 bool canAnalyze(LoopInfo &LI);
116};
117
118class IntrinsicCostAttributes {
119 const IntrinsicInst *II = nullptr;
120 Type *RetTy = nullptr;
121 Intrinsic::ID IID;
122 SmallVector<Type *, 4> ParamTys;
123 SmallVector<const Value *, 4> Arguments;
124 FastMathFlags FMF;
125 // If ScalarizationCost is UINT_MAX, the cost of scalarizing the
126 // arguments and the return value will be computed based on types.
127 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
128
129public:
130 IntrinsicCostAttributes(
131 Intrinsic::ID Id, const CallBase &CI,
132 InstructionCost ScalarCost = InstructionCost::getInvalid());
133
134 IntrinsicCostAttributes(
135 Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
136 FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
137 InstructionCost ScalarCost = InstructionCost::getInvalid());
138
139 IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
140 ArrayRef<const Value *> Args);
141
142 IntrinsicCostAttributes(
143 Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
144 ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
145 const IntrinsicInst *I = nullptr,
146 InstructionCost ScalarCost = InstructionCost::getInvalid());
147
148 Intrinsic::ID getID() const { return IID; }
149 const IntrinsicInst *getInst() const { return II; }
150 Type *getReturnType() const { return RetTy; }
151 FastMathFlags getFlags() const { return FMF; }
152 InstructionCost getScalarizationCost() const { return ScalarizationCost; }
153 const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
154 const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
155
156 bool isTypeBasedOnly() const {
157 return Arguments.empty();
13
Calling 'SmallVectorBase::empty'
16
Returning from 'SmallVectorBase::empty'
17
Returning zero, which participates in a condition later
158 }
159
160 bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
161};
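
A minimal usage sketch of this class, assuming a TargetTransformInfo and an LLVMContext are already in scope and that the usual TargetTransformInfo::getIntrinsicInstrCost entry point is used (illustrative only; it relies solely on the (ID, RetTy, ArgTys) constructor declared above):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Type-only cost query for llvm.fmuladd.f32: no call site is available, so
// the cost is computed purely from the return/argument types.
InstructionCost fmuladdCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *FloatTy = Type::getFloatTy(Ctx);
  SmallVector<Type *, 3> ArgTys = {FloatTy, FloatTy, FloatTy};
  IntrinsicCostAttributes Attrs(Intrinsic::fmuladd, FloatTy, ArgTys);
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}
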
162
163class TargetTransformInfo;
164typedef TargetTransformInfo TTI;
165
166/// This pass provides access to the codegen interfaces that are needed
167/// for IR-level transformations.
168class TargetTransformInfo {
169public:
170 /// Construct a TTI object using a type implementing the \c Concept
171 /// API below.
172 ///
173 /// This is used by targets to construct a TTI wrapping their target-specific
174 /// implementation that encodes appropriate costs for their target.
175 template <typename T> TargetTransformInfo(T Impl);
176
177 /// Construct a baseline TTI object using a minimal implementation of
178 /// the \c Concept API below.
179 ///
180 /// The TTI implementation will reflect the information in the DataLayout
181 /// provided if non-null.
182 explicit TargetTransformInfo(const DataLayout &DL);
183
184 // Provide move semantics.
185 TargetTransformInfo(TargetTransformInfo &&Arg);
186 TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
187
188 // We need to define the destructor out-of-line to define our sub-classes
189 // out-of-line.
190 ~TargetTransformInfo();
191
192 /// Handle the invalidation of this information.
193 ///
194 /// When used as a result of \c TargetIRAnalysis this method will be called
195 /// when the function this was computed for changes. When it returns false,
196 /// the information is preserved across those changes.
197 bool invalidate(Function &, const PreservedAnalyses &,
198 FunctionAnalysisManager::Invalidator &) {
199 // FIXME: We should probably in some way ensure that the subtarget
200 // information for a function hasn't changed.
201 return false;
202 }
203
204 /// \name Generic Target Information
205 /// @{
206
207 /// The kind of cost model.
208 ///
209 /// There are several different cost models that can be customized by the
210 /// target. The normalization of each cost model may be target specific.
211 enum TargetCostKind {
212 TCK_RecipThroughput, ///< Reciprocal throughput.
213 TCK_Latency, ///< The latency of instruction.
214 TCK_CodeSize, ///< Instruction code size.
215 TCK_SizeAndLatency ///< The weighted sum of size and latency.
216 };
217
218 /// Query the cost of a specified instruction.
219 ///
220 /// Clients should use this interface to query the cost of an existing
221 /// instruction. The instruction must have a valid parent (basic block).
222 ///
223 /// Note, this method does not cache the cost calculation and it
224 /// can be expensive in some cases.
225 InstructionCost getInstructionCost(const Instruction *I,
226 enum TargetCostKind kind) const {
227 InstructionCost Cost;
228 switch (kind) {
229 case TCK_RecipThroughput:
230 Cost = getInstructionThroughput(I);
231 break;
232 case TCK_Latency:
233 Cost = getInstructionLatency(I);
234 break;
235 case TCK_CodeSize:
236 case TCK_SizeAndLatency:
237 Cost = getUserCost(I, kind);
238 break;
239 }
240 return Cost;
241 }
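
A short sketch of how a client might drive getInstructionCost, assuming the TargetTransformInfo for the function has already been obtained (for example from TargetIRAnalysis); editorial example code, not part of the analyzed file.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sums reciprocal-throughput costs over every instruction in a function.
InstructionCost functionThroughputCost(const Function &F,
                                       const TargetTransformInfo &TTI) {
  InstructionCost Total = 0;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      Total += TTI.getInstructionCost(&I,
                                      TargetTransformInfo::TCK_RecipThroughput);
  return Total;
}
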
242
243 /// Underlying constants for 'cost' values in this interface.
244 ///
245 /// Many APIs in this interface return a cost. This enum defines the
246 /// fundamental values that should be used to interpret (and produce) those
247 /// costs. The costs are returned as an int rather than a member of this
248 /// enumeration because it is expected that the cost of one IR instruction
249 /// may have a multiplicative factor to it or otherwise won't fit directly
250 /// into the enum. Moreover, it is common to sum or average costs which works
251 /// better as simple integral values. Thus this enum only provides constants.
252 /// Also note that the returned costs are signed integers to make it natural
253 /// to add, subtract, and test with zero (a common boundary condition). It is
254 /// not expected that 2^32 is a realistic cost to be modeling at any point.
255 ///
256 /// Note that these costs should usually reflect the intersection of code-size
257 /// cost and execution cost. A free instruction is typically one that folds
258 /// into another instruction. For example, reg-to-reg moves can often be
259 /// skipped by renaming the registers in the CPU, but they still are encoded
260 /// and thus wouldn't be considered 'free' here.
261 enum TargetCostConstants {
262 TCC_Free = 0, ///< Expected to fold away in lowering.
263 TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
264 TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
265 };
266
267 /// Estimate the cost of a GEP operation when lowered.
268 InstructionCost
269 getGEPCost(Type *PointeeType, const Value *Ptr,
270 ArrayRef<const Value *> Operands,
271 TargetCostKind CostKind = TCK_SizeAndLatency) const;
272
273 /// \returns A value by which our inlining threshold should be multiplied.
274 /// This is primarily used to bump up the inlining threshold wholesale on
275 /// targets where calls are unusually expensive.
276 ///
277 /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
278 /// individual classes of instructions would be better.
279 unsigned getInliningThresholdMultiplier() const;
280
281 /// \returns A value to be added to the inlining threshold.
282 unsigned adjustInliningThreshold(const CallBase *CB) const;
283
284 /// \returns Vector bonus in percent.
285 ///
286 /// Vector bonuses: We want to more aggressively inline vector-dense kernels
287 /// and apply this bonus based on the percentage of vector instructions. A
288 /// bonus is applied if the vector instructions exceed 50% and half that
289 /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
290 /// arbitrary and evolved over time by accident as much as because they are
291 /// principled bonuses.
292 /// FIXME: It would be nice to base the bonus values on something more
293 /// scientific. A target may have no bonus on vector instructions.
294 int getInlinerVectorBonusPercent() const;
295
296 /// \return the expected cost of a memcpy, which could e.g. depend on the
297 /// source/destination type and alignment and the number of bytes copied.
298 InstructionCost getMemcpyCost(const Instruction *I) const;
299
300 /// \return The estimated number of case clusters when lowering \p 'SI'.
301 /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
302 /// table.
303 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
304 unsigned &JTSize,
305 ProfileSummaryInfo *PSI,
306 BlockFrequencyInfo *BFI) const;
307
308 /// Estimate the cost of a given IR user when lowered.
309 ///
310 /// This can estimate the cost of either a ConstantExpr or Instruction when
311 /// lowered.
312 ///
313 /// \p Operands is a list of operands which can be a result of transformations
314 /// of the current operands. The number of operands on the list must equal
315 /// the number of the current operands the IR user has. Their order on the
316 /// list must be the same as the order of the current operands the IR user
317 /// has.
318 ///
319 /// The returned cost is defined in terms of \c TargetCostConstants, see its
320 /// comments for a detailed explanation of the cost values.
321 InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
322 TargetCostKind CostKind) const;
323
324 /// This is a helper function which calls the two-argument getUserCost
325 /// with \p Operands which are the current operands U has.
326 InstructionCost getUserCost(const User *U, TargetCostKind CostKind) const {
327 SmallVector<const Value *, 4> Operands(U->operand_values());
328 return getUserCost(U, Operands, CostKind);
329 }
330
331 /// If a branch or a select condition is skewed in one direction by more than
332 /// this factor, it is very likely to be predicted correctly.
333 BranchProbability getPredictableBranchThreshold() const;
334
335 /// Return true if branch divergence exists.
336 ///
337 /// Branch divergence has a significantly negative impact on GPU performance
338 /// when threads in the same wavefront take different paths due to conditional
339 /// branches.
340 bool hasBranchDivergence() const;
341
342 /// Return true if the target prefers to use GPU divergence analysis to
343 /// replace the legacy version.
344 bool useGPUDivergenceAnalysis() const;
345
346 /// Returns whether V is a source of divergence.
347 ///
348 /// This function provides the target-dependent information for
349 /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
350 /// first builds the dependency graph, and then runs the reachability
351 /// algorithm starting with the sources of divergence.
352 bool isSourceOfDivergence(const Value *V) const;
353
354 // Returns true for the target specific
355 // set of operations which produce uniform result
356 // even taking non-uniform arguments
357 bool isAlwaysUniform(const Value *V) const;
358
359 /// Returns the address space ID for a target's 'flat' address space. Note
360 /// this is not necessarily the same as addrspace(0), which LLVM sometimes
361 /// refers to as the generic address space. The flat address space is a
362 /// generic address space that can be used to access multiple segments of memory
363 /// with different address spaces. Access of a memory location through a
364 /// pointer with this address space is expected to be legal but slower
365 /// compared to the same memory location accessed through a pointer with a
366 /// different address space.
367 //
368 /// This is for targets with different pointer representations which can
369 /// be converted with the addrspacecast instruction. If a pointer is converted
370 /// to this address space, optimizations should attempt to replace the access
371 /// with the source address space.
372 ///
373 /// \returns ~0u if the target does not have such a flat address space to
374 /// optimize away.
375 unsigned getFlatAddressSpace() const;
376
377 /// Return any intrinsic address operand indexes which may be rewritten if
378 /// they use a flat address space pointer.
379 ///
380 /// \returns true if the intrinsic was handled.
381 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
382 Intrinsic::ID IID) const;
383
384 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
385
386 unsigned getAssumedAddrSpace(const Value *V) const;
387
388 /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
389 /// NewV, which has a different address space. This should happen for every
390 /// operand index that collectFlatAddressOperands returned for the intrinsic.
391 /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
392 /// new value (which may be the original \p II with modified operands).
393 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
394 Value *NewV) const;
395
396 /// Test whether calls to a function lower to actual program function
397 /// calls.
398 ///
399 /// The idea is to test whether the program is likely to require a 'call'
400 /// instruction or equivalent in order to call the given function.
401 ///
402 /// FIXME: It's not clear that this is a good or useful query API. Client's
403 /// should probably move to simpler cost metrics using the above.
404 /// Alternatively, we could split the cost interface into distinct code-size
405 /// and execution-speed costs. This would allow modelling the core of this
406 /// query more accurately as a call is a single small instruction, but
407 /// incurs significant execution cost.
408 bool isLoweredToCall(const Function *F) const;
409
410 struct LSRCost {
411 /// TODO: Some of these could be merged. Also, a lexical ordering
412 /// isn't always optimal.
413 unsigned Insns;
414 unsigned NumRegs;
415 unsigned AddRecCost;
416 unsigned NumIVMuls;
417 unsigned NumBaseAdds;
418 unsigned ImmCost;
419 unsigned SetupCost;
420 unsigned ScaleCost;
421 };
422
423 /// Parameters that control the generic loop unrolling transformation.
424 struct UnrollingPreferences {
425 /// The cost threshold for the unrolled loop. Should be relative to the
426 /// getUserCost values returned by this API, and the expectation is that
427 /// the unrolled loop's instructions when run through that interface should
428 /// not exceed this cost. However, this is only an estimate. Also, specific
429 /// loops may be unrolled even with a cost above this threshold if deemed
430 /// profitable. Set this to UINT_MAX to disable the loop body cost
431 /// restriction.
432 unsigned Threshold;
433 /// If complete unrolling will reduce the cost of the loop, we will boost
434 /// the Threshold by a certain percent to allow more aggressive complete
435 /// unrolling. This value provides the maximum boost percentage that we
436 /// can apply to Threshold (The value should be no less than 100).
437 /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
438 /// MaxPercentThresholdBoost / 100)
439 /// E.g. if complete unrolling reduces the loop execution time by 50%
440 /// then we boost the threshold by the factor of 2x. If unrolling is not
441 /// expected to reduce the running time, then we do not increase the
442 /// threshold.
443 unsigned MaxPercentThresholdBoost;
444 /// The cost threshold for the unrolled loop when optimizing for size (set
445 /// to UINT_MAX to disable).
446 unsigned OptSizeThreshold;
447 /// The cost threshold for the unrolled loop, like Threshold, but used
448 /// for partial/runtime unrolling (set to UINT_MAX to disable).
449 unsigned PartialThreshold;
450 /// The cost threshold for the unrolled loop when optimizing for size, like
451 /// OptSizeThreshold, but used for partial/runtime unrolling (set to
452 /// UINT_MAX to disable).
453 unsigned PartialOptSizeThreshold;
454 /// A forced unrolling factor (the number of concatenated bodies of the
455 /// original loop in the unrolled loop body). When set to 0, the unrolling
456 /// transformation will select an unrolling factor based on the current cost
457 /// threshold and other factors.
458 unsigned Count;
459 /// Default unroll count for loops with run-time trip count.
460 unsigned DefaultUnrollRuntimeCount;
461 // Set the maximum unrolling factor. The unrolling factor may be selected
462 // using the appropriate cost threshold, but may not exceed this number
463 // (set to UINT_MAX to disable). This does not apply in cases where the
464 // loop is being fully unrolled.
465 unsigned MaxCount;
466 /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
467 /// applies even if full unrolling is selected. This allows a target to fall
468 /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
469 unsigned FullUnrollMaxCount;
470 // Represents number of instructions optimized when "back edge"
471 // becomes "fall through" in unrolled loop.
472 // For now we count a conditional branch on a backedge and a comparison
473 // feeding it.
474 unsigned BEInsns;
475 /// Allow partial unrolling (unrolling of loops to expand the size of the
476 /// loop body, not only to eliminate small constant-trip-count loops).
477 bool Partial;
478 /// Allow runtime unrolling (unrolling of loops to expand the size of the
479 /// loop body even when the number of loop iterations is not known at
480 /// compile time).
481 bool Runtime;
482 /// Allow generation of a loop remainder (extra iterations after unroll).
483 bool AllowRemainder;
484 /// Allow emitting expensive instructions (such as divisions) when computing
485 /// the trip count of a loop for runtime unrolling.
486 bool AllowExpensiveTripCount;
487 /// Apply loop unroll on any kind of loop
488 /// (mainly to loops that fail runtime unrolling).
489 bool Force;
490 /// Allow using trip count upper bound to unroll loops.
491 bool UpperBound;
492 /// Allow unrolling of all the iterations of the runtime loop remainder.
493 bool UnrollRemainder;
494 /// Allow unroll and jam. Used to enable unroll and jam for the target.
495 bool UnrollAndJam;
496 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
497 /// value above is used during unroll and jam for the outer loop size.
498 /// This value is used in the same manner to limit the size of the inner
499 /// loop.
500 unsigned UnrollAndJamInnerLoopThreshold;
501 /// Don't allow loop unrolling to simulate more than this number of
502 /// iterations when checking full unroll profitability
503 unsigned MaxIterationsCountToAnalyze;
504 };
505
506 /// Get target-customized preferences for the generic loop unrolling
507 /// transformation. The caller will initialize UP with the current
508 /// target-independent defaults.
509 void getUnrollingPreferences(Loop *L, ScalarEvolution &,
510 UnrollingPreferences &UP,
511 OptimizationRemarkEmitter *ORE) const;
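
A sketch of how a hypothetical target hook might adjust the UnrollingPreferences fields declared above; the hook receives UP pre-populated with the generic defaults, and every concrete value below is an assumption for illustration, not any real target's policy.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Adjusts a few of the UnrollingPreferences knobs (illustrative values only).
void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;          // allow partial unrolling
  UP.Runtime = true;          // allow unrolling of runtime-trip-count loops
  UP.UpperBound = true;       // may use the trip-count upper bound
  UP.Threshold = 300;         // assumed: larger body budget than the default
  UP.MaxCount = 4;            // assumed: never unroll by more than 4x
  UP.UnrollRemainder = false; // keep the remainder loop rolled
}
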
512
513 /// Query the target whether it would be profitable to convert the given loop
514 /// into a hardware loop.
515 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
516 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
517 HardwareLoopInfo &HWLoopInfo) const;
518
519 /// Query the target whether it would be preferred to create a predicated
520 /// vector loop, which can avoid the need to emit a scalar epilogue loop.
521 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
522 AssumptionCache &AC, TargetLibraryInfo *TLI,
523 DominatorTree *DT,
524 const LoopAccessInfo *LAI) const;
525
526 /// Query the target whether lowering of the llvm.get.active.lane.mask
527 /// intrinsic is supported.
528 bool emitGetActiveLaneMask() const;
529
530 // Parameters that control the loop peeling transformation
531 struct PeelingPreferences {
532 /// A forced peeling factor (the number of bodies of the original loop
533 /// that should be peeled off before the loop body). When set to 0,
534 /// a peeling factor is selected based on profile information and other factors.
535 unsigned PeelCount;
536 /// Allow peeling off loop iterations.
537 bool AllowPeeling;
538 /// Allow peeling off loop iterations for loop nests.
539 bool AllowLoopNestsPeeling;
540 /// Allow peeling based on profile. Used to enable peeling off all
541 /// iterations based on the provided profile.
542 /// If the value is true the peeling cost model can decide to peel only
543 /// some iterations and in this case it will set this to false.
544 bool PeelProfiledIterations;
545 };
546
547 /// Get target-customized preferences for the generic loop peeling
548 /// transformation. The caller will initialize \p PP with the current
549 /// target-independent defaults with information from \p L and \p SE.
550 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
551 PeelingPreferences &PP) const;
552
553 /// Targets can implement their own combinations for target-specific
554 /// intrinsics. This function will be called from the InstCombine pass every
555 /// time a target-specific intrinsic is encountered.
556 ///
557 /// \returns None to not do anything target specific or a value that will be
558 /// returned from the InstCombiner. It is possible to return null and stop
559 /// further processing of the intrinsic by returning nullptr.
560 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
561 IntrinsicInst &II) const;
562 /// Can be used to implement target-specific instruction combining.
563 /// \see instCombineIntrinsic
564 Optional<Value *>
565 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
566 APInt DemandedMask, KnownBits &Known,
567 bool &KnownBitsComputed) const;
568 /// Can be used to implement target-specific instruction combining.
569 /// \see instCombineIntrinsic
570 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
571 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
572 APInt &UndefElts2, APInt &UndefElts3,
573 std::function<void(Instruction *, unsigned, APInt, APInt &)>
574 SimplifyAndSetOp) const;
575 /// @}
576
577 /// \name Scalar Target Information
578 /// @{
579
580 /// Flags indicating the kind of support for population count.
581 ///
582 /// Compared to the SW implementation, HW support is supposed to
583 /// significantly boost the performance when the population is dense, and it
584 /// may or may not degrade performance if the population is sparse. A HW
585 /// support is considered as "Fast" if it can outperform, or is on a par
586 /// with, SW implementation when the population is sparse; otherwise, it is
587 /// considered as "Slow".
588 enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
589
590 /// Return true if the specified immediate is legal add immediate, that
591 /// is the target has add instructions which can add a register with the
592 /// immediate without having to materialize the immediate into a register.
593 bool isLegalAddImmediate(int64_t Imm) const;
594
595 /// Return true if the specified immediate is legal icmp immediate,
596 /// that is the target has icmp instructions which can compare a register
597 /// against the immediate without having to materialize the immediate into a
598 /// register.
599 bool isLegalICmpImmediate(int64_t Imm) const;
600
601 /// Return true if the addressing mode represented by AM is legal for
602 /// this target, for a load/store of the specified type.
603 /// The type may be VoidTy, in which case only return true if the addressing
604 /// mode is legal for a load/store of any legal type.
605 /// If target returns true in LSRWithInstrQueries(), I may be valid.
606 /// TODO: Handle pre/postinc as well.
607 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
608 bool HasBaseReg, int64_t Scale,
609 unsigned AddrSpace = 0,
610 Instruction *I = nullptr) const;
611
612 /// Return true if the LSR cost of C1 is lower than that of C2.
613 bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
614 TargetTransformInfo::LSRCost &C2) const;
615
616 /// Return true if LSR major cost is number of registers. Targets which
617 /// implement their own isLSRCostLess and unset number of registers as major
618 /// cost should return false, otherwise return true.
619 bool isNumRegsMajorCostOfLSR() const;
620
621 /// \returns true if LSR should not optimize a chain that includes \p I.
622 bool isProfitableLSRChainElement(Instruction *I) const;
623
624 /// Return true if the target can fuse a compare and branch.
625 /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
626 /// calculation for the instructions in a loop.
627 bool canMacroFuseCmp() const;
628
629 /// Return true if the target can save a compare for loop count, for example
630 /// hardware loop saves a compare.
631 bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
632 DominatorTree *DT, AssumptionCache *AC,
633 TargetLibraryInfo *LibInfo) const;
634
635 enum AddressingModeKind {
636 AMK_PreIndexed,
637 AMK_PostIndexed,
638 AMK_None
639 };
640
641 /// Return the preferred addressing mode LSR should make efforts to generate.
642 AddressingModeKind getPreferredAddressingMode(const Loop *L,
643 ScalarEvolution *SE) const;
644
645 /// Return true if the target supports masked store.
646 bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
647 /// Return true if the target supports masked load.
648 bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
649
650 /// Return true if the target supports nontemporal store.
651 bool isLegalNTStore(Type *DataType, Align Alignment) const;
652 /// Return true if the target supports nontemporal load.
653 bool isLegalNTLoad(Type *DataType, Align Alignment) const;
654
655 /// Return true if the target supports masked scatter.
656 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
657 /// Return true if the target supports masked gather.
658 bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
659
660 /// Return true if the target supports masked compress store.
661 bool isLegalMaskedCompressStore(Type *DataType) const;
662 /// Return true if the target supports masked expand load.
663 bool isLegalMaskedExpandLoad(Type *DataType) const;
664
665 /// Return true if we should be enabling ordered reductions for the target.
666 bool enableOrderedReductions() const;
667
668 /// Return true if the target has a unified operation to calculate division
669 /// and remainder. If so, the additional implicit multiplication and
670 /// subtraction required to calculate a remainder from division are free. This
671 /// can enable more aggressive transformations for division and remainder than
672 /// would typically be allowed using throughput or size cost models.
673 bool hasDivRemOp(Type *DataType, bool IsSigned) const;
674
675 /// Return true if the given instruction (assumed to be a memory access
676 /// instruction) has a volatile variant. If that's the case then we can avoid
677 /// addrspacecast to generic AS for volatile loads/stores. Default
678 /// implementation returns false, which prevents address space inference for
679 /// volatile loads/stores.
680 bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
681
682 /// Return true if target doesn't mind addresses in vectors.
683 bool prefersVectorizedAddressing() const;
684
685 /// Return the cost of the scaling factor used in the addressing
686 /// mode represented by AM for this target, for a load/store
687 /// of the specified type.
688 /// If the AM is supported, the return value must be >= 0.
689 /// If the AM is not supported, it returns a negative value.
690 /// TODO: Handle pre/postinc as well.
691 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
692 int64_t BaseOffset, bool HasBaseReg,
693 int64_t Scale,
694 unsigned AddrSpace = 0) const;
695
696 /// Return true if the loop strength reduce pass should make
697 /// Instruction* based TTI queries to isLegalAddressingMode(). This is
698 /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
699 /// immediate offset and no index register.
700 bool LSRWithInstrQueries() const;
701
702 /// Return true if it's free to truncate a value of type Ty1 to type
703 /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
704 /// by referencing its sub-register AX.
705 bool isTruncateFree(Type *Ty1, Type *Ty2) const;
706
707 /// Return true if it is profitable to hoist instruction in the
708 /// then/else to before if.
709 bool isProfitableToHoist(Instruction *I) const;
710
711 bool useAA() const;
712
713 /// Return true if this type is legal.
714 bool isTypeLegal(Type *Ty) const;
715
716 /// Returns the estimated number of registers required to represent \p Ty.
717 InstructionCost getRegUsageForType(Type *Ty) const;
718
719 /// Return true if switches should be turned into lookup tables for the
720 /// target.
721 bool shouldBuildLookupTables() const;
722
723 /// Return true if switches should be turned into lookup tables
724 /// containing this constant value for the target.
725 bool shouldBuildLookupTablesForConstant(Constant *C) const;
726
727 /// Return true if lookup tables should be turned into relative lookup tables.
728 bool shouldBuildRelLookupTables() const;
729
731 /// Return true if the input function, which is cold at all call sites,
732 /// should use the coldcc calling convention.
732 bool useColdCCForColdCall(Function &F) const;
733
734 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
735 /// are set if the demanded result elements need to be inserted and/or
736 /// extracted from vectors.
737 InstructionCost getScalarizationOverhead(VectorType *Ty,
738 const APInt &DemandedElts,
739 bool Insert, bool Extract) const;
740
741 /// Estimate the overhead of scalarizing an instructions unique
742 /// non-constant operands. The (potentially vector) types to use for each
743 /// argument are passed via Tys.
744 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
745 ArrayRef<Type *> Tys) const;
746
747 /// If target has efficient vector element load/store instructions, it can
748 /// return true here so that insertion/extraction costs are not added to
749 /// the scalarization cost of a load/store.
750 bool supportsEfficientVectorElementLoadStore() const;
751
752 /// Don't restrict interleaved unrolling to small loops.
753 bool enableAggressiveInterleaving(bool LoopHasReductions) const;
754
755 /// Returns options for expansion of memcmp. IsZeroCmp is
756 // true if this is the expansion of memcmp(p1, p2, s) == 0.
757 struct MemCmpExpansionOptions {
758 // Return true if memcmp expansion is enabled.
759 operator bool() const { return MaxNumLoads > 0; }
760
761 // Maximum number of load operations.
762 unsigned MaxNumLoads = 0;
763
764 // The list of available load sizes (in bytes), sorted in decreasing order.
765 SmallVector<unsigned, 8> LoadSizes;
766
767 // For memcmp expansion when the memcmp result is only compared equal or
768 // not-equal to 0, allow up to this number of load pairs per block. As an
769 // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
770 // a0 = load2bytes &a[0]
771 // b0 = load2bytes &b[0]
772 // a2 = load1byte &a[2]
773 // b2 = load1byte &b[2]
774 // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
775 unsigned NumLoadsPerBlock = 1;
776
777 // Set to true to allow overlapping loads. For example, 7-byte compares can
778 // be done with two 4-byte compares instead of 4+2+1-byte compares. This
779 // requires all loads in LoadSizes to be doable in an unaligned way.
780 bool AllowOverlappingLoads = false;
781 };
782 MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
783 bool IsZeroCmp) const;
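
The options struct above is typically filled in by a target's enableMemCmpExpansion hook; the sketch below shows one plausible way to populate it, where every limit and load size is an assumption for illustration rather than a real target policy.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Builds a MemCmpExpansionOptions value with assumed, illustrative limits.
TargetTransformInfo::MemCmpExpansionOptions
exampleMemCmpOptions(bool OptSize, bool IsZeroCmp) {
  TargetTransformInfo::MemCmpExpansionOptions Options;
  Options.MaxNumLoads = OptSize ? 2 : 8; // assumed per-call load budget
  Options.LoadSizes = {8, 4, 2, 1};      // decreasing order, as documented
  if (IsZeroCmp)
    Options.NumLoadsPerBlock = 2;        // allow load pairs for ==0 compares
  Options.AllowOverlappingLoads = true;  // assumes cheap unaligned loads
  return Options;
}
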
784
785 /// Enable matching of interleaved access groups.
786 bool enableInterleavedAccessVectorization() const;
787
788 /// Enable matching of interleaved access groups that contain predicated
789 /// accesses or gaps and therefore vectorized using masked
790 /// vector loads/stores.
791 bool enableMaskedInterleavedAccessVectorization() const;
792
793 /// Indicate that it is potentially unsafe to automatically vectorize
794 /// floating-point operations because vector and scalar floating-point
795 /// semantics may differ. For example, ARM NEON v7 SIMD math
796 /// does not support IEEE-754 denormal numbers, while depending on the
797 /// platform, scalar floating-point math does.
798 /// This applies to floating-point math operations and calls, not memory
799 /// operations, shuffles, or casts.
800 bool isFPVectorizationPotentiallyUnsafe() const;
801
802 /// Determine if the target supports unaligned memory accesses.
803 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
804 unsigned AddressSpace = 0,
805 Align Alignment = Align(1),
806 bool *Fast = nullptr) const;
807
808 /// Return hardware support for population count.
809 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
810
811 /// Return true if the hardware has a fast square-root instruction.
812 bool haveFastSqrt(Type *Ty) const;
813
814 /// Return true if it is faster to check if a floating-point value is NaN
815 /// (or not-NaN) versus a comparison against a constant FP zero value.
816 /// Targets should override this if materializing a 0.0 for comparison is
817 /// generally as cheap as checking for ordered/unordered.
818 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
819
820 /// Return the expected cost of supporting the floating point operation
821 /// of the specified type.
822 InstructionCost getFPOpCost(Type *Ty) const;
823
824 /// Return the expected cost of materializing for the given integer
825 /// immediate of the specified type.
826 InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
827 TargetCostKind CostKind) const;
828
829 /// Return the expected cost of materialization for the given integer
830 /// immediate of the specified type for a given instruction. The cost can be
831 /// zero if the immediate can be folded into the specified instruction.
832 InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx,
833 const APInt &Imm, Type *Ty,
834 TargetCostKind CostKind,
835 Instruction *Inst = nullptr) const;
836 InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
837 const APInt &Imm, Type *Ty,
838 TargetCostKind CostKind) const;
839
840 /// Return the expected cost for the given integer when optimising
841 /// for size. This is different than the other integer immediate cost
842 /// functions in that it is subtarget agnostic. This is useful when you e.g.
843 /// target one ISA such as Aarch32 but smaller encodings could be possible
844 /// with another such as Thumb. This return value is used as a penalty when
845 /// the total costs for a constant is calculated (the bigger the cost, the
846 /// more beneficial constant hoisting is).
847 InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx,
848 const APInt &Imm, Type *Ty) const;
849 /// @}
850
851 /// \name Vector Target Information
852 /// @{
853
854 /// The various kinds of shuffle patterns for vector queries.
855 enum ShuffleKind {
856 SK_Broadcast, ///< Broadcast element 0 to all other elements.
857 SK_Reverse, ///< Reverse the order of the vector.
858 SK_Select, ///< Selects elements from the corresponding lane of
859 ///< either source operand. This is equivalent to a
860 ///< vector select with a constant condition operand.
861 SK_Transpose, ///< Transpose two vectors.
862 SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
863 SK_ExtractSubvector, ///< ExtractSubvector Index indicates start offset.
864 SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
865 ///< with any shuffle mask.
866 SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any
867 ///< shuffle mask.
868 SK_Splice ///< Concatenates elements from the first input vector
869 ///< with elements of the second input vector. Returning
870 ///< a vector of the same type as the input vectors.
871 };
872