Bug Summary

File: llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
Warning: line 237, column 16
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem 
/usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/ARM -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include <algorithm>
37#include <cassert>
38#include <cstdint>
39#include <utility>
40
41using namespace llvm;
42
43#define DEBUG_TYPE"armtti" "armtti"
44
45static cl::opt<bool> EnableMaskedLoadStores(
46 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47 cl::desc("Enable the generation of masked loads and stores"));
48
49static cl::opt<bool> DisableLowOverheadLoops(
50 "disable-arm-loloops", cl::Hidden, cl::init(false),
51 cl::desc("Disable the generation of low-overhead loops"));
52
53static cl::opt<bool>
54 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55 cl::desc("Enable the generation of WLS loops"));
56
57extern cl::opt<TailPredication::Mode> EnableTailPredication;
58
59extern cl::opt<bool> EnableMaskedGatherScatters;
60
61extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62
63/// Convert a vector load intrinsic into a simple llvm load instruction.
64/// This is beneficial when the underlying object being addressed comes
65/// from a constant, since we get constant-folding for free.
66static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67 InstCombiner::BuilderTy &Builder) {
68 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69
70 if (!IntrAlign)
71 return nullptr;
72
73 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74 ? MemAlign
75 : IntrAlign->getLimitedValue();
76
77 if (!isPowerOf2_32(Alignment))
78 return nullptr;
79
80 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81 PointerType::get(II.getType(), 0));
82 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83}
84
85bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86 const Function *Callee) const {
87 const TargetMachine &TM = getTLI()->getTargetMachine();
88 const FeatureBitset &CallerBits =
89 TM.getSubtargetImpl(*Caller)->getFeatureBits();
90 const FeatureBitset &CalleeBits =
91 TM.getSubtargetImpl(*Callee)->getFeatureBits();
92
93 // To inline a callee, all features not in the allowed list must match exactly.
94 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95 (CalleeBits & ~InlineFeaturesAllowed);
96 // For features in the allowed list, the callee's features must be a subset of
97 // the callers'.
98 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99 (CalleeBits & InlineFeaturesAllowed);
100 return MatchExact && MatchSubset;
101}
102
103TTI::AddressingModeKind
104ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 if (ST->hasMVEIntegerOps())
107 return TTI::AMK_PostIndexed;
108
109 if (L->getHeader()->getParent()->hasOptSize())
110 return TTI::AMK_None;
111
112 if (ST->isMClass() && ST->isThumb2() &&
113 L->getNumBlocks() == 1)
114 return TTI::AMK_PreIndexed;
115
116 return TTI::AMK_None;
117}
118
119Optional<Instruction *>
120ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121 using namespace PatternMatch;
122 Intrinsic::ID IID = II.getIntrinsicID();
123 switch (IID) {
124 default:
125 break;
126 case Intrinsic::arm_neon_vld1: {
127 Align MemAlign =
128 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129 &IC.getAssumptionCache(), &IC.getDominatorTree());
130 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131 return IC.replaceInstUsesWith(II, V);
132 }
133 break;
134 }
135
136 case Intrinsic::arm_neon_vld2:
137 case Intrinsic::arm_neon_vld3:
138 case Intrinsic::arm_neon_vld4:
139 case Intrinsic::arm_neon_vld2lane:
140 case Intrinsic::arm_neon_vld3lane:
141 case Intrinsic::arm_neon_vld4lane:
142 case Intrinsic::arm_neon_vst1:
143 case Intrinsic::arm_neon_vst2:
144 case Intrinsic::arm_neon_vst3:
145 case Intrinsic::arm_neon_vst4:
146 case Intrinsic::arm_neon_vst2lane:
147 case Intrinsic::arm_neon_vst3lane:
148 case Intrinsic::arm_neon_vst4lane: {
149 Align MemAlign =
150 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151 &IC.getAssumptionCache(), &IC.getDominatorTree());
152 unsigned AlignArg = II.getNumArgOperands() - 1;
153 Value *AlignArgOp = II.getArgOperand(AlignArg);
154 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155 if (Align && *Align < MemAlign) {
156 return IC.replaceOperand(
157 II, AlignArg,
158 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159 false));
160 }
161 break;
162 }
163
164 case Intrinsic::arm_mve_pred_i2v: {
165 Value *Arg = II.getArgOperand(0);
166 Value *ArgArg;
167 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168 PatternMatch::m_Value(ArgArg))) &&
169 II.getType() == ArgArg->getType()) {
170 return IC.replaceInstUsesWith(II, ArgArg);
171 }
172 Constant *XorMask;
173 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174 PatternMatch::m_Value(ArgArg)),
175 PatternMatch::m_Constant(XorMask))) &&
176 II.getType() == ArgArg->getType()) {
177 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178 if (CI->getValue().trunc(16).isAllOnesValue()) {
179 auto TrueVector = IC.Builder.CreateVectorSplat(
180 cast<FixedVectorType>(II.getType())->getNumElements(),
181 IC.Builder.getTrue());
182 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183 }
184 }
185 }
186 KnownBits ScalarKnown(32);
187 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188 ScalarKnown, 0)) {
189 return &II;
190 }
191 break;
192 }
193 case Intrinsic::arm_mve_pred_v2i: {
194 Value *Arg = II.getArgOperand(0);
195 Value *ArgArg;
196 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197 PatternMatch::m_Value(ArgArg)))) {
198 return IC.replaceInstUsesWith(II, ArgArg);
199 }
200 if (!II.getMetadata(LLVMContext::MD_range)) {
201 Type *IntTy32 = Type::getInt32Ty(II.getContext());
202 Metadata *M[] = {
203 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206 return &II;
207 }
208 break;
209 }
210 case Intrinsic::arm_mve_vadc:
211 case Intrinsic::arm_mve_vadc_predicated: {
212 unsigned CarryOp =
213 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&(static_cast<void> (0))
215 "Bad type for intrinsic!")(static_cast<void> (0));
216
217 KnownBits CarryKnown(32);
218 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219 CarryKnown)) {
220 return &II;
221 }
222 break;
223 }
224 case Intrinsic::arm_mve_vmldava: {
225 Instruction *I = cast<Instruction>(&II);
226 if (I->hasOneUse()) {
227 auto *User = cast<Instruction>(*I->user_begin());
228 Value *OpZ;
229 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230 match(I->getOperand(3), m_Zero())) {
231 Value *OpX = I->getOperand(4);
232 Value *OpY = I->getOperand(5);
233 Type *OpTy = OpX->getType();
234
235 IC.Builder.SetInsertPoint(User);
236 Value *V =
237 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238 {I->getOperand(0), I->getOperand(1),
239 I->getOperand(2), OpZ, OpX, OpY});
240
241 IC.replaceInstUsesWith(*User, V);
242 return IC.eraseInstFromFunction(*User);
243 }
244 }
245 return None;
246 }
247 }
248 return None;
249}
250
251InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252 TTI::TargetCostKind CostKind) {
253 assert(Ty->isIntegerTy())(static_cast<void> (0));
254
255 unsigned Bits = Ty->getPrimitiveSizeInBits();
256 if (Bits == 0 || Imm.getActiveBits() >= 64)
9
Assuming 'Bits' is not equal to 0
10
Assuming the condition is false
11
Taking false branch
257 return 4;
258
259 int64_t SImmVal = Imm.getSExtValue();
260 uint64_t ZImmVal = Imm.getZExtValue();
261 if (!ST->isThumb()) {
12
Taking false branch
262 if ((SImmVal >= 0 && SImmVal < 65536) ||
263 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265 return 1;
266 return ST->hasV6T2Ops() ? 2 : 3;
267 }
268 if (ST->isThumb2()) {
269 if ((SImmVal >= 0 && SImmVal < 65536) ||
270 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272 return 1;
273 return ST->hasV6T2Ops() ? 2 : 3;
274 }
275 // Thumb1, any i8 imm cost 1.
276 if (Bits == 8 || (SImmVal
13.1
'SImmVal' is >= 0
13.1
'SImmVal' is >= 0
13.1
'SImmVal' is >= 0
>= 0 && SImmVal < 256))
13
Assuming 'Bits' is not equal to 8
14
Assuming 'SImmVal' is >= 256
277 return 1;
278 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
15
Assuming the condition is false
16
Calling 'isThumbImmShiftedVal'
279 return 2;
280 // Load from constantpool.
281 return 3;
282}
283
284// Constants smaller than 256 fit in the immediate field of
285// Thumb1 instructions so we return a zero cost and 1 otherwise.
286InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287 const APInt &Imm, Type *Ty) {
288 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289 return 0;
290
291 return 1;
292}
293
294// Checks whether Inst is part of a min(max()) or max(min()) pattern
295// that will match to an SSAT instruction
296static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297 Value *LHS, *RHS;
298 ConstantInt *C;
299 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300
301 if (InstSPF == SPF_SMAX &&
302 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303 C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304
305 auto isSSatMin = [&](Value *MinInst) {
306 if (isa<SelectInst>(MinInst)) {
307 Value *MinLHS, *MinRHS;
308 ConstantInt *MinC;
309 SelectPatternFlavor MinSPF =
310 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311 if (MinSPF == SPF_SMIN &&
312 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313 MinC->getValue() == ((-Imm) - 1))
314 return true;
315 }
316 return false;
317 };
318
319 if (isSSatMin(Inst->getOperand(1)) ||
320 (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321 isSSatMin(*(++Inst->user_begin())))))
322 return true;
323 }
324 return false;
325}
326
327InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328 const APInt &Imm, Type *Ty,
329 TTI::TargetCostKind CostKind,
330 Instruction *Inst) {
331 // Division by a constant can be turned into multiplication, but only if we
332 // know it's constant. So it's not so much that the immediate is cheap (it's
333 // not), but that the alternative is worse.
334 // FIXME: this is probably unneeded with GlobalISel.
335 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
1
Assuming 'Opcode' is not equal to SDiv
2
Assuming 'Opcode' is not equal to UDiv
336 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
3
Assuming 'Opcode' is not equal to SRem
4
Assuming 'Opcode' is not equal to URem
337 Idx == 1)
338 return 0;
339
340 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
341 // splitting any large offsets.
342 if (Opcode == Instruction::GetElementPtr && Idx != 0)
5
Assuming 'Opcode' is not equal to GetElementPtr
343 return 0;
344
345 if (Opcode == Instruction::And) {
6
Assuming 'Opcode' is equal to And
346 // UXTB/UXTH
347 if (Imm == 255 || Imm == 65535)
7
Taking false branch
348 return 0;
349 // Conversion to BIC is free, and means we can use ~Imm instead.
350 return std::min(getIntImmCost(Imm, Ty, CostKind),
351 getIntImmCost(~Imm, Ty, CostKind));
8
Calling 'ARMTTIImpl::getIntImmCost'
352 }
353
354 if (Opcode == Instruction::Add)
355 // Conversion to SUB is free, and means we can use -Imm instead.
356 return std::min(getIntImmCost(Imm, Ty, CostKind),
357 getIntImmCost(-Imm, Ty, CostKind));
358
359 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
360 Ty->getIntegerBitWidth() == 32) {
361 int64_t NegImm = -Imm.getSExtValue();
362 if (ST->isThumb2() && NegImm < 1<<12)
363 // icmp X, #-C -> cmn X, #C
364 return 0;
365 if (ST->isThumb() && NegImm < 1<<8)
366 // icmp X, #-C -> adds X, #C
367 return 0;
368 }
369
370 // xor a, -1 can always be folded to MVN
371 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
372 return 0;
373
374 // Ensures negative constant of min(max()) or max(min()) patterns that
375 // match to SSAT instructions don't get hoisted
376 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
377 Ty->getIntegerBitWidth() <= 32) {
378 if (isSSATMinMaxPattern(Inst, Imm) ||
379 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
380 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
381 return 0;
382 }
383
384 return getIntImmCost(Imm, Ty, CostKind);
385}
386
387InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
388 TTI::TargetCostKind CostKind,
389 const Instruction *I) {
390 if (CostKind == TTI::TCK_RecipThroughput &&
391 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
392 // FIXME: The vectorizer is highly sensistive to the cost of these
393 // instructions, which suggests that it may be using the costs incorrectly.
394 // But, for now, just make them free to avoid performance regressions for
395 // vector targets.
396 return 0;
397 }
398 return BaseT::getCFInstrCost(Opcode, CostKind, I);
399}
400
401InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
402 Type *Src,
403 TTI::CastContextHint CCH,
404 TTI::TargetCostKind CostKind,
405 const Instruction *I) {
406 int ISD = TLI->InstructionOpcodeToISD(Opcode);
407 assert(ISD && "Invalid opcode")(static_cast<void> (0));
408
409 // TODO: Allow non-throughput costs that aren't binary.
410 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
411 if (CostKind != TTI::TCK_RecipThroughput)
412 return Cost == 0 ? 0 : 1;
413 return Cost;
414 };
415 auto IsLegalFPType = [this](EVT VT) {
416 EVT EltVT = VT.getScalarType();
417 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
418 (EltVT == MVT::f64 && ST->hasFP64()) ||
419 (EltVT == MVT::f16 && ST->hasFullFP16());
420 };
421
422 EVT SrcTy = TLI->getValueType(DL, Src);
423 EVT DstTy = TLI->getValueType(DL, Dst);
424
425 if (!SrcTy.isSimple() || !DstTy.isSimple())
426 return AdjustCost(
427 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
428
429 // Extending masked load/Truncating masked stores is expensive because we
430 // currently don't split them. This means that we'll likely end up
431 // loading/storing each element individually (hence the high cost).
432 if ((ST->hasMVEIntegerOps() &&
433 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
434 Opcode == Instruction::SExt)) ||
435 (ST->hasMVEFloatOps() &&
436 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
437 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
438 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
439 return 2 * DstTy.getVectorNumElements() *
440 ST->getMVEVectorCostFactor(CostKind);
441
442 // The extend of other kinds of load is free
443 if (CCH == TTI::CastContextHint::Normal ||
444 CCH == TTI::CastContextHint::Masked) {
445 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
446 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
447 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
448 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
449 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
450 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
451 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
452 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
453 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
454 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
455 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
456 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
457 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
458 };
459 if (const auto *Entry = ConvertCostTableLookup(
460 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
461 return AdjustCost(Entry->Cost);
462
463 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
464 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
465 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
466 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
467 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
468 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
469 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
470 // The following extend from a legal type to an illegal type, so need to
471 // split the load. This introduced an extra load operation, but the
472 // extend is still "free".
473 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
474 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
475 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
476 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
477 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
478 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
479 };
480 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
481 if (const auto *Entry =
482 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
483 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
484 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
485 }
486
487 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
488 // FPExtends are similar but also require the VCVT instructions.
489 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
490 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
491 };
492 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
493 if (const auto *Entry =
494 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
495 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
496 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
497 }
498
499 // The truncate of a store is free. This is the mirror of extends above.
500 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
501 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
502 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
503 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
504 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
505 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
506 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
507 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
508 };
509 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
510 if (const auto *Entry =
511 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
512 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
513 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
514 }
515
516 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
517 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
518 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
519 };
520 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
521 if (const auto *Entry =
522 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
523 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
524 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
525 }
526 }
527
528 // NEON vector operations that can extend their inputs.
529 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
530 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
531 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
532 // vaddl
533 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
534 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
535 // vsubl
536 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
537 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
538 // vmull
539 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
540 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
541 // vshll
542 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
543 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
544 };
545
546 auto *User = cast<Instruction>(*I->user_begin());
547 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
548 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
549 DstTy.getSimpleVT(),
550 SrcTy.getSimpleVT())) {
551 return AdjustCost(Entry->Cost);
552 }
553 }
554
555 // Single to/from double precision conversions.
556 if (Src->isVectorTy() && ST->hasNEON() &&
557 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
558 DstTy.getScalarType() == MVT::f32) ||
559 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
560 DstTy.getScalarType() == MVT::f64))) {
561 static const CostTblEntry NEONFltDblTbl[] = {
562 // Vector fptrunc/fpext conversions.
563 {ISD::FP_ROUND, MVT::v2f64, 2},
564 {ISD::FP_EXTEND, MVT::v2f32, 2},
565 {ISD::FP_EXTEND, MVT::v4f32, 4}};
566
567 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
568 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
569 return AdjustCost(LT.first * Entry->Cost);
570 }
571
572 // Some arithmetic, load and store operations have specific instructions
573 // to cast up/down their types automatically at no extra cost.
574 // TODO: Get these tables to know at least what the related operations are.
575 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
576 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
577 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
578 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
579 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
580 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
581 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
582
583 // The number of vmovl instructions for the extension.
584 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
585 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
586 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
587 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
588 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
589 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
590 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
591 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
592 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
594 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
595 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
596 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
597 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
598 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
599 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
600 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
601 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
602
603 // Operations that we legalize using splitting.
604 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
605 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
606
607 // Vector float <-> i32 conversions.
608 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
609 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
610
611 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
612 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
613 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
614 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
615 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
616 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
617 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
618 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
619 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
620 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
621 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
622 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
623 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
624 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
625 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
626 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
627 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
628 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
629 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
630 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
631
632 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
633 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
634 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
635 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
636 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
637 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
638
639 // Vector double <-> i32 conversions.
640 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
641 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
642
643 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
644 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
645 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
646 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
647 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
648 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
649
650 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
651 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
652 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
653 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
654 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
655 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
656 };
657
658 if (SrcTy.isVector() && ST->hasNEON()) {
659 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
660 DstTy.getSimpleVT(),
661 SrcTy.getSimpleVT()))
662 return AdjustCost(Entry->Cost);
663 }
664
665 // Scalar float to integer conversions.
666 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
667 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
668 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
669 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
670 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
671 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
672 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
673 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
674 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
675 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
676 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
677 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
678 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
679 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
680 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
681 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
682 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
683 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
684 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
685 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
686 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
687 };
688 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
689 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
690 DstTy.getSimpleVT(),
691 SrcTy.getSimpleVT()))
692 return AdjustCost(Entry->Cost);
693 }
694
695 // Scalar integer to float conversions.
696 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
697 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
698 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
699 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
700 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
701 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
702 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
703 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
704 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
705 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
706 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
707 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
708 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
709 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
710 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
711 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
712 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
713 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
714 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
715 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
716 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
717 };
718
719 if (SrcTy.isInteger() && ST->hasNEON()) {
720 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
721 ISD, DstTy.getSimpleVT(),
722 SrcTy.getSimpleVT()))
723 return AdjustCost(Entry->Cost);
724 }
725
726 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
727 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
728 // are linearised so take more.
729 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
730 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
731 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
732 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
733 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
734 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
735 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
736 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
737 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
738 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
739 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
740 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
741 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
742 };
743
744 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
745 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
746 ISD, DstTy.getSimpleVT(),
747 SrcTy.getSimpleVT()))
748 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
749 }
750
751 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
752 // As general rule, fp converts that were not matched above are scalarized
753 // and cost 1 vcvt for each lane, so long as the instruction is available.
754 // If not it will become a series of function calls.
755 const InstructionCost CallCost =
756 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
757 int Lanes = 1;
758 if (SrcTy.isFixedLengthVector())
759 Lanes = SrcTy.getVectorNumElements();
760
761 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
762 return Lanes;
763 else
764 return Lanes * CallCost;
765 }
766
767 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
768 SrcTy.isFixedLengthVector()) {
769 // Treat a truncate with larger than legal source (128bits for MVE) as
770 // expensive, 2 instructions per lane.
771 if ((SrcTy.getScalarType() == MVT::i8 ||
772 SrcTy.getScalarType() == MVT::i16 ||
773 SrcTy.getScalarType() == MVT::i32) &&
774 SrcTy.getSizeInBits() > 128 &&
775 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
776 return SrcTy.getVectorNumElements() * 2;
777 }
778
779 // Scalar integer conversion costs.
780 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
781 // i16 -> i64 requires two dependent operations.
782 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
783
784 // Truncates on i64 are assumed to be free.
785 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
786 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
787 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
788 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
789 };
790
791 if (SrcTy.isInteger()) {
792 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
793 DstTy.getSimpleVT(),
794 SrcTy.getSimpleVT()))
795 return AdjustCost(Entry->Cost);
796 }
797
798 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
799 ? ST->getMVEVectorCostFactor(CostKind)
800 : 1;
801 return AdjustCost(
802 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
803}
804
805InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
806 unsigned Index) {
807 // Penalize inserting into an D-subregister. We end up with a three times
808 // lower estimated throughput on swift.
809 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
810 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
811 return 3;
812
813 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
814 Opcode == Instruction::ExtractElement)) {
815 // Cross-class copies are expensive on many microarchitectures,
816 // so assume they are expensive by default.
817 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
818 return 3;
819
820 // Even if it's not a cross class copy, this likely leads to mixing
821 // of NEON and VFP code and should be therefore penalized.
822 if (ValTy->isVectorTy() &&
823 ValTy->getScalarSizeInBits() <= 32)
824 return std::max<InstructionCost>(
825 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
826 }
827
828 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
829 Opcode == Instruction::ExtractElement)) {
830 // Integer cross-lane moves are more expensive than float, which can
831 // sometimes just be vmovs. Integer involve being passes to GPR registers,
832 // causing more of a delay.
833 std::pair<InstructionCost, MVT> LT =
834 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
835 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
836 }
837
838 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
839}
840
// Cost model for scalar/vector compares and selects. Handles Thumb code-size
// selects, vector min/max/abs patterns that lower to a single intrinsic, NEON
// vbsl-based selects, and MVE compares; falls back to the base implementation.
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  // If costing the compare of a cmp+select pair, look through to the select so
  // the whole pattern is recognised (the compare itself is then free below).
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    // Map the matched select pattern onto the equivalent intrinsic, if any.
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    // Otherwise one vbsl per legalized register.
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    // CondTy may be null from some callers; synthesize the vXi1 result type.
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalaization insert, one scalarization extract and the cost of the
      // fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred, CostKind,
                                    I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
986
987InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
988 ScalarEvolution *SE,
989 const SCEV *Ptr) {
990 // Address computations in vectorized code with non-consecutive addresses will
991 // likely result in more instructions compared to scalar code where the
992 // computation can more often be merged into the index mode. The resulting
993 // extra micro-ops can significantly decrease throughput.
994 unsigned NumVectorInstToHideOverhead = 10;
995 int MaxMergeDistance = 64;
996
997 if (ST->hasNEON()) {
998 if (Ty->isVectorTy() && SE &&
999 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1000 return NumVectorInstToHideOverhead;
1001
1002 // In many cases the address computation is not merged into the instruction
1003 // addressing mode.
1004 return 1;
1005 }
1006 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1007}
1008
1009bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1010 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1011 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1012 // optimized, else LSR may block tail-predication.
1013 switch (II->getIntrinsicID()) {
1014 case Intrinsic::arm_mve_vctp8:
1015 case Intrinsic::arm_mve_vctp16:
1016 case Intrinsic::arm_mve_vctp32:
1017 case Intrinsic::arm_mve_vctp64:
1018 return true;
1019 default:
1020 break;
1021 }
1022 }
1023 return false;
1024}
1025
1026bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1027 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1028 return false;
1029
1030 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1031 // Don't support v2i1 yet.
1032 if (VecTy->getNumElements() == 2)
1033 return false;
1034
1035 // We don't support extending fp types.
1036 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1037 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1038 return false;
1039 }
1040
1041 unsigned EltWidth = DataTy->getScalarSizeInBits();
1042 return (EltWidth == 32 && Alignment >= 4) ||
1043 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1044}
1045
1046bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1047 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1048 return false;
1049
1050 // This method is called in 2 places:
1051 // - from the vectorizer with a scalar type, in which case we need to get
1052 // this as good as we can with the limited info we have (and rely on the cost
1053 // model for the rest).
1054 // - from the masked intrinsic lowering pass with the actual vector type.
1055 // For MVE, we have a custom lowering pass that will already have custom
1056 // legalised any gathers that we can to MVE intrinsics, and want to expand all
1057 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1058 // are here, we know we want to expand.
1059 if (isa<VectorType>(Ty))
1060 return false;
1061
1062 unsigned EltWidth = Ty->getScalarSizeInBits();
1063 return ((EltWidth == 32 && Alignment >= 4) ||
1064 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1065}
1066
/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
/// call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  // ~0u marks "no address space determined yet".
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    // memcpy/memmove: both source and destination alignment matter.
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  }
  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    // memset: only the destination side exists.
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  }
  else
    llvm_unreachable("Expected a memcpy/move or memset!");

  // Pick the per-op store limit for this intrinsic kind; memset only stores
  // (no load), hence Factor = 1 instead of the load+store Factor = 2.
  unsigned Limit, Factor = 2;
  switch(I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that needs to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost
  return -1;
}
1135
1136InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1137 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1138
1139 // To model the cost of a library call, we assume 1 for the call, and
1140 // 3 for the argument setup.
1141 if (NumOps == -1)
1142 return 4;
1143 return NumOps;
1144}
1145
// Cost model for vector shuffles: table-driven for NEON broadcast/reverse/
// select shuffles and MVE broadcasts/vrev patterns, otherwise deferring to the
// (MVE-cost-factor-scaled) base implementation.
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  // Refine the shuffle kind from the concrete mask, if one was provided.
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions
          // required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    if (!Mask.empty()) {
      // A mask that forms a VREV16/VREV32/VREV64 pattern lowers to a single
      // (cost-factor-scaled) MVE instruction per legalized vector.
      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
1248
// Cost model for arithmetic instructions: special-cases Thumb i1 logic for
// code size, NEON vector division (table-driven), shifts that fold into a
// following instruction, and MVE vector ops (beat-count scaled).
InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
    // Make operations on i1 relatively expensive as this often involves
    // combining predicates. AND and XOR should be easier to handle with IT
    // blocks.
    switch (ISDOpcode) {
    default:
      break;
    case ISD::AND:
    case ISD::XOR:
      return 2;
    case ISD::OR:
      return 3;
    }
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The results is
  // also multiplied by the MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
    SmallVector<Type *> Tys(Args.size(), Ty);
    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
  }

  return BaseCost;
}
1390
// Cost model for plain loads/stores: penalizes NEON unaligned f64 vector
// accesses, recognises the MVE fpext(load)/store(fptrunc) half<->float
// patterns, and otherwise scales the base cost by the MVE beat factor.
InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    return LT.first * 4;
  }

  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
  // Same for stores.
  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
      ((Opcode == Instruction::Load && I->hasOneUse() &&
        isa<FPExtInst>(*I->user_begin())) ||
       (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
    FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
    // For a load, the converted type is that of the single fpext user; for a
    // store, it is the source operand of the fptrunc being stored.
    Type *DstTy =
        Opcode == Instruction::Load
            ? (*I->user_begin())->getType()
            : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
    if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
        DstTy->getScalarType()->isFloatTy())
      return ST->getMVEVectorCostFactor(CostKind);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                           CostKind, I);
}
1436
1437InstructionCost
1438ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1439 unsigned AddressSpace,
1440 TTI::TargetCostKind CostKind) {
1441 if (ST->hasMVEIntegerOps()) {
1442 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1443 return ST->getMVEVectorCostFactor(CostKind);
1444 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1445 return ST->getMVEVectorCostFactor(CostKind);
1446 }
1447 if (!isa<FixedVectorType>(Src))
1448 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1449 CostKind);
1450 // Scalar cost, which is currently very high due to the efficiency of the
1451 // generated code.
1452 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1453}
1454
// Cost model for interleaved (vldN/vstN-style) memory accesses.
InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN doesn't support vector types of i64/f64 element.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    // The per-group subvector that one vldN/vstN lane sequence covers.
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost =
        ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() &&
        DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
1495
1496InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1497 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1498 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1499 using namespace PatternMatch;
1500 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1501 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1502 Alignment, CostKind, I);
1503
1504 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!")(static_cast<void> (0));
1505 auto *VTy = cast<FixedVectorType>(DataTy);
1506
1507 // TODO: Splitting, once we do that.
1508
1509 unsigned NumElems = VTy->getNumElements();
1510 unsigned EltSize = VTy->getScalarSizeInBits();
1511 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1512
1513 // For now, it is assumed that for the MVE gather instructions the loads are
1514 // all effectively serialised. This means the cost is the scalar cost
1515 // multiplied by the number of elements being loaded. This is possibly very
1516 // conservative, but even so we still end up vectorising loops because the
1517 // cost per iteration for many loops is lower than for scalar loops.
1518 InstructionCost VectorCost =
1519 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1520 // The scalarization cost should be a lot higher. We use the number of vector
1521 // elements plus the scalarization overhead.
1522 InstructionCost ScalarCost =
1523 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1524 BaseT::getScalarizationOverhead(VTy, false, true);
1525
1526 if (EltSize < 8 || Alignment < EltSize / 8)
1527 return ScalarCost;
1528
1529 unsigned ExtSize = EltSize;
1530 // Check whether there's a single user that asks for an extended type
1531 if (I != nullptr) {
1532 // Dependent of the caller of this function, a gather instruction will
1533 // either have opcode Instruction::Load or be a call to the masked_gather
1534 // intrinsic
1535 if ((I->getOpcode() == Instruction::Load ||
1536 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1537 I->hasOneUse()) {
1538 const User *Us = *I->users().begin();
1539 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1540 // only allow valid type combinations
1541 unsigned TypeSize =
1542 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1543 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1544 (TypeSize == 16 && EltSize == 8)) &&
1545 TypeSize * NumElems == 128) {
1546 ExtSize = TypeSize;
1547 }
1548 }
1549 }
1550 // Check whether the input data needs to be truncated
1551 TruncInst *T;
1552 if ((I->getOpcode() == Instruction::Store ||
1553 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1554 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1555 // Only allow valid type combinations
1556 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1557 if (((EltSize == 16 && TypeSize == 32) ||
1558 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1559 TypeSize * NumElems == 128)
1560 ExtSize = TypeSize;
1561 }
1562 }
1563
1564 if (ExtSize * NumElems != 128 || NumElems < 4)
1565 return ScalarCost;
1566
1567 // Any (aligned) i32 gather will not need to be scalarised.
1568 if (ExtSize == 32)
1569 return VectorCost;
1570 // For smaller types, we need to ensure that the gep's inputs are correctly
1571 // extended from a small enough value. Other sizes (including i64) are
1572 // scalarized for now.
1573 if (ExtSize != 8 && ExtSize != 16)
1574 return ScalarCost;
1575
1576 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1577 Ptr = BC->getOperand(0);
1578 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1579 if (GEP->getNumOperands() != 2)
1580 return ScalarCost;
1581 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1582 // Scale needs to be correct (which is only relevant for i16s).
1583 if (Scale != 1 && Scale * 8 != ExtSize)
1584 return ScalarCost;
1585 // And we need to zext (not sext) the indexes from a small enough type.
1586 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1587 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1588 return VectorCost;
1589 }
1590 return ScalarCost;
1591 }
1592 return ScalarCost;
1593}
1594
1595InstructionCost
1596ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1597 Optional<FastMathFlags> FMF,
1598 TTI::TargetCostKind CostKind) {
1599 if (TTI::requiresOrderedReduction(FMF))
1600 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1601
1602 EVT ValVT = TLI->getValueType(DL, ValTy);
1603 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1604 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1605 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1606
1607 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1608
1609 static const CostTblEntry CostTblAdd[]{
1610 {ISD::ADD, MVT::v16i8, 1},
1611 {ISD::ADD, MVT::v8i16, 1},
1612 {ISD::ADD, MVT::v4i32, 1},
1613 };
1614 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1615 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1616
1617 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1618}
1619
1620InstructionCost
1621ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1622 Type *ResTy, VectorType *ValTy,
1623 TTI::TargetCostKind CostKind) {
1624 EVT ValVT = TLI->getValueType(DL, ValTy);
1625 EVT ResVT = TLI->getValueType(DL, ResTy);
1626
1627 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1628 std::pair<InstructionCost, MVT> LT =
1629 TLI->getTypeLegalizationCost(DL, ValTy);
1630
1631 // The legal cases are:
1632 // VADDV u/s 8/16/32
1633 // VMLAV u/s 8/16/32
1634 // VADDLV u/s 32
1635 // VMLALV u/s 16/32
1636 // Codegen currently cannot always handle larger than legal vectors very
1637 // well, especially for predicated reductions where the mask needs to be
1638 // split, so restrict to 128bit or smaller input types.
1639 unsigned RevVTSize = ResVT.getSizeInBits();
1640 if (ValVT.getSizeInBits() <= 128 &&
1641 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1642 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1643 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1644 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1645 }
1646
1647 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1648 CostKind);
1649}
1650
1651InstructionCost
1652ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1653 TTI::TargetCostKind CostKind) {
1654 switch (ICA.getID()) {
1655 case Intrinsic::get_active_lane_mask:
1656 // Currently we make a somewhat optimistic assumption that
1657 // active_lane_mask's are always free. In reality it may be freely folded
1658 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1659 // of add/icmp code. We may need to improve this in the future, but being
1660 // able to detect if it is free or not involves looking at a lot of other
1661 // code. We currently assume that the vectorizer inserted these, and knew
1662 // what it was doing in adding one.
1663 if (ST->hasMVEIntegerOps())
1664 return 0;
1665 break;
1666 case Intrinsic::sadd_sat:
1667 case Intrinsic::ssub_sat:
1668 case Intrinsic::uadd_sat:
1669 case Intrinsic::usub_sat: {
1670 if (!ST->hasMVEIntegerOps())
1671 break;
1672 Type *VT = ICA.getReturnType();
1673
1674 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1675 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1676 LT.second == MVT::v16i8) {
1677 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1678 // need to extend the type, as it uses shr(qadd(shl, shl)).
1679 unsigned Instrs =
1680 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1681 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1682 }
1683 break;
1684 }
1685 case Intrinsic::abs:
1686 case Intrinsic::smin:
1687 case Intrinsic::smax:
1688 case Intrinsic::umin:
1689 case Intrinsic::umax: {
1690 if (!ST->hasMVEIntegerOps())
1691 break;
1692 Type *VT = ICA.getReturnType();
1693
1694 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1695 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1696 LT.second == MVT::v16i8)
1697 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1698 break;
1699 }
1700 case Intrinsic::minnum:
1701 case Intrinsic::maxnum: {
1702 if (!ST->hasMVEFloatOps())
1703 break;
1704 Type *VT = ICA.getReturnType();
1705 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1706 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1707 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1708 break;
1709 }
1710 }
1711
1712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1713}
1714
1715bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1716 if (!F->isIntrinsic())
1717 BaseT::isLoweredToCall(F);
1718
1719 // Assume all Arm-specific intrinsics map to an instruction.
1720 if (F->getName().startswith("llvm.arm"))
1721 return false;
1722
1723 switch (F->getIntrinsicID()) {
1724 default: break;
1725 case Intrinsic::powi:
1726 case Intrinsic::sin:
1727 case Intrinsic::cos:
1728 case Intrinsic::pow:
1729 case Intrinsic::log:
1730 case Intrinsic::log10:
1731 case Intrinsic::log2:
1732 case Intrinsic::exp:
1733 case Intrinsic::exp2:
1734 return true;
1735 case Intrinsic::sqrt:
1736 case Intrinsic::fabs:
1737 case Intrinsic::copysign:
1738 case Intrinsic::floor:
1739 case Intrinsic::ceil:
1740 case Intrinsic::trunc:
1741 case Intrinsic::rint:
1742 case Intrinsic::nearbyint:
1743 case Intrinsic::round:
1744 case Intrinsic::canonicalize:
1745 case Intrinsic::lround:
1746 case Intrinsic::llround:
1747 case Intrinsic::lrint:
1748 case Intrinsic::llrint:
1749 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1750 return true;
1751 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1752 return true;
1753 // Some operations can be handled by vector instructions and assume
1754 // unsupported vectors will be expanded into supported scalar ones.
1755 // TODO Handle scalar operations properly.
1756 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1757 case Intrinsic::masked_store:
1758 case Intrinsic::masked_load:
1759 case Intrinsic::masked_gather:
1760 case Intrinsic::masked_scatter:
1761 return !ST->hasMVEIntegerOps();
1762 case Intrinsic::sadd_with_overflow:
1763 case Intrinsic::uadd_with_overflow:
1764 case Intrinsic::ssub_with_overflow:
1765 case Intrinsic::usub_with_overflow:
1766 case Intrinsic::sadd_sat:
1767 case Intrinsic::uadd_sat:
1768 case Intrinsic::ssub_sat:
1769 case Intrinsic::usub_sat:
1770 return false;
1771 }
1772
1773 return BaseT::isLoweredToCall(F);
1774}
1775
1776bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1777 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1778 EVT VT = TLI->getValueType(DL, I.getType(), true);
1779 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1780 return true;
1781
1782 // Check if an intrinsic will be lowered to a call and assume that any
1783 // other CallInst will generate a bl.
1784 if (auto *Call = dyn_cast<CallInst>(&I)) {
1785 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1786 switch(II->getIntrinsicID()) {
1787 case Intrinsic::memcpy:
1788 case Intrinsic::memset:
1789 case Intrinsic::memmove:
1790 return getNumMemOps(II) == -1;
1791 default:
1792 if (const Function *F = Call->getCalledFunction())
1793 return isLoweredToCall(F);
1794 }
1795 }
1796 return true;
1797 }
1798
1799 // FPv5 provides conversions between integer, double-precision,
1800 // single-precision, and half-precision formats.
1801 switch (I.getOpcode()) {
1802 default:
1803 break;
1804 case Instruction::FPToSI:
1805 case Instruction::FPToUI:
1806 case Instruction::SIToFP:
1807 case Instruction::UIToFP:
1808 case Instruction::FPTrunc:
1809 case Instruction::FPExt:
1810 return !ST->hasFPARMv8Base();
1811 }
1812
1813 // FIXME: Unfortunately the approach of checking the Operation Action does
1814 // not catch all cases of Legalization that use library calls. Our
1815 // Legalization step categorizes some transformations into library calls as
1816 // Custom, Expand or even Legal when doing type legalization. So for now
1817 // we have to special case for instance the SDIV of 64bit integers and the
1818 // use of floating point emulation.
1819 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1820 switch (ISD) {
1821 default:
1822 break;
1823 case ISD::SDIV:
1824 case ISD::UDIV:
1825 case ISD::SREM:
1826 case ISD::UREM:
1827 case ISD::SDIVREM:
1828 case ISD::UDIVREM:
1829 return true;
1830 }
1831 }
1832
1833 // Assume all other non-float operations are supported.
1834 if (!VT.isFloatingPoint())
1835 return false;
1836
1837 // We'll need a library call to handle most floats when using soft.
1838 if (TLI->useSoftFloat()) {
1839 switch (I.getOpcode()) {
1840 default:
1841 return true;
1842 case Instruction::Alloca:
1843 case Instruction::Load:
1844 case Instruction::Store:
1845 case Instruction::Select:
1846 case Instruction::PHI:
1847 return false;
1848 }
1849 }
1850
1851 // We'll need a libcall to perform double precision operations on a single
1852 // precision only FPU.
1853 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1854 return true;
1855
1856 // Likewise for half precision arithmetic.
1857 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1858 return true;
1859
1860 return false;
1861}
1862
1863bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1864 AssumptionCache &AC,
1865 TargetLibraryInfo *LibInfo,
1866 HardwareLoopInfo &HWLoopInfo) {
1867 // Low-overhead branches are only supported in the 'low-overhead branch'
1868 // extension of v8.1-m.
1869 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1870 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n")do { } while (false);
1871 return false;
1872 }
1873
1874 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1875 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n")do { } while (false);
1876 return false;
1877 }
1878
1879 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1880 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1881 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n")do { } while (false);
1882 return false;
1883 }
1884
1885 const SCEV *TripCountSCEV =
1886 SE.getAddExpr(BackedgeTakenCount,
1887 SE.getOne(BackedgeTakenCount->getType()));
1888
1889 // We need to store the trip count in LR, a 32-bit register.
1890 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1891 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n")do { } while (false);
1892 return false;
1893 }
1894
1895 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1896 // point in generating a hardware loop if that's going to happen.
1897
1898 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1899 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1900 switch (Call->getIntrinsicID()) {
1901 default:
1902 break;
1903 case Intrinsic::start_loop_iterations:
1904 case Intrinsic::test_start_loop_iterations:
1905 case Intrinsic::loop_decrement:
1906 case Intrinsic::loop_decrement_reg:
1907 return true;
1908 }
1909 }
1910 return false;
1911 };
1912
1913 // Scan the instructions to see if there's any that we know will turn into a
1914 // call or if this loop is already a low-overhead loop or will become a tail
1915 // predicated loop.
1916 bool IsTailPredLoop = false;
1917 auto ScanLoop = [&](Loop *L) {
1918 for (auto *BB : L->getBlocks()) {
1919 for (auto &I : *BB) {
1920 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1921 isa<InlineAsm>(I)) {
1922 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n")do { } while (false);
1923 return false;
1924 }
1925 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1926 IsTailPredLoop |=
1927 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1928 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1929 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1930 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1931 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1932 }
1933 }
1934 return true;
1935 };
1936
1937 // Visit inner loops.
1938 for (auto Inner : *L)
1939 if (!ScanLoop(Inner))
1940 return false;
1941
1942 if (!ScanLoop(L))
1943 return false;
1944
1945 // TODO: Check whether the trip count calculation is expensive. If L is the
1946 // inner loop but we know it has a low trip count, calculating that trip
1947 // count (in the parent loop) may be detrimental.
1948
1949 LLVMContext &C = L->getHeader()->getContext();
1950 HWLoopInfo.CounterInReg = true;
1951 HWLoopInfo.IsNestingLegal = false;
1952 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1953 HWLoopInfo.CountType = Type::getInt32Ty(C);
1954 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1955 return true;
1956}
1957
1958static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1959 // We don't allow icmp's, and because we only look at single block loops,
1960 // we simply count the icmps, i.e. there should only be 1 for the backedge.
1961 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1962 return false;
1963 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
1964 // not currently canonical, but soon will be. Code without them uses icmp, and
1965 // so is not tail predicated as per the condition above. In order to get the
1966 // same performance we treat min and max the same as an icmp for tailpred
1967 // purposes for the moment (we often rely on non-tailpred and higher VF's to
1968 // pick more optimial instructions like VQDMULH. They need to be recognized
1969 // directly by the vectorizer).
1970 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1971 if ((II->getIntrinsicID() == Intrinsic::smin ||
1972 II->getIntrinsicID() == Intrinsic::smax ||
1973 II->getIntrinsicID() == Intrinsic::umin ||
1974 II->getIntrinsicID() == Intrinsic::umax) &&
1975 ++ICmpCount > 1)
1976 return false;
1977
1978 if (isa<FCmpInst>(&I))
1979 return false;
1980
1981 // We could allow extending/narrowing FP loads/stores, but codegen is
1982 // too inefficient so reject this for now.
1983 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1984 return false;
1985
1986 // Extends have to be extending-loads
1987 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1988 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1989 return false;
1990
1991 // Truncs have to be narrowing-stores
1992 if (isa<TruncInst>(&I) )
1993 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1994 return false;
1995
1996 return true;
1997}
1998
1999// To set up a tail-predicated loop, we need to know the total number of
2000// elements processed by that loop. Thus, we need to determine the element
2001// size and:
2002// 1) it should be uniform for all operations in the vector loop, so we
2003// e.g. don't want any widening/narrowing operations.
2004// 2) it should be smaller than i64s because we don't have vector operations
2005// that work on i64s.
2006// 3) we don't want elements to be reversed or shuffled, to make sure the
2007// tail-predication masks/predicates the right lanes.
2008//
2009static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2010 const DataLayout &DL,
2011 const LoopAccessInfo *LAI) {
2012 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n")do { } while (false);
2013
2014 // If there are live-out values, it is probably a reduction. We can predicate
2015 // most reduction operations freely under MVE using a combination of
2016 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2017 // floating point and integer reductions, but don't check for operators
2018 // specifically here. If the value ends up not being a reduction (and so the
2019 // vectorizer cannot tailfold the loop), we should fall back to standard
2020 // vectorization automatically.
2021 SmallVector< Instruction *, 8 > LiveOuts;
2022 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2023 bool ReductionsDisabled =
2024 EnableTailPredication == TailPredication::EnabledNoReductions ||
2025 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2026
2027 for (auto *I : LiveOuts) {
2028 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2029 !I->getType()->isHalfTy()) {
2030 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "do { } while (false)
2031 "live-out value\n")do { } while (false);
2032 return false;
2033 }
2034 if (ReductionsDisabled) {
2035 LLVM_DEBUG(dbgs() << "Reductions not enabled\n")do { } while (false);
2036 return false;
2037 }
2038 }
2039
2040 // Next, check that all instructions can be tail-predicated.
2041 PredicatedScalarEvolution PSE = LAI->getPSE();
2042 SmallVector<Instruction *, 16> LoadStores;
2043 int ICmpCount = 0;
2044
2045 for (BasicBlock *BB : L->blocks()) {
2046 for (Instruction &I : BB->instructionsWithoutDebug()) {
2047 if (isa<PHINode>(&I))
2048 continue;
2049 if (!canTailPredicateInstruction(I, ICmpCount)) {
2050 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump())do { } while (false);
2051 return false;
2052 }
2053
2054 Type *T = I.getType();
2055 if (T->isPointerTy())
2056 T = T->getPointerElementType();
2057
2058 if (T->getScalarSizeInBits() > 32) {
2059 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump())do { } while (false);
2060 return false;
2061 }
2062 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2063 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2064 int64_t NextStride = getPtrStride(PSE, Ptr, L);
2065 if (NextStride == 1) {
2066 // TODO: for now only allow consecutive strides of 1. We could support
2067 // other strides as long as it is uniform, but let's keep it simple
2068 // for now.
2069 continue;
2070 } else if (NextStride == -1 ||
2071 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2072 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2073 LLVM_DEBUG(dbgs()do { } while (false)
2074 << "Consecutive strides of 2 found, vld2/vstr2 can't "do { } while (false)
2075 "be tail-predicated\n.")do { } while (false);
2076 return false;
2077 // TODO: don't tail predicate if there is a reversed load?
2078 } else if (EnableMaskedGatherScatters) {
2079 // Gather/scatters do allow loading from arbitrary strides, at
2080 // least if they are loop invariant.
2081 // TODO: Loop variant strides should in theory work, too, but
2082 // this requires further testing.
2083 const SCEV *PtrScev =
2084 replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2085 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2086 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2087 if (PSE.getSE()->isLoopInvariant(Step, L))
2088 continue;
2089 }
2090 }
2091 LLVM_DEBUG(dbgs() << "Bad stride found, can't "do { } while (false)
2092 "tail-predicate\n.")do { } while (false);
2093 return false;
2094 }
2095 }
2096 }
2097
2098 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n")do { } while (false);
2099 return true;
2100}
2101
2102bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2103 ScalarEvolution &SE,
2104 AssumptionCache &AC,
2105 TargetLibraryInfo *TLI,
2106 DominatorTree *DT,
2107 const LoopAccessInfo *LAI) {
2108 if (!EnableTailPredication) {
2109 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n")do { } while (false);
2110 return false;
2111 }
2112
2113 // Creating a predicated vector loop is the first step for generating a
2114 // tail-predicated hardware loop, for which we need the MVE masked
2115 // load/stores instructions:
2116 if (!ST->hasMVEIntegerOps())
2117 return false;
2118
2119 // For now, restrict this to single block loops.
2120 if (L->getNumBlocks() > 1) {
2121 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "do { } while (false)
2122 "loop.\n")do { } while (false);
2123 return false;
2124 }
2125
2126 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected")(static_cast<void> (0));
2127
2128 HardwareLoopInfo HWLoopInfo(L);
2129 if (!HWLoopInfo.canAnalyze(*LI)) {
2130 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2131 "analyzable.\n")do { } while (false);
2132 return false;
2133 }
2134
2135 // This checks if we have the low-overhead branch architecture
2136 // extension, and if we will create a hardware-loop:
2137 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2138 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2139 "profitable.\n")do { } while (false);
2140 return false;
2141 }
2142
2143 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2144 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "do { } while (false)
2145 "a candidate.\n")do { } while (false);
2146 return false;
2147 }
2148
2149 return canTailPredicateLoop(L, LI, SE, DL, LAI);
2150}
2151
2152bool ARMTTIImpl::emitGetActiveLaneMask() const {
2153 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2154 return false;
2155
2156 // Intrinsic @llvm.get.active.lane.mask is supported.
2157 // It is used in the MVETailPredication pass, which requires the number of
2158 // elements processed by this vector loop to setup the tail-predicated
2159 // loop.
2160 return true;
2161}
2162void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2163 TTI::UnrollingPreferences &UP,
2164 OptimizationRemarkEmitter *ORE) {
2165 // Enable Upper bound unrolling universally, not dependant upon the conditions
2166 // below.
2167 UP.UpperBound = true;
2168
2169 // Only currently enable these preferences for M-Class cores.
2170 if (!ST->isMClass())
2171 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2172
2173 // Disable loop unrolling for Oz and Os.
2174 UP.OptSizeThreshold = 0;
2175 UP.PartialOptSizeThreshold = 0;
2176 if (L->getHeader()->getParent()->hasOptSize())
2177 return;
2178
2179 SmallVector<BasicBlock*, 4> ExitingBlocks;
2180 L->getExitingBlocks(ExitingBlocks);
2181 LLVM_DEBUG(dbgs() << "Loop has:\n"do { } while (false)
2182 << "Blocks: " << L->getNumBlocks() << "\n"do { } while (false)
2183 << "Exit blocks: " << ExitingBlocks.size() << "\n")do { } while (false);
2184
2185 // Only allow another exit other than the latch. This acts as an early exit
2186 // as it mirrors the profitability calculation of the runtime unroller.
2187 if (ExitingBlocks.size() > 2)
2188 return;
2189
2190 // Limit the CFG of the loop body for targets with a branch predictor.
2191 // Allowing 4 blocks permits if-then-else diamonds in the body.
2192 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2193 return;
2194
2195 // Don't unroll vectorized loops, including the remainder loop
2196 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2197 return;
2198
2199 // Scan the loop: don't unroll loops with calls as this could prevent
2200 // inlining.
2201 InstructionCost Cost = 0;
2202 for (auto *BB : L->getBlocks()) {
2203 for (auto &I : *BB) {
2204 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2205 // scalar code.
2206 if (I.getType()->isVectorTy())
2207 return;
2208
2209 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2210 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2211 if (!isLoweredToCall(F))
2212 continue;
2213 }
2214 return;
2215 }
2216
2217 SmallVector<const Value*, 4> Operands(I.operand_values());
2218 Cost +=
2219 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2220 }
2221 }
2222
2223 // On v6m cores, there are very few registers available. We can easily end up
2224 // spilling and reloading more registers in an unrolled loop. Look at the
2225 // number of LCSSA phis as a rough measure of how many registers will need to
2226 // be live out of the loop, reducing the default unroll count if more than 1
2227 // value is needed. In the long run, all of this should be being learnt by a
2228 // machine.
2229 unsigned UnrollCount = 4;
2230 if (ST->isThumb1Only()) {
2231 unsigned ExitingValues = 0;
2232 SmallVector<BasicBlock *, 4> ExitBlocks;
2233 L->getExitBlocks(ExitBlocks);
2234 for (auto *Exit : ExitBlocks) {
2235 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2236 // only the last is expected to be needed for address operands.
2237 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2238 return PH.getNumOperands() != 1 ||
2239 !isa<GetElementPtrInst>(PH.getOperand(0));
2240 });
2241 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2242 }
2243 if (ExitingValues)
2244 UnrollCount /= ExitingValues;
2245 if (UnrollCount <= 1)
2246 return;
2247 }
2248
2249 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n")do { } while (false);
2250 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n")do { } while (false);
2251
2252 UP.Partial = true;
2253 UP.Runtime = true;
2254 UP.UnrollRemainder = true;
2255 UP.DefaultUnrollRuntimeCount = UnrollCount;
2256 UP.UnrollAndJam = true;
2257 UP.UnrollAndJamInnerLoopThreshold = 60;
2258
2259 // Force unrolling small loops can be very useful because of the branch
2260 // taken cost of the backedge.
2261 if (Cost < 12)
2262 UP.Force = true;
2263}
2264
2265void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2266 TTI::PeelingPreferences &PP) {
2267 BaseT::getPeelingPreferences(L, SE, PP);
2268}
2269
2270bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2271 TTI::ReductionFlags Flags) const {
2272 if (!ST->hasMVEIntegerOps())
2273 return false;
2274
2275 unsigned ScalarBits = Ty->getScalarSizeInBits();
2276 switch (Opcode) {
2277 case Instruction::Add:
2278 return ScalarBits <= 64;
2279 default:
2280 return false;
2281 }
2282}
2283
2284bool ARMTTIImpl::preferPredicatedReductionSelect(
2285 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2286 if (!ST->hasMVEIntegerOps())
2287 return false;
2288 return true;
2289}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h

1//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the ARM addressing mode implementation stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
14#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
15
16#include "llvm/ADT/APFloat.h"
17#include "llvm/ADT/APInt.h"
18#include "llvm/ADT/bit.h"
19#include "llvm/Support/ErrorHandling.h"
20#include "llvm/Support/MathExtras.h"
21#include <cassert>
22
23namespace llvm {
24
25/// ARM_AM - ARM Addressing Mode Stuff
26namespace ARM_AM {
27 enum ShiftOpc {
28 no_shift = 0,
29 asr,
30 lsl,
31 lsr,
32 ror,
33 rrx,
34 uxtw
35 };
36
37 enum AddrOpc {
38 sub = 0,
39 add
40 };
41
42 inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
43
44 inline const char *getShiftOpcStr(ShiftOpc Op) {
45 switch (Op) {
46 default: llvm_unreachable("Unknown shift opc!")__builtin_unreachable();
47 case ARM_AM::asr: return "asr";
48 case ARM_AM::lsl: return "lsl";
49 case ARM_AM::lsr: return "lsr";
50 case ARM_AM::ror: return "ror";
51 case ARM_AM::rrx: return "rrx";
52 case ARM_AM::uxtw: return "uxtw";
53 }
54 }
55
56 inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
57 switch (Op) {
58 default: llvm_unreachable("Unknown shift opc!")__builtin_unreachable();
59 case ARM_AM::asr: return 2;
60 case ARM_AM::lsl: return 0;
61 case ARM_AM::lsr: return 1;
62 case ARM_AM::ror: return 3;
63 }
64 }
65
66 enum AMSubMode {
67 bad_am_submode = 0,
68 ia,
69 ib,
70 da,
71 db
72 };
73
74 inline const char *getAMSubModeStr(AMSubMode Mode) {
75 switch (Mode) {
76 default: llvm_unreachable("Unknown addressing sub-mode!")__builtin_unreachable();
77 case ARM_AM::ia: return "ia";
78 case ARM_AM::ib: return "ib";
79 case ARM_AM::da: return "da";
80 case ARM_AM::db: return "db";
81 }
82 }
83
84 /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
85 ///
86 inline unsigned rotr32(unsigned Val, unsigned Amt) {
87 assert(Amt < 32 && "Invalid rotate amount")(static_cast<void> (0));
88 return (Val >> Amt) | (Val << ((32-Amt)&31));
89 }
90
91 /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
92 ///
93 inline unsigned rotl32(unsigned Val, unsigned Amt) {
94 assert(Amt < 32 && "Invalid rotate amount")(static_cast<void> (0));
95 return (Val << Amt) | (Val >> ((32-Amt)&31));
96 }
97
98 //===--------------------------------------------------------------------===//
99 // Addressing Mode #1: shift_operand with registers
100 //===--------------------------------------------------------------------===//
101 //
102 // This 'addressing mode' is used for arithmetic instructions. It can
103 // represent things like:
104 // reg
105 // reg [asr|lsl|lsr|ror|rrx] reg
106 // reg [asr|lsl|lsr|ror|rrx] imm
107 //
108 // This is stored three operands [rega, regb, opc]. The first is the base
109 // reg, the second is the shift amount (or reg0 if not present or imm). The
110 // third operand encodes the shift opcode and the imm if a reg isn't present.
111 //
// Pack a shifter operand: the ShiftOpc goes in bits [2:0], the immediate
// shift amount (or shift register number) in the bits above.
112 inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
113 return ShOp | (Imm << 3);
114 }
// Decode the shift amount / shift register field (bits [31:3]).
115 inline unsigned getSORegOffset(unsigned Op) { return Op >> 3; }
// Decode the shift opcode field (bits [2:0]).
116 inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); }
117
/// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
/// the 8-bit imm value (bits [7:0]).
inline unsigned getSOImmValImm(unsigned Imm) {
  return Imm & 0xFF;
}
/// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
/// the rotate amount. The field stores half the rotation, so double it.
inline unsigned getSOImmValRot(unsigned Imm) {
  const unsigned RotField = Imm >> 8;
  return RotField * 2;
}
124
125 /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
126 /// computing the rotate amount to use. If this immediate value cannot be
127 /// handled with a single shifter-op, determine a good rotate amount that will
128 /// take a maximal chunk of bits out of the immediate.
129 inline unsigned getSOImmValRotate(unsigned Imm) {
130 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
131 // of zero.
132 if ((Imm & ~255U) == 0) return 0;
133
134 // Use CTZ to compute the rotate amount.
135 unsigned TZ = countTrailingZeros(Imm);
136
137 // Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
138 // not 9.
139 unsigned RotAmt = TZ & ~1;
140
141 // If we can handle this spread, return it.
142 if ((rotr32(Imm, RotAmt) & ~255U) == 0)
143 return (32-RotAmt)&31; // HW rotates right, not left.
144
145 // For values like 0xF000000F, we should ignore the low 6 bits, then
146 // retry the hunt.
147 if (Imm & 63U) {
148 unsigned TZ2 = countTrailingZeros(Imm & ~63U);
149 unsigned RotAmt2 = TZ2 & ~1;
150 if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
151 return (32-RotAmt2)&31; // HW rotates right, not left.
152 }
153
154 // Otherwise, we have no way to cover this span of bits with a single
155 // shifter_op immediate. Return a chunk of bits that will be useful to
156 // handle.
157 return (32-RotAmt)&31; // HW rotates right, not left.
158 }
159
160 /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
161 /// into an shifter_operand immediate operand, return the 12-bit encoding for
162 /// it. If not, return -1.
163 inline int getSOImmVal(unsigned Arg) {
164 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
165 // of zero.
166 if ((Arg & ~255U) == 0) return Arg;
167
168 unsigned RotAmt = getSOImmValRotate(Arg);
169
170 // If this cannot be handled with a single shifter_op, bail out.
171 if (rotr32(~255U, RotAmt) & Arg)
172 return -1;
173
174 // Encode this correctly.
175 return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
176 }
177
178 /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
179 /// or'ing together two SOImmVal's.
180 inline bool isSOImmTwoPartVal(unsigned V) {
181 // If this can be handled with a single shifter_op, bail out.
182 V = rotr32(~255U, getSOImmValRotate(V)) & V;
183 if (V == 0)
184 return false;
185
186 // If this can be handled with two shifter_op's, accept.
187 V = rotr32(~255U, getSOImmValRotate(V)) & V;
188 return V == 0;
189 }
190
191 /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
192 /// return the first chunk of it.
193 inline unsigned getSOImmTwoPartFirst(unsigned V) {
194 return rotr32(255U, getSOImmValRotate(V)) & V;
195 }
196
197 /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
198 /// return the second chunk of it.
199 inline unsigned getSOImmTwoPartSecond(unsigned V) {
200 // Mask out the first hunk.
201 V = rotr32(~255U, getSOImmValRotate(V)) & V;
202
203 // Take what's left.
204 assert(V == (rotr32(255U, getSOImmValRotate(V)) & V))(static_cast<void> (0));
205 return V;
206 }
207
208 /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained
209 /// by two SOImmVal, that -V = First + Second.
210 /// "R+V" can be optimized to (sub (sub R, First), Second).
211 /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second).
212 inline bool isSOImmTwoPartValNeg(unsigned V) {
213 unsigned First;
214 if (!isSOImmTwoPartVal(-V))
215 return false;
216 // Return false if ~(-First) is not a SoImmval.
217 First = getSOImmTwoPartFirst(-V);
218 First = ~(-First);
219 return !(rotr32(~255U, getSOImmValRotate(First)) & First);
220 }
221
222 /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
223 /// by a left shift. Returns the shift amount to use.
224 inline unsigned getThumbImmValShift(unsigned Imm) {
225 // 8-bit (or less) immediates are trivially immediate operand with a shift
226 // of zero.
227 if ((Imm & ~255U) == 0) return 0;
18
Assuming the condition is false
19
Taking false branch
228
229 // Use CTZ to compute the shift amount.
230 return countTrailingZeros(Imm);
20
Calling 'countTrailingZeros<unsigned int>'
27
Returning from 'countTrailingZeros<unsigned int>'
28
Returning the value 32
231 }
232
233 /// isThumbImmShiftedVal - Return true if the specified value can be obtained
234 /// by left shifting a 8-bit immediate.
235 inline bool isThumbImmShiftedVal(unsigned V) {
236 // If this can be handled with
237 V = (~255U << getThumbImmValShift(V)) & V;
17
Calling 'getThumbImmValShift'
29
Returning from 'getThumbImmValShift'
30
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
238 return V == 0;
239 }
240
241 /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
242 /// by a left shift. Returns the shift amount to use.
243 inline unsigned getThumbImm16ValShift(unsigned Imm) {
244 // 16-bit (or less) immediates are trivially immediate operand with a shift
245 // of zero.
246 if ((Imm & ~65535U) == 0) return 0;
// NOTE: the early return above covers Imm == 0, so countTrailingZeros is
// only reached with a non-zero value and returns at most 31 here.
247
248 // Use CTZ to compute the shift amount.
249 return countTrailingZeros(Imm);
250 }
251
252 /// isThumbImm16ShiftedVal - Return true if the specified value can be
253 /// obtained by left shifting a 16-bit immediate.
254 inline bool isThumbImm16ShiftedVal(unsigned V) {
255 // If this can be handled with
256 V = (~65535U << getThumbImm16ValShift(V)) & V;
257 return V == 0;
258 }
259
260 /// getThumbImmNonShiftedVal - If V is a value that satisfies
261 /// isThumbImmShiftedVal, return the non-shiftd value.
262 inline unsigned getThumbImmNonShiftedVal(unsigned V) {
263 return V >> getThumbImmValShift(V);
264 }
265
266
267 /// getT2SOImmValSplat - Return the 12-bit encoded representation
268 /// if the specified value can be obtained by splatting the low 8 bits
269 /// into every other byte or every byte of a 32-bit value. i.e.,
270 /// 00000000 00000000 00000000 abcdefgh control = 0
271 /// 00000000 abcdefgh 00000000 abcdefgh control = 1
272 /// abcdefgh 00000000 abcdefgh 00000000 control = 2
273 /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
274 /// Return -1 if none of the above apply.
275 /// See ARM Reference Manual A6.3.2.
276 inline int getT2SOImmValSplatVal(unsigned V) {
277 unsigned u, Vs, Imm;
278 // control = 0
279 if ((V & 0xffffff00) == 0)
280 return V;
281
282 // If the value is zeroes in the first byte, just shift those off
283 Vs = ((V & 0xff) == 0) ? V >> 8 : V;
284 // Any passing value only has 8 bits of payload, splatted across the word
285 Imm = Vs & 0xff;
286 // Likewise, any passing values have the payload splatted into the 3rd byte
287 u = Imm | (Imm << 16);
288
289 // control = 1 or 2
290 if (Vs == u)
291 return (((Vs == V) ? 1 : 2) << 8) | Imm;
292
293 // control = 3
294 if (Vs == (u | (u << 8)))
295 return (3 << 8) | Imm;
296
297 return -1;
298 }
299
300 /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
301 /// specified value is a rotated 8-bit value. Return -1 if no rotation
302 /// encoding is possible.
303 /// See ARM Reference Manual A6.3.2.
304 inline int getT2SOImmValRotateVal(unsigned V) {
305 unsigned RotAmt = countLeadingZeros(V);
306 if (RotAmt >= 24)
307 return -1;
308
309 // If 'Arg' can be handled with a single shifter_op return the value.
310 if ((rotr32(0xff000000U, RotAmt) & V) == V)
311 return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
312
313 return -1;
314 }
315
316 /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
317 /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
318 /// encoding for it. If not, return -1.
319 /// See ARM Reference Manual A6.3.2.
320 inline int getT2SOImmVal(unsigned Arg) {
321 // If 'Arg' is an 8-bit splat, then get the encoded value.
322 int Splat = getT2SOImmValSplatVal(Arg);
323 if (Splat != -1)
324 return Splat;
325
326 // If 'Arg' can be handled with a single shifter_op return the value.
327 int Rot = getT2SOImmValRotateVal(Arg);
328 if (Rot != -1)
329 return Rot;
330
331 return -1;
332 }
333
334 inline unsigned getT2SOImmValRotate(unsigned V) {
335 if ((V & ~255U) == 0) return 0;
336 // Use CTZ to compute the rotate amount.
337 unsigned RotAmt = countTrailingZeros(V);
338 return (32 - RotAmt) & 31;
339 }
340
341 inline bool isT2SOImmTwoPartVal(unsigned Imm) {
342 unsigned V = Imm;
343 // Passing values can be any combination of splat values and shifter
344 // values. If this can be handled with a single shifter or splat, bail
345 // out. Those should be handled directly, not with a two-part val.
346 if (getT2SOImmValSplatVal(V) != -1)
347 return false;
348 V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
349 if (V == 0)
350 return false;
351
352 // If this can be handled as an immediate, accept.
353 if (getT2SOImmVal(V) != -1) return true;
354
355 // Likewise, try masking out a splat value first.
356 V = Imm;
357 if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
358 V &= ~0xff00ff00U;
359 else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
360 V &= ~0x00ff00ffU;
361 // If what's left can be handled as an immediate, accept.
362 if (getT2SOImmVal(V) != -1) return true;
363
364 // Otherwise, do not accept.
365 return false;
366 }
367
368 inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
369 assert (isT2SOImmTwoPartVal(Imm) &&(static_cast<void> (0))
370 "Immedate cannot be encoded as two part immediate!")(static_cast<void> (0));
371 // Try a shifter operand as one part
372 unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
373 // If the rest is encodable as an immediate, then return it.
374 if (getT2SOImmVal(V) != -1) return V;
375
376 // Try masking out a splat value first.
377 if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
378 return Imm & 0xff00ff00U;
379
380 // The other splat is all that's left as an option.
381 assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1)(static_cast<void> (0));
382 return Imm & 0x00ff00ffU;
383 }
384
385 inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
386 // Mask out the first hunk
387 Imm ^= getT2SOImmTwoPartFirst(Imm);
388 // Return what's left
389 assert (getT2SOImmVal(Imm) != -1 &&(static_cast<void> (0))
390 "Unable to encode second part of T2 two part SO immediate")(static_cast<void> (0));
391 return Imm;
392 }
393
394
395 //===--------------------------------------------------------------------===//
396 // Addressing Mode #2
397 //===--------------------------------------------------------------------===//
398 //
399 // This is used for most simple load/store instructions.
400 //
401 // addrmode2 := reg +/- reg shop imm
402 // addrmode2 := reg +/- imm12
403 //
404 // The first operand is always a Reg. The second operand is a reg if in
405 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
406 // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
407 // fourth operand 16-17 encodes the index mode.
408 //
409 // If this addressing mode is a frame index (before prolog/epilog insertion
410 // and code rewriting), this operand will have the form: FI#, reg0, <offs>
411 // with no shift amount for the frame offset.
412 //
413 inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
414 unsigned IdxMode = 0) {
415 assert(Imm12 < (1 << 12) && "Imm too large!")(static_cast<void> (0));
416 bool isSub = Opc == sub;
417 return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
418 }
// Decode the 12-bit immediate offset (bits [11:0]) from an AM2 opc word.
419 inline unsigned getAM2Offset(unsigned AM2Opc) {
420 return AM2Opc & ((1 << 12)-1);
421 }
// Decode the add/sub flag stored in bit 12.
422 inline AddrOpc getAM2Op(unsigned AM2Opc) {
423 return ((AM2Opc >> 12) & 1) ? sub : add;
424 }
// Decode the shift opcode stored in bits [15:13].
425 inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
426 return (ShiftOpc)((AM2Opc >> 13) & 7);
427 }
// Decode the index mode stored in bits [17:16].
428 inline unsigned getAM2IdxMode(unsigned AM2Opc) { return (AM2Opc >> 16); }
429
430 //===--------------------------------------------------------------------===//
431 // Addressing Mode #3
432 //===--------------------------------------------------------------------===//
433 //
434 // This is used for sign-extending loads, and load/store-pair instructions.
435 //
436 // addrmode3 := reg +/- reg
437 // addrmode3 := reg +/- imm8
438 //
439 // The first operand is always a Reg. The second operand is a reg if in
440 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
441 // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
442 // index mode.
443
444 /// getAM3Opc - This function encodes the addrmode3 opc field.
445 inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
446 unsigned IdxMode = 0) {
447 bool isSub = Opc == sub;
448 return ((int)isSub << 8) | Offset | (IdxMode << 9);
449 }
450 inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; }
451 inline AddrOpc getAM3Op(unsigned AM3Opc) {
452 return ((AM3Opc >> 8) & 1) ? sub : add;
453 }
454 inline unsigned getAM3IdxMode(unsigned AM3Opc) { return (AM3Opc >> 9); }
455
456 //===--------------------------------------------------------------------===//
457 // Addressing Mode #4
458 //===--------------------------------------------------------------------===//
459 //
460 // This is used for load / store multiple instructions.
461 //
462 // addrmode4 := reg, <mode>
463 //
464 // The four modes are:
465 // IA - Increment after
466 // IB - Increment before
467 // DA - Decrement after
468 // DB - Decrement before
469 // For VFP instructions, only the IA and DB modes are valid.
470
471 inline AMSubMode getAM4SubMode(unsigned Mode) {
472 return (AMSubMode)(Mode & 0x7);
473 }
474
475 inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; }
476
477 //===--------------------------------------------------------------------===//
478 // Addressing Mode #5
479 //===--------------------------------------------------------------------===//
480 //
481 // This is used for coprocessor instructions, such as FP load/stores.
482 //
483 // addrmode5 := reg +/- imm8*4
484 //
485 // The first operand is always a Reg. The second operand encodes the
486 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
487
488 /// getAM5Opc - This function encodes the addrmode5 opc field.
489 inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
490 bool isSub = Opc == sub;
491 return ((int)isSub << 8) | Offset;
492 }
493 inline unsigned char getAM5Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; }
494 inline AddrOpc getAM5Op(unsigned AM5Opc) {
495 return ((AM5Opc >> 8) & 1) ? sub : add;
496 }
497
498 //===--------------------------------------------------------------------===//
499 // Addressing Mode #5 FP16
500 //===--------------------------------------------------------------------===//
501 //
502 // This is used for coprocessor instructions, such as 16-bit FP load/stores.
503 //
504 // addrmode5fp16 := reg +/- imm8*2
505 //
506 // The first operand is always a Reg. The second operand encodes the
507 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
508
509 /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
510 inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
511 bool isSub = Opc == sub;
512 return ((int)isSub << 8) | Offset;
513 }
514 inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
515 return AM5Opc & 0xFF;
516 }
517 inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
518 return ((AM5Opc >> 8) & 1) ? sub : add;
519 }
520
521 //===--------------------------------------------------------------------===//
522 // Addressing Mode #6
523 //===--------------------------------------------------------------------===//
524 //
525 // This is used for NEON load / store instructions.
526 //
527 // addrmode6 := reg with optional alignment
528 //
529 // This is stored in two operands [regaddr, align]. The first is the
530 // address register. The second operand is the value of the alignment
531 // specifier in bytes or zero if no explicit alignment.
532 // Valid alignments depend on the specific instruction.
533
534 //===--------------------------------------------------------------------===//
535 // NEON/MVE Modified Immediates
536 //===--------------------------------------------------------------------===//
537 //
538 // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate"
539 // vector operand, where a small immediate encoded in the instruction
540 // specifies a full NEON vector value. These modified immediates are
541 // represented here as encoded integers. The low 8 bits hold the immediate
542 // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
543 // the "Cmode" field of the instruction. The interfaces below treat the
544 // Op and Cmode values as a single 5-bit value.
545
// Pack a VMOV modified-immediate: 5-bit Op/Cmode field in bits [12:8],
// 8-bit immediate value in bits [7:0].
546 inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
547 return (OpCmode << 8) | Val;
548 }
// Extract the combined 5-bit Op/Cmode field.
549 inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
550 return (ModImm >> 8) & 0x1f;
551 }
// Extract the 8-bit immediate payload.
552 inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }
553
554 /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
555 /// element value and the element size in bits. (If the element size is
556 /// smaller than the vector, it is splatted into all the elements.)
557 inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
558 unsigned OpCmode = getVMOVModImmOpCmode(ModImm);
559 unsigned Imm8 = getVMOVModImmVal(ModImm);
560 uint64_t Val = 0;
561
562 if (OpCmode == 0xe) {
563 // 8-bit vector elements
564 Val = Imm8;
565 EltBits = 8;
566 } else if ((OpCmode & 0xc) == 0x8) {
567 // 16-bit vector elements
568 unsigned ByteNum = (OpCmode & 0x6) >> 1;
569 Val = Imm8 << (8 * ByteNum);
570 EltBits = 16;
571 } else if ((OpCmode & 0x8) == 0) {
572 // 32-bit vector elements, zero with one byte set
573 unsigned ByteNum = (OpCmode & 0x6) >> 1;
574 Val = Imm8 << (8 * ByteNum);
575 EltBits = 32;
576 } else if ((OpCmode & 0xe) == 0xc) {
577 // 32-bit vector elements, one byte with low bits set
578 unsigned ByteNum = 1 + (OpCmode & 0x1);
579 Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
580 EltBits = 32;
581 } else if (OpCmode == 0x1e) {
582 // 64-bit vector elements
583 for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
584 if ((ModImm >> ByteNum) & 1)
585 Val |= (uint64_t)0xff << (8 * ByteNum);
586 }
587 EltBits = 64;
588 } else {
589 llvm_unreachable("Unsupported VMOV immediate")__builtin_unreachable();
590 }
591 return Val;
592 }
593
// Generic validation for single-byte immediate (0X00, 00X0, etc): true iff
// exactly one of the low 'Size' bytes of Value is non-zero.
inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
  assert(Size >= 1 && Size <= 4 && "Invalid size");
  unsigned NonZeroBytes = 0;
  for (unsigned Byte = 0; Byte != Size; ++Byte, Value >>= 8)
    if (Value & 0xff)
      ++NonZeroBytes;
  return NonZeroBytes == 1;
}
604
605 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
606 inline bool isNEONi16splat(unsigned Value) {
607 if (Value > 0xffff)
608 return false;
609 // i16 value with set bits only in one byte X0 or 0X.
610 return Value == 0 || isNEONBytesplat(Value, 2);
611 }
612
613 // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
614 inline unsigned encodeNEONi16splat(unsigned Value) {
615 assert(isNEONi16splat(Value) && "Invalid NEON splat value")(static_cast<void> (0));
616 if (Value >= 0x100)
617 Value = (Value >> 8) | 0xa00;
618 else
619 Value |= 0x800;
620 return Value;
621 }
622
623 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
624 inline bool isNEONi32splat(unsigned Value) {
625 // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
626 return Value == 0 || isNEONBytesplat(Value, 4);
627 }
628
629 /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
630 inline unsigned encodeNEONi32splat(unsigned Value) {
631 assert(isNEONi32splat(Value) && "Invalid NEON splat value")(static_cast<void> (0));
632 if (Value >= 0x100 && Value <= 0xff00)
633 Value = (Value >> 8) | 0x200;
634 else if (Value > 0xffff && Value <= 0xff0000)
635 Value = (Value >> 16) | 0x400;
636 else if (Value > 0xffffff)
637 Value = (Value >> 24) | 0x600;
638 return Value;
639 }
640
641 //===--------------------------------------------------------------------===//
642 // Floating-point Immediates
643 //
644 inline float getFPImmFloat(unsigned Imm) {
645 // We expect an 8-bit binary encoding of a floating-point number here.
646
647 uint8_t Sign = (Imm >> 7) & 0x1;
648 uint8_t Exp = (Imm >> 4) & 0x7;
649 uint8_t Mantissa = Imm & 0xf;
650
651 // 8-bit FP IEEE Float Encoding
652 // abcd efgh aBbbbbbc defgh000 00000000 00000000
653 //
654 // where B = NOT(b);
655 uint32_t I = 0;
656 I |= Sign << 31;
657 I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
658 I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
659 I |= (Exp & 0x3) << 23;
660 I |= Mantissa << 19;
661 return bit_cast<float>(I);
662 }
663
664 /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
665 /// floating-point value. If the value cannot be represented as an 8-bit
666 /// floating-point value, then return -1.
667 inline int getFP16Imm(const APInt &Imm) {
668 uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
669 int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
670 int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
671
672 // We can handle 4 bits of mantissa.
673 // mantissa = (16+UInt(e:f:g:h))/16.
674 if (Mantissa & 0x3f)
675 return -1;
676 Mantissa >>= 6;
677
678 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
679 if (Exp < -3 || Exp > 4)
680 return -1;
681 Exp = ((Exp+3) & 0x7) ^ 4;
682
683 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
684 }
685
686 inline int getFP16Imm(const APFloat &FPImm) {
687 return getFP16Imm(FPImm.bitcastToAPInt());
688 }
689
690 /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding
691 /// for it. Otherwise return -1 like getFP16Imm.
692 inline int getFP32FP16Imm(const APInt &Imm) {
693 if (Imm.getActiveBits() > 16)
694 return -1;
695 return ARM_AM::getFP16Imm(Imm.trunc(16));
696 }
697
698 inline int getFP32FP16Imm(const APFloat &FPImm) {
699 return getFP32FP16Imm(FPImm.bitcastToAPInt());
700 }
701
702 /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
703 /// floating-point value. If the value cannot be represented as an 8-bit
704 /// floating-point value, then return -1.
705 inline int getFP32Imm(const APInt &Imm) {
706 uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
707 int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
708 int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
709
710 // We can handle 4 bits of mantissa.
711 // mantissa = (16+UInt(e:f:g:h))/16.
712 if (Mantissa & 0x7ffff)
713 return -1;
714 Mantissa >>= 19;
715 if ((Mantissa & 0xf) != Mantissa)
716 return -1;
717
718 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
719 if (Exp < -3 || Exp > 4)
720 return -1;
721 Exp = ((Exp+3) & 0x7) ^ 4;
722
723 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
724 }
725
726 inline int getFP32Imm(const APFloat &FPImm) {
727 return getFP32Imm(FPImm.bitcastToAPInt());
728 }
729
730 /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
731 /// floating-point value. If the value cannot be represented as an 8-bit
732 /// floating-point value, then return -1.
733 inline int getFP64Imm(const APInt &Imm) {
734 uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
735 int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
736 uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
737
738 // We can handle 4 bits of mantissa.
739 // mantissa = (16+UInt(e:f:g:h))/16.
740 if (Mantissa & 0xffffffffffffULL)
741 return -1;
742 Mantissa >>= 48;
743 if ((Mantissa & 0xf) != Mantissa)
744 return -1;
745
746 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
747 if (Exp < -3 || Exp > 4)
748 return -1;
749 Exp = ((Exp+3) & 0x7) ^ 4;
750
751 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
752 }
753
754 inline int getFP64Imm(const APFloat &FPImm) {
755 return getFP64Imm(FPImm.bitcastToAPInt());
756 }
757
758} // end namespace ARM_AM
759} // end namespace llvm
760
761#endif
762

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB
21.1
'ZB' is not equal to ZB_Undefined
21.1
'ZB' is not equal to ZB_Undefined
21.1
'ZB' is not equal to ZB_Undefined
!= ZB_Undefined && Val == 0)
22
Assuming 'Val' is equal to 0
23
Taking true branch
117 return 32;
24
Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
21
Calling 'TrailingZerosCounter::count'
25
Returning from 'TrailingZerosCounter::count'
26
Returning the value 32
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
/// Create a bitmask with the N right-most bits set to 1, and all other
/// bits set to 0.  Only unsigned types are allowed.
template <typename T> T maskTrailingOnes(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");
  // Shifting by Bits (when N == 0) would be undefined behavior, so the
  // zero-width mask is special-cased.
  return N == 0 ? 0 : (T(-1) >> (Bits - N));
}

/// Create a bitmask with the N left-most bits set to 1, and all other
/// bits set to 0.  Only unsigned types are allowed.
template <typename T> T maskLeadingOnes(unsigned N) {
  // The complement of the (Bits - N) trailing ones is exactly N leading ones.
  return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

/// Create a bitmask with the N right-most bits set to 0, and all other
/// bits set to 1.  Only unsigned types are allowed.
template <typename T> T maskTrailingZeros(unsigned N) {
  return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

/// Create a bitmask with the N left-most bits set to 0, and all other
/// bits set to 1.  Only unsigned types are allowed.
template <typename T> T maskLeadingZeros(unsigned N) {
  return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
/// Macro compressed bit reversal table for 256 bits.
///
/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
static const unsigned char BitReverseTable256[256] = {
#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
    R6(0), R6(2), R6(1), R6(3)
#undef R2
#undef R4
#undef R6
};

/// Reverse the bits in \p Val.
///
/// Generic implementation: reverse the bits within each byte via the lookup
/// table, and simultaneously reverse the byte order, which together reverse
/// the whole value.
template <typename T>
T reverseBits(T Val) {
  unsigned char in[sizeof(Val)];
  unsigned char out[sizeof(Val)];
  std::memcpy(in, &Val, sizeof(Val));
  for (unsigned i = 0; i < sizeof(Val); ++i)
    out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
  std::memcpy(&Val, out, sizeof(Val));
  return Val;
}

// Prefer the compiler's bit-reversal builtins, where available, over the
// table-driven generic implementation above.
#if __has_builtin(__builtin_bitreverse8)
template <>
inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
  return __builtin_bitreverse8(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse16)
template <>
inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
  return __builtin_bitreverse16(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse32)
template <>
inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
  return __builtin_bitreverse32(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse64)
template <>
inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
  return __builtin_bitreverse64(Val);
}
#endif

// NOTE: The following support functions use the _32/_64 extensions instead of
// type overloading so that signed and unsigned integers can be used without
// ambiguity.

/// Return the high 32 bits of a 64 bit value.
constexpr inline uint32_t Hi_32(uint64_t Value) {
  return static_cast<uint32_t>(Value >> 32);
}

/// Return the low 32 bits of a 64 bit value.
constexpr inline uint32_t Lo_32(uint64_t Value) {
  return static_cast<uint32_t>(Value);
}

/// Make a 64-bit integer from a high / low pair of 32-bit integers.
constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
  return ((uint64_t)High << 32) | (uint64_t)Low;
}

362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)1L<<(N-1)) <= x && x < (INT64_C(1)1L<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1)1UL << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1)1UL << S) == 0);
425}
426
/// Gets the maximum value for a N-bit unsigned integer.
inline uint64_t maxUIntN(uint64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // uint64_t(1) << 64 is undefined behavior, so we can't do
  //   (uint64_t(1) << N) - 1
  // without checking first that N != 64. But this works and doesn't have a
  // branch.
  return UINT64_MAX >> (64 - N);
}

/// Gets the minimum value for a N-bit signed integer.
inline int64_t minIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // -(2^(N-1)) computed as 1 + ~(2^(N-1)) to avoid signed-overflow UB.
  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
}

/// Gets the maximum value for a N-bit signed integer.
inline int64_t maxIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // This relies on two's complement wraparound when N == 64, so we convert to
  // int64_t only at the very end to avoid UB.
  return (UINT64_C(1) << (N - 1)) - 1;
}

/// Checks if an unsigned integer fits into the given (dynamic) bit width.
inline bool isUIntN(unsigned N, uint64_t x) {
  return N >= 64 || x <= maxUIntN(N);
}

/// Checks if an signed integer fits into the given (dynamic) bit width.
inline bool isIntN(unsigned N, int64_t x) {
  return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (32 bit version).
/// Ex. isMask_32(0x0000FFFFU) == true.
constexpr inline bool isMask_32(uint32_t Value) {
  // Adding 1 to a low-bit mask carries into a single higher bit, which the
  // AND then clears; any other non-zero pattern keeps some bit set.
  return Value && ((Value + 1) & Value) == 0;
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (64 bit version).
constexpr inline bool isMask_64(uint64_t Value) {
  return Value && ((Value + 1) & Value) == 0;
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
constexpr inline bool isShiftedMask_32(uint32_t Value) {
  // (Value - 1) | Value fills in the trailing zeros; the result is a
  // low-bit mask exactly when the original ones were contiguous.
  return Value && isMask_32((Value - 1) | Value);
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (64 bit version.)
constexpr inline bool isShiftedMask_64(uint64_t Value) {
  return Value && isMask_64((Value - 1) | Value);
}

/// Return true if the argument is a power of two > 0.
/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
constexpr inline bool isPowerOf2_32(uint32_t Value) {
  return Value && !(Value & (Value - 1));
}

/// Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr inline bool isPowerOf2_64(uint64_t Value) {
  return Value && !(Value & (Value - 1));
}

500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__4)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__4)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
/// Return the log base 2 of the specified value.
inline double Log2(double Value) {
#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
  // Old Android NDKs lack log2(); synthesize it from the natural log.
  return __builtin_log(Value) / __builtin_log(2.0);
#else
  return log2(Value);
#endif
}

593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
/// Return the greatest common divisor of the values using Euclid's algorithm.
template <typename T>
inline T greatestCommonDivisor(T A, T B) {
  // Classic iterative Euclid: replace (A, B) with (B, A mod B) until B == 0.
  while (B) {
    T Tmp = B;
    B = A % B;
    A = Tmp;
  }
  return A;
}

/// Convenience wrapper for the 64-bit unsigned case.
inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
  return greatestCommonDivisor<uint64_t>(A, B);
}

/// This function takes a 64-bit integer and returns the bit equivalent double.
inline double BitsToDouble(uint64_t Bits) {
  // memcpy is the portable, strict-aliasing-safe way to type-pun.
  double D;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&D, &Bits, sizeof(Bits));
  return D;
}

/// This function takes a 32-bit integer and returns the bit equivalent float.
inline float BitsToFloat(uint32_t Bits) {
  float F;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&F, &Bits, sizeof(Bits));
  return F;
}

/// This function takes a double and returns the bit equivalent 64-bit integer.
/// Note that copying doubles around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint64_t DoubleToBits(double Double) {
  uint64_t Bits;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&Bits, &Double, sizeof(Double));
  return Bits;
}

/// This function takes a float and returns the bit equivalent 32-bit integer.
/// Note that copying floats around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint32_t FloatToBits(float Float) {
  uint32_t Bits;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&Bits, &Float, sizeof(Float));
  return Bits;
}

/// A and B are either alignments or offsets. Return the minimum alignment that
/// may be assumed after adding the two together.
constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
  // The largest power of 2 that divides both A and B.
  //
  // Replace "-Value" by "1+~Value" in the following commented code to avoid
  // MSVC warning C4146
  //    return (A | B) & -(A | B);
  return (A | B) & (1 + ~(A | B));
}

/// Returns the next power of two (in 64-bits) that is strictly greater than A.
/// Returns zero on overflow.
inline uint64_t NextPowerOf2(uint64_t A) {
  // Smear the highest set bit into every lower position, then add one.
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);
  return A + 1;
}

693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// If non-zero \p Skew is specified, the return value will be a minimal
/// integer that is greater than or equal to \p Value and equal to
/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
///   alignTo(5, 8) = 8
///   alignTo(17, 8) = 24
///   alignTo(~0LL, 8) = 0
///   alignTo(321, 255) = 510
///
///   alignTo(5, 8, 7) = 7
///   alignTo(17, 8, 1) = 17
///   alignTo(~0LL, 8, 3) = 3
///   alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  // Round (Value - Skew) up to a multiple of Align, then re-apply the skew.
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  return (Value + Align - 1) / Align * Align;
}

741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
/// Returns the integer nearest(Numerator / Denominator).
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  // Adding half the denominator before the truncating divide rounds to
  // nearest (ties round up).
  return (Numerator + (Denominator / 2)) / Denominator;
}

/// Returns the largest uint64_t less than or equal to \p Value and is
/// \p Skew mod \p Align. \p Align must be non-zero
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew;
}

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  // Shift the B-bit field to the top, then arithmetic-shift it back down,
  // replicating its sign bit through the upper bits.
  return int32_t(X << (32 - B)) >> (32 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  return int32_t(X << (32 - B)) >> (32 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  return int64_t(x << (64 - B)) >> (64 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  return int64_t(X << (64 - B)) >> (64 - B);
}

/// Subtract two unsigned integers, X and Y, of type T and return the absolute
/// value of the result.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  return X > Y ? (X - Y) : (Y - X);
}

/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
  // Hacker's Delight, p. 29: unsigned addition wrapped around iff the sum is
  // smaller than either operand.
  T Z = X + Y;
  Overflowed = (Z < X || Z < Y);
  if (Overflowed)
    return std::numeric_limits<T>::max();
  else
    return Z;
}

/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
  // because it fails for uint16_t (where multiplication can have undefined
  // behavior due to promotion to int), and requires a division in addition
  // to the multiplication.

  Overflowed = false;

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  int Log2Z = Log2_64(X) + Log2_64(Y);
  const T Max = std::numeric_limits<T>::max();
  int Log2Max = Log2_64(Max);
  if (Log2Z < Log2Max) {
    // The product cannot reach the top bit, so it cannot overflow.
    return X * Y;
  }
  if (Log2Z > Log2Max) {
    Overflowed = true;
    return Max;
  }

  // We're going to use the top bit, and maybe overflow one
  // bit past it. Multiply all but the bottom bit then add
  // that on at the end.
  T Z = (X >> 1) * Y;
  if (Z & ~(Max >> 1)) {
    Overflowed = true;
    return Max;
  }
  Z <<= 1;
  if (X & 1)
    return SaturatingAdd(Z, Y, ResultOverflowed);

  return Z;
}

/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  T Product = SaturatingMultiply(X, Y, &Overflowed);
  // If the multiply already saturated, adding A cannot improve the result.
  if (Overflowed)
    return Product;

  return SaturatingAdd(A, Product, &Overflowed);
}

/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf;

/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occured.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the unsigned addition: unsigned arithmetic wraps, so this never
  // hits the UB that signed overflow would.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  // Mixed signs can never overflow.
  return false;
#endif
}

/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow ocurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the subtraction in the unsigned domain, where wraparound is
  // well-defined.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a negative results in a negative number.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a positive results in a positive number.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  // Same-sign operands can never overflow on subtraction.
  return false;
#endif
}

/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow ocurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values; 0 - U computes
  // the absolute value without the UB of negating INT_MIN.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}

960} // End llvm namespace
961
962#endif