Bug Summary

File: llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
Warning: line 237, column 16
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
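
The rule behind the warning is the usual C++ one: left-shifting a value by an amount greater than or equal to the bit width of its (promoted) type is undefined behaviour, so any path on which a 32-bit unsigned value can be shifted by 32 is reportable. A minimal, self-contained sketch of the rule and of the usual defensive guard (the function names are made up for illustration; this is not the code in ARMAddressingModes.h):

    #include <cstdint>

    // Undefined behaviour whenever Shift >= 32: the left operand is a 32-bit
    // unsigned value, so the shift amount must stay in [0, 31].
    uint32_t shiftUnchecked(uint32_t Mask, unsigned Shift) {
      return Mask << Shift;
    }

    // Defensive variant: treat a full-width shift as "all bits shifted out".
    uint32_t shiftChecked(uint32_t Mask, unsigned Shift) {
      return Shift >= 32 ? 0u : Mask << Shift;
    }

The annotated path below shows how the analyzer believes a shift amount of 32 can reach line 237 of the header.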

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/ARM -I /build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM -I include -I /build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-command-line-argument -Wno-unknown-warning-option -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/build-llvm -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-10-15-103323-21151-1 -x c++ /build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/Instruction.h"
22#include "llvm/IR/Instructions.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/IntrinsicsARM.h"
26#include "llvm/IR/PatternMatch.h"
27#include "llvm/IR/Type.h"
28#include "llvm/MC/SubtargetFeature.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/MachineValueType.h"
32#include "llvm/Target/TargetMachine.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include <algorithm>
37#include <cassert>
38#include <cstdint>
39#include <utility>
40
41using namespace llvm;
42
43#define DEBUG_TYPE "armtti"
44
45static cl::opt<bool> EnableMaskedLoadStores(
46 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47 cl::desc("Enable the generation of masked loads and stores"));
48
49static cl::opt<bool> DisableLowOverheadLoops(
50 "disable-arm-loloops", cl::Hidden, cl::init(false),
51 cl::desc("Disable the generation of low-overhead loops"));
52
53static cl::opt<bool>
54 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55 cl::desc("Enable the generation of WLS loops"));
56
57extern cl::opt<TailPredication::Mode> EnableTailPredication;
58
59extern cl::opt<bool> EnableMaskedGatherScatters;
60
61extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62
63/// Convert a vector load intrinsic into a simple llvm load instruction.
64/// This is beneficial when the underlying object being addressed comes
65/// from a constant, since we get constant-folding for free.
66static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67 InstCombiner::BuilderTy &Builder) {
68 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69
70 if (!IntrAlign)
71 return nullptr;
72
73 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74 ? MemAlign
75 : IntrAlign->getLimitedValue();
76
77 if (!isPowerOf2_32(Alignment))
78 return nullptr;
79
80 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81 PointerType::get(II.getType(), 0));
82 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83}
84
85bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86 const Function *Callee) const {
87 const TargetMachine &TM = getTLI()->getTargetMachine();
88 const FeatureBitset &CallerBits =
89 TM.getSubtargetImpl(*Caller)->getFeatureBits();
90 const FeatureBitset &CalleeBits =
91 TM.getSubtargetImpl(*Callee)->getFeatureBits();
92
93 // To inline a callee, all features not in the allowed list must match exactly.
94 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95 (CalleeBits & ~InlineFeaturesAllowed);
96 // For features in the allowed list, the callee's features must be a subset of
97 // the callers'.
98 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99 (CalleeBits & InlineFeaturesAllowed);
100 return MatchExact && MatchSubset;
101}
102
103TTI::AddressingModeKind
104ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105 ScalarEvolution *SE) const {
106 if (ST->hasMVEIntegerOps())
107 return TTI::AMK_PostIndexed;
108
109 if (L->getHeader()->getParent()->hasOptSize())
110 return TTI::AMK_None;
111
112 if (ST->isMClass() && ST->isThumb2() &&
113 L->getNumBlocks() == 1)
114 return TTI::AMK_PreIndexed;
115
116 return TTI::AMK_None;
117}
118
119Optional<Instruction *>
120ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121 using namespace PatternMatch;
122 Intrinsic::ID IID = II.getIntrinsicID();
123 switch (IID) {
124 default:
125 break;
126 case Intrinsic::arm_neon_vld1: {
127 Align MemAlign =
128 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129 &IC.getAssumptionCache(), &IC.getDominatorTree());
130 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131 return IC.replaceInstUsesWith(II, V);
132 }
133 break;
134 }
135
136 case Intrinsic::arm_neon_vld2:
137 case Intrinsic::arm_neon_vld3:
138 case Intrinsic::arm_neon_vld4:
139 case Intrinsic::arm_neon_vld2lane:
140 case Intrinsic::arm_neon_vld3lane:
141 case Intrinsic::arm_neon_vld4lane:
142 case Intrinsic::arm_neon_vst1:
143 case Intrinsic::arm_neon_vst2:
144 case Intrinsic::arm_neon_vst3:
145 case Intrinsic::arm_neon_vst4:
146 case Intrinsic::arm_neon_vst2lane:
147 case Intrinsic::arm_neon_vst3lane:
148 case Intrinsic::arm_neon_vst4lane: {
149 Align MemAlign =
150 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151 &IC.getAssumptionCache(), &IC.getDominatorTree());
152 unsigned AlignArg = II.arg_size() - 1;
153 Value *AlignArgOp = II.getArgOperand(AlignArg);
154 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155 if (Align && *Align < MemAlign) {
156 return IC.replaceOperand(
157 II, AlignArg,
158 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159 false));
160 }
161 break;
162 }
163
164 case Intrinsic::arm_mve_pred_i2v: {
165 Value *Arg = II.getArgOperand(0);
166 Value *ArgArg;
167 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168 PatternMatch::m_Value(ArgArg))) &&
169 II.getType() == ArgArg->getType()) {
170 return IC.replaceInstUsesWith(II, ArgArg);
171 }
172 Constant *XorMask;
173 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174 PatternMatch::m_Value(ArgArg)),
175 PatternMatch::m_Constant(XorMask))) &&
176 II.getType() == ArgArg->getType()) {
177 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178 if (CI->getValue().trunc(16).isAllOnes()) {
179 auto TrueVector = IC.Builder.CreateVectorSplat(
180 cast<FixedVectorType>(II.getType())->getNumElements(),
181 IC.Builder.getTrue());
182 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183 }
184 }
185 }
186 KnownBits ScalarKnown(32);
187 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188 ScalarKnown, 0)) {
189 return &II;
190 }
191 break;
192 }
193 case Intrinsic::arm_mve_pred_v2i: {
194 Value *Arg = II.getArgOperand(0);
195 Value *ArgArg;
196 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197 PatternMatch::m_Value(ArgArg)))) {
198 return IC.replaceInstUsesWith(II, ArgArg);
199 }
200 if (!II.getMetadata(LLVMContext::MD_range)) {
201 Type *IntTy32 = Type::getInt32Ty(II.getContext());
202 Metadata *M[] = {
203 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206 return &II;
207 }
208 break;
209 }
210 case Intrinsic::arm_mve_vadc:
211 case Intrinsic::arm_mve_vadc_predicated: {
212 unsigned CarryOp =
213 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215 "Bad type for intrinsic!");
216
217 KnownBits CarryKnown(32);
218 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219 CarryKnown)) {
220 return &II;
221 }
222 break;
223 }
224 case Intrinsic::arm_mve_vmldava: {
225 Instruction *I = cast<Instruction>(&II);
226 if (I->hasOneUse()) {
227 auto *User = cast<Instruction>(*I->user_begin());
228 Value *OpZ;
229 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230 match(I->getOperand(3), m_Zero())) {
231 Value *OpX = I->getOperand(4);
232 Value *OpY = I->getOperand(5);
233 Type *OpTy = OpX->getType();
234
235 IC.Builder.SetInsertPoint(User);
236 Value *V =
237 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238 {I->getOperand(0), I->getOperand(1),
239 I->getOperand(2), OpZ, OpX, OpY});
240
241 IC.replaceInstUsesWith(*User, V);
242 return IC.eraseInstFromFunction(*User);
243 }
244 }
245 return None;
246 }
247 }
248 return None;
249}
250
251Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254 std::function<void(Instruction *, unsigned, APInt, APInt &)>
255 SimplifyAndSetOp) const {
256
257 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
258 // opcode specifying a Top/Bottom instruction, which can change between
259 // instructions.
260 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
261 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263
264 // Only the odd/even lanes of operand 0 will be demanded, depending
265 // on whether this is a top/bottom instruction.
266 APInt DemandedElts =
267 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268 : APInt::getHighBitsSet(2, 1));
269 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270 // The other lanes will be defined from the inserted elements.
271 UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272 : APInt::getHighBitsSet(2, 1));
273 return None;
274 };
275
276 switch (II.getIntrinsicID()) {
277 default:
278 break;
279 case Intrinsic::arm_mve_vcvt_narrow:
280 SimplifyNarrowInstrTopBottom(2);
281 break;
282 case Intrinsic::arm_mve_vqmovn:
283 SimplifyNarrowInstrTopBottom(4);
284 break;
285 case Intrinsic::arm_mve_vshrn:
286 SimplifyNarrowInstrTopBottom(7);
287 break;
288 }
289
290 return None;
291}
292
293InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294 TTI::TargetCostKind CostKind) {
295 assert(Ty->isIntegerTy());
13
'?' condition is true
296
297 unsigned Bits = Ty->getPrimitiveSizeInBits();
298 if (Bits == 0 || Imm.getActiveBits() >= 64)
14
Assuming 'Bits' is not equal to 0
15
Assuming the condition is false
16
Taking false branch
299 return 4;
300
301 int64_t SImmVal = Imm.getSExtValue();
302 uint64_t ZImmVal = Imm.getZExtValue();
303 if (!ST->isThumb()) {
17
Assuming the condition is false
18
Taking false branch
304 if ((SImmVal >= 0 && SImmVal < 65536) ||
305 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307 return 1;
308 return ST->hasV6T2Ops() ? 2 : 3;
309 }
310 if (ST->isThumb2()) {
19
Assuming the condition is false
311 if ((SImmVal >= 0 && SImmVal < 65536) ||
312 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314 return 1;
315 return ST->hasV6T2Ops() ? 2 : 3;
316 }
317 // Thumb1, any i8 imm cost 1.
318 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
20
Assuming 'Bits' is not equal to 8
20.1
'SImmVal' is >= 0
21
Assuming 'SImmVal' is >= 256
319 return 1;
320 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
22
Assuming the condition is false
23
Calling 'isThumbImmShiftedVal'
321 return 2;
322 // Load from constantpool.
323 return 3;
324}
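
The path above ends at step 23, "Calling 'isThumbImmShiftedVal'": this is where execution leaves ARMTargetTransformInfo.cpp (line 320) and enters MCTargetDesc/ARMAddressingModes.h, whose line 237 holds the shift flagged in the bug summary. A sketch of the shape of that computation, paraphrased under the assumption that the shift amount comes from a count-trailing-zeros style helper that returns the full bit width for a zero input (names and details here are illustrative, not quoted from the header):

    #include <cstdint>

    // Hypothetical stand-in for a count-trailing-zeros helper whose documented
    // behaviour is to return 32 for a zero input.
    static unsigned ctz32(uint32_t V) {
      unsigned N = 0;
      while (N < 32 && !(V & (1u << N)))
        ++N;
      return N; // 32 when V == 0
    }

    // Shape of the flagged expression: if the analyzer cannot prove V != 0 on
    // this path, ctz32(V) may be 32 and '~255u << 32' is the undefined shift.
    static bool fitsShiftedByteImm(uint32_t V) {
      return ((~255u << ctz32(V)) & V) == 0;
    }

Whether a zero (or otherwise problematic) value can actually reach the shift depends on invariants the analyzer may not track (step 21 already assumed 'SImmVal' is >= 256, for instance), so this may well be a false positive; even so, clamping the shift amount or returning early for small values would silence the report.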
325
326// Constants smaller than 256 fit in the immediate field of
327// Thumb1 instructions so we return a zero cost and 1 otherwise.
328InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329 const APInt &Imm, Type *Ty) {
330 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331 return 0;
332
333 return 1;
334}
335
336// Checks whether Inst is part of a min(max()) or max(min()) pattern
337// that will match to an SSAT instruction
338static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
339 Value *LHS, *RHS;
340 ConstantInt *C;
341 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
342
343 if (InstSPF == SPF_SMAX &&
344 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
345 C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
346
347 auto isSSatMin = [&](Value *MinInst) {
348 if (isa<SelectInst>(MinInst)) {
349 Value *MinLHS, *MinRHS;
350 ConstantInt *MinC;
351 SelectPatternFlavor MinSPF =
352 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
353 if (MinSPF == SPF_SMIN &&
354 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
355 MinC->getValue() == ((-Imm) - 1))
356 return true;
357 }
358 return false;
359 };
360
361 if (isSSatMin(Inst->getOperand(1)) ||
362 (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
363 isSSatMin(*(++Inst->user_begin())))))
364 return true;
365 }
366 return false;
367}
368
369InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
370 const APInt &Imm, Type *Ty,
371 TTI::TargetCostKind CostKind,
372 Instruction *Inst) {
373 // Division by a constant can be turned into multiplication, but only if we
374 // know it's constant. So it's not so much that the immediate is cheap (it's
375 // not), but that the alternative is worse.
376 // FIXME: this is probably unneeded with GlobalISel.
377 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
1
Assuming 'Opcode' is not equal to SDiv
2
Assuming 'Opcode' is not equal to UDiv
378 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
3
Assuming 'Opcode' is not equal to SRem
4
Assuming 'Opcode' is not equal to URem
379 Idx == 1)
380 return 0;
381
382 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
383 // splitting any large offsets.
384 if (Opcode == Instruction::GetElementPtr && Idx != 0)
5
Assuming 'Opcode' is not equal to GetElementPtr
385 return 0;
386
387 if (Opcode == Instruction::And) {
6
Assuming 'Opcode' is not equal to And
7
Taking false branch
388 // UXTB/UXTH
389 if (Imm == 255 || Imm == 65535)
390 return 0;
391 // Conversion to BIC is free, and means we can use ~Imm instead.
392 return std::min(getIntImmCost(Imm, Ty, CostKind),
393 getIntImmCost(~Imm, Ty, CostKind));
394 }
395
396 if (Opcode == Instruction::Add)
8
Assuming 'Opcode' is not equal to Add
397 // Conversion to SUB is free, and means we can use -Imm instead.
398 return std::min(getIntImmCost(Imm, Ty, CostKind),
399 getIntImmCost(-Imm, Ty, CostKind));
400
401 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
9
Assuming 'Opcode' is not equal to ICmp
402 Ty->getIntegerBitWidth() == 32) {
403 int64_t NegImm = -Imm.getSExtValue();
404 if (ST->isThumb2() && NegImm < 1<<12)
405 // icmp X, #-C -> cmn X, #C
406 return 0;
407 if (ST->isThumb() && NegImm < 1<<8)
408 // icmp X, #-C -> adds X, #C
409 return 0;
410 }
411
412 // xor a, -1 can always be folded to MVN
413 if (Opcode == Instruction::Xor && Imm.isAllOnes())
10
Assuming 'Opcode' is equal to Xor
414 return 0;
415
416 // Ensures negative constant of min(max()) or max(min()) patterns that
417 // match to SSAT instructions don't get hoisted
418 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
11
Assuming 'Inst' is null
419 Ty->getIntegerBitWidth() <= 32) {
420 if (isSSATMinMaxPattern(Inst, Imm) ||
421 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
422 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
423 return 0;
424 }
425
426 // We can convert <= -1 to < 0, which is generally quite cheap.
427 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
11.1
'Inst' is null
428 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
429 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
430 return std::min(getIntImmCost(Imm, Ty, CostKind),
431 getIntImmCost(Imm + 1, Ty, CostKind));
432 }
433
434 return getIntImmCost(Imm, Ty, CostKind);
12
Calling 'ARMTTIImpl::getIntImmCost'
435}
436
437InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
438 TTI::TargetCostKind CostKind,
439 const Instruction *I) {
440 if (CostKind == TTI::TCK_RecipThroughput &&
441 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
442 // FIXME: The vectorizer is highly sensitive to the cost of these
443 // instructions, which suggests that it may be using the costs incorrectly.
444 // But, for now, just make them free to avoid performance regressions for
445 // vector targets.
446 return 0;
447 }
448 return BaseT::getCFInstrCost(Opcode, CostKind, I);
449}
450
451InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
452 Type *Src,
453 TTI::CastContextHint CCH,
454 TTI::TargetCostKind CostKind,
455 const Instruction *I) {
456 int ISD = TLI->InstructionOpcodeToISD(Opcode);
457 assert(ISD && "Invalid opcode");
458
459 // TODO: Allow non-throughput costs that aren't binary.
460 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
461 if (CostKind != TTI::TCK_RecipThroughput)
462 return Cost == 0 ? 0 : 1;
463 return Cost;
464 };
465 auto IsLegalFPType = [this](EVT VT) {
466 EVT EltVT = VT.getScalarType();
467 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
468 (EltVT == MVT::f64 && ST->hasFP64()) ||
469 (EltVT == MVT::f16 && ST->hasFullFP16());
470 };
471
472 EVT SrcTy = TLI->getValueType(DL, Src);
473 EVT DstTy = TLI->getValueType(DL, Dst);
474
475 if (!SrcTy.isSimple() || !DstTy.isSimple())
476 return AdjustCost(
477 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
478
479 // Extending masked load/Truncating masked stores is expensive because we
480 // currently don't split them. This means that we'll likely end up
481 // loading/storing each element individually (hence the high cost).
482 if ((ST->hasMVEIntegerOps() &&
483 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
484 Opcode == Instruction::SExt)) ||
485 (ST->hasMVEFloatOps() &&
486 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
487 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
488 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
489 return 2 * DstTy.getVectorNumElements() *
490 ST->getMVEVectorCostFactor(CostKind);
491
492 // The extend of other kinds of load is free
493 if (CCH == TTI::CastContextHint::Normal ||
494 CCH == TTI::CastContextHint::Masked) {
495 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
496 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
497 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
498 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
499 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
500 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
501 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
502 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
503 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
504 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
505 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
506 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
507 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
508 };
509 if (const auto *Entry = ConvertCostTableLookup(
510 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
511 return AdjustCost(Entry->Cost);
512
513 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
514 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
515 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
516 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
517 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
518 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
519 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
520 // The following extend from a legal type to an illegal type, so need to
521 // split the load. This introduced an extra load operation, but the
522 // extend is still "free".
523 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
524 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
525 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
526 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
527 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
528 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
529 };
530 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
531 if (const auto *Entry =
532 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
533 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
534 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
535 }
536
537 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
538 // FPExtends are similar but also require the VCVT instructions.
539 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
540 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
541 };
542 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
543 if (const auto *Entry =
544 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
545 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
546 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
547 }
548
549 // The truncate of a store is free. This is the mirror of extends above.
550 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
551 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
552 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
553 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
554 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
555 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
556 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
557 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
558 };
559 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
560 if (const auto *Entry =
561 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
562 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
563 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
564 }
565
566 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
567 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
568 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
569 };
570 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
571 if (const auto *Entry =
572 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
573 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
574 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
575 }
576 }
577
578 // NEON vector operations that can extend their inputs.
579 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
580 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
581 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
582 // vaddl
583 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
584 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
585 // vsubl
586 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
587 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
588 // vmull
589 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
590 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
591 // vshll
592 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
593 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
594 };
595
596 auto *User = cast<Instruction>(*I->user_begin());
597 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
598 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
599 DstTy.getSimpleVT(),
600 SrcTy.getSimpleVT())) {
601 return AdjustCost(Entry->Cost);
602 }
603 }
604
605 // Single to/from double precision conversions.
606 if (Src->isVectorTy() && ST->hasNEON() &&
607 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
608 DstTy.getScalarType() == MVT::f32) ||
609 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
610 DstTy.getScalarType() == MVT::f64))) {
611 static const CostTblEntry NEONFltDblTbl[] = {
612 // Vector fptrunc/fpext conversions.
613 {ISD::FP_ROUND, MVT::v2f64, 2},
614 {ISD::FP_EXTEND, MVT::v2f32, 2},
615 {ISD::FP_EXTEND, MVT::v4f32, 4}};
616
617 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
618 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
619 return AdjustCost(LT.first * Entry->Cost);
620 }
621
622 // Some arithmetic, load and store operations have specific instructions
623 // to cast up/down their types automatically at no extra cost.
624 // TODO: Get these tables to know at least what the related operations are.
625 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
626 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
627 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
628 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
629 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
630 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
631 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
632
633 // The number of vmovl instructions for the extension.
634 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
635 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
636 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
637 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
638 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
639 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
640 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
641 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
642 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
643 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
644 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
645 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
646 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
647 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
648 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
649 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
650 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
651 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
652
653 // Operations that we legalize using splitting.
654 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
655 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
656
657 // Vector float <-> i32 conversions.
658 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
659 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
660
661 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
662 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
663 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
664 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
665 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
666 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
667 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
668 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
669 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
670 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
671 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
672 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
673 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
674 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
675 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
676 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
677 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
678 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
679 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
680 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
681
682 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
683 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
684 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
685 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
686 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
687 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
688
689 // Vector double <-> i32 conversions.
690 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
691 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
692
693 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
694 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
695 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
696 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
697 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
698 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
699
700 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
701 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
702 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
703 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
704 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
705 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
706 };
707
708 if (SrcTy.isVector() && ST->hasNEON()) {
709 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
710 DstTy.getSimpleVT(),
711 SrcTy.getSimpleVT()))
712 return AdjustCost(Entry->Cost);
713 }
714
715 // Scalar float to integer conversions.
716 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
717 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
718 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
719 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
720 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
721 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
722 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
723 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
724 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
725 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
726 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
727 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
728 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
729 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
730 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
731 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
732 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
733 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
734 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
735 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
736 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
737 };
738 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
739 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
740 DstTy.getSimpleVT(),
741 SrcTy.getSimpleVT()))
742 return AdjustCost(Entry->Cost);
743 }
744
745 // Scalar integer to float conversions.
746 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
747 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
748 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
749 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
750 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
751 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
752 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
753 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
754 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
755 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
756 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
757 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
758 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
759 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
760 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
761 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
762 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
763 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
764 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
765 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
766 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
767 };
768
769 if (SrcTy.isInteger() && ST->hasNEON()) {
770 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
771 ISD, DstTy.getSimpleVT(),
772 SrcTy.getSimpleVT()))
773 return AdjustCost(Entry->Cost);
774 }
775
776 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
777 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
778 // are linearised so take more.
779 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
780 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
781 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
782 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
783 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
784 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
785 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
786 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
787 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
788 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
789 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
790 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
791 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
792 };
793
794 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
795 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
796 ISD, DstTy.getSimpleVT(),
797 SrcTy.getSimpleVT()))
798 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
799 }
800
801 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
802 // As a general rule, fp converts that were not matched above are scalarized
803 // and cost 1 vcvt for each lane, so long as the instruction is available.
804 // If not it will become a series of function calls.
805 const InstructionCost CallCost =
806 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
807 int Lanes = 1;
808 if (SrcTy.isFixedLengthVector())
809 Lanes = SrcTy.getVectorNumElements();
810
811 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
812 return Lanes;
813 else
814 return Lanes * CallCost;
815 }
816
817 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
818 SrcTy.isFixedLengthVector()) {
819 // Treat a truncate with larger than legal source (128bits for MVE) as
820 // expensive, 2 instructions per lane.
821 if ((SrcTy.getScalarType() == MVT::i8 ||
822 SrcTy.getScalarType() == MVT::i16 ||
823 SrcTy.getScalarType() == MVT::i32) &&
824 SrcTy.getSizeInBits() > 128 &&
825 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
826 return SrcTy.getVectorNumElements() * 2;
827 }
828
829 // Scalar integer conversion costs.
830 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
831 // i16 -> i64 requires two dependent operations.
832 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
833
834 // Truncates on i64 are assumed to be free.
835 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
836 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
837 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
838 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
839 };
840
841 if (SrcTy.isInteger()) {
842 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
843 DstTy.getSimpleVT(),
844 SrcTy.getSimpleVT()))
845 return AdjustCost(Entry->Cost);
846 }
847
848 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
849 ? ST->getMVEVectorCostFactor(CostKind)
850 : 1;
851 return AdjustCost(
852 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
853}
854
855InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
856 unsigned Index) {
857 // Penalize inserting into a D-subregister. We end up with a three times
858 // lower estimated throughput on swift.
859 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
860 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
861 return 3;
862
863 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
864 Opcode == Instruction::ExtractElement)) {
865 // Cross-class copies are expensive on many microarchitectures,
866 // so assume they are expensive by default.
867 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
868 return 3;
869
870 // Even if it's not a cross class copy, this likely leads to mixing
871 // of NEON and VFP code and should be therefore penalized.
872 if (ValTy->isVectorTy() &&
873 ValTy->getScalarSizeInBits() <= 32)
874 return std::max<InstructionCost>(
875 BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
876 }
877
878 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
879 Opcode == Instruction::ExtractElement)) {
880 // Integer cross-lane moves are more expensive than float, which can
881 // sometimes just be vmovs. Integers involve being passed to GPR registers,
882 // causing more of a delay.
883 std::pair<InstructionCost, MVT> LT =
884 getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
885 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
886 }
887
888 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
889}
890
891InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
892 Type *CondTy,
893 CmpInst::Predicate VecPred,
894 TTI::TargetCostKind CostKind,
895 const Instruction *I) {
896 int ISD = TLI->InstructionOpcodeToISD(Opcode);
897
898 // Thumb scalar code size cost for select.
899 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
900 ST->isThumb() && !ValTy->isVectorTy()) {
901 // Assume expensive structs.
902 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
903 return TTI::TCC_Expensive;
904
905 // Select costs can vary because they:
906 // - may require one or more conditional mov (including an IT),
907 // - can't operate directly on immediates,
908 // - require live flags, which we can't copy around easily.
909 InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
910
911 // Possible IT instruction for Thumb2, or more for Thumb1.
912 ++Cost;
913
914 // i1 values may need rematerialising by using mov immediates and/or
915 // flag setting instructions.
916 if (ValTy->isIntegerTy(1))
917 ++Cost;
918
919 return Cost;
920 }
921
922 // If this is a vector min/max/abs, use the cost of that intrinsic directly
923 // instead. Hopefully when min/max intrinsics are more prevalent this code
924 // will not be needed.
925 const Instruction *Sel = I;
926 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
927 Sel->hasOneUse())
928 Sel = cast<Instruction>(Sel->user_back());
929 if (Sel && ValTy->isVectorTy() &&
930 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
931 const Value *LHS, *RHS;
932 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
933 unsigned IID = 0;
934 switch (SPF) {
935 case SPF_ABS:
936 IID = Intrinsic::abs;
937 break;
938 case SPF_SMIN:
939 IID = Intrinsic::smin;
940 break;
941 case SPF_SMAX:
942 IID = Intrinsic::smax;
943 break;
944 case SPF_UMIN:
945 IID = Intrinsic::umin;
946 break;
947 case SPF_UMAX:
948 IID = Intrinsic::umax;
949 break;
950 case SPF_FMINNUM:
951 IID = Intrinsic::minnum;
952 break;
953 case SPF_FMAXNUM:
954 IID = Intrinsic::maxnum;
955 break;
956 default:
957 break;
958 }
959 if (IID) {
960 // The ICmp is free, the select gets the cost of the min/max/etc
961 if (Sel != I)
962 return 0;
963 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
964 return getIntrinsicInstrCost(CostAttrs, CostKind);
965 }
966 }
967
968 // On NEON a vector select gets lowered to vbsl.
969 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
970 // Lowering of some vector selects is currently far from perfect.
971 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
972 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
973 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
974 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
975 };
976
977 EVT SelCondTy = TLI->getValueType(DL, CondTy);
978 EVT SelValTy = TLI->getValueType(DL, ValTy);
979 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
980 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
981 SelCondTy.getSimpleVT(),
982 SelValTy.getSimpleVT()))
983 return Entry->Cost;
984 }
985
986 std::pair<InstructionCost, MVT> LT =
987 TLI->getTypeLegalizationCost(DL, ValTy);
988 return LT.first;
989 }
990
991 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
992 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
993 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
994 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
995 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
996 if (!VecCondTy)
997 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
998
999 // If we don't have mve.fp any fp operations will need to be scalarized.
1000 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1001 // One scalarization insert, one scalarization extract and the cost of the
1002 // fcmps.
1003 return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1004 BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1005 VecValTy->getNumElements() *
1006 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1007 VecCondTy->getScalarType(), VecPred, CostKind,
1008 I);
1009 }
1010
1011 std::pair<InstructionCost, MVT> LT =
1012 TLI->getTypeLegalizationCost(DL, ValTy);
1013 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1014 // There are two types - the input that specifies the type of the compare
1015 // and the output vXi1 type. Because we don't know how the output will be
1016 // split, we may need an expensive shuffle to get two in sync. This has the
1017 // effect of making larger than legal compares (v8i32 for example)
1018 // expensive.
1019 if (LT.second.getVectorNumElements() > 2) {
1020 if (LT.first > 1)
1021 return LT.first * BaseCost +
1022 BaseT::getScalarizationOverhead(VecCondTy, true, false);
1023 return BaseCost;
1024 }
1025 }
1026
1027 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1028 // for "multiple beats" potentially needed by MVE instructions.
1029 int BaseCost = 1;
1030 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1031 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1032
1033 return BaseCost *
1034 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1035}
1036
1037InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1038 ScalarEvolution *SE,
1039 const SCEV *Ptr) {
1040 // Address computations in vectorized code with non-consecutive addresses will
1041 // likely result in more instructions compared to scalar code where the
1042 // computation can more often be merged into the index mode. The resulting
1043 // extra micro-ops can significantly decrease throughput.
1044 unsigned NumVectorInstToHideOverhead = 10;
1045 int MaxMergeDistance = 64;
1046
1047 if (ST->hasNEON()) {
1048 if (Ty->isVectorTy() && SE &&
1049 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1050 return NumVectorInstToHideOverhead;
1051
1052 // In many cases the address computation is not merged into the instruction
1053 // addressing mode.
1054 return 1;
1055 }
1056 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1057}
1058
1059bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1060 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1061 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1062 // optimized, else LSR may block tail-predication.
1063 switch (II->getIntrinsicID()) {
1064 case Intrinsic::arm_mve_vctp8:
1065 case Intrinsic::arm_mve_vctp16:
1066 case Intrinsic::arm_mve_vctp32:
1067 case Intrinsic::arm_mve_vctp64:
1068 return true;
1069 default:
1070 break;
1071 }
1072 }
1073 return false;
1074}
1075
1076bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1077 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1078 return false;
1079
1080 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1081 // Don't support v2i1 yet.
1082 if (VecTy->getNumElements() == 2)
1083 return false;
1084
1085 // We don't support extending fp types.
1086 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1087 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1088 return false;
1089 }
1090
1091 unsigned EltWidth = DataTy->getScalarSizeInBits();
1092 return (EltWidth == 32 && Alignment >= 4) ||
1093 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1094}
1095
1096bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1097 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1098 return false;
1099
1100 // This method is called in 2 places:
1101 // - from the vectorizer with a scalar type, in which case we need to get
1102 // this as good as we can with the limited info we have (and rely on the cost
1103 // model for the rest).
1104 // - from the masked intrinsic lowering pass with the actual vector type.
1105 // For MVE, we have a custom lowering pass that will already have custom
1106 // legalised any gathers that we can to MVE intrinsics, and want to expand all
1107 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1108 // are here, we know we want to expand.
1109 if (isa<VectorType>(Ty))
1110 return false;
1111
1112 unsigned EltWidth = Ty->getScalarSizeInBits();
1113 return ((EltWidth == 32 && Alignment >= 4) ||
1114 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1115}
1116
1117/// Given a memcpy/memset/memmove instruction, return the number of memory
1118/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1119/// call is used.
1120int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1121 MemOp MOp;
1122 unsigned DstAddrSpace = ~0u;
1123 unsigned SrcAddrSpace = ~0u;
1124 const Function *F = I->getParent()->getParent();
1125
1126 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1127 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1128 // If 'size' is not a constant, a library call will be generated.
1129 if (!C)
1130 return -1;
1131
1132 const unsigned Size = C->getValue().getZExtValue();
1133 const Align DstAlign = *MC->getDestAlign();
1134 const Align SrcAlign = *MC->getSourceAlign();
1135
1136 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1137 /*IsVolatile*/ false);
1138 DstAddrSpace = MC->getDestAddressSpace();
1139 SrcAddrSpace = MC->getSourceAddressSpace();
1140 }
1141 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1142 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1143 // If 'size' is not a constant, a library call will be generated.
1144 if (!C)
1145 return -1;
1146
1147 const unsigned Size = C->getValue().getZExtValue();
1148 const Align DstAlign = *MS->getDestAlign();
1149
1150 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1151 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1152 DstAddrSpace = MS->getDestAddressSpace();
1153 }
1154 else
1155 llvm_unreachable("Expected a memcpy/move or memset!");
1156
1157 unsigned Limit, Factor = 2;
1158 switch(I->getIntrinsicID()) {
1159 case Intrinsic::memcpy:
1160 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1161 break;
1162 case Intrinsic::memmove:
1163 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1164 break;
1165 case Intrinsic::memset:
1166 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1167 Factor = 1;
1168 break;
1169 default:
1170 llvm_unreachable("Expected a memcpy/move or memset!");
1171 }
1172
1173 // MemOps will be populated with a list of data types that need to be
1174 // loaded and stored. That's why we multiply the number of elements by 2 to
1175 // get the cost for this memcpy.
1176 std::vector<EVT> MemOps;
1177 if (getTLI()->findOptimalMemOpLowering(
1178 MemOps, Limit, MOp, DstAddrSpace,
1179 SrcAddrSpace, F->getAttributes()))
1180 return MemOps.size() * Factor;
1181
1182 // If we can't find an optimal memop lowering, return the default cost
1183 return -1;
1184}
1185
1186InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1187 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1188
1189 // To model the cost of a library call, we assume 1 for the call, and
1190 // 3 for the argument setup.
1191 if (NumOps == -1)
1192 return 4;
1193 return NumOps;
1194}
1195
1196InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1197 VectorType *Tp, ArrayRef<int> Mask,
1198 int Index, VectorType *SubTp) {
1199 Kind = improveShuffleKindFromMask(Kind, Mask);
1200 if (ST->hasNEON()) {
1201 if (Kind == TTI::SK_Broadcast) {
1202 static const CostTblEntry NEONDupTbl[] = {
1203 // VDUP handles these cases.
1204 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1205 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1206 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1207 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1208 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1209 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1210
1211 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1212 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1213 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1214 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1215
1216 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1217 if (const auto *Entry =
1218 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1219 return LT.first * Entry->Cost;
1220 }
1221 if (Kind == TTI::SK_Reverse) {
1222 static const CostTblEntry NEONShuffleTbl[] = {
1223 // Reverse shuffle cost one instruction if we are shuffling within a
1224 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1225 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1226 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1227 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1228 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1229 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1230 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1231
1232 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1233 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1234 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1235 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1236
1237 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1238 if (const auto *Entry =
1239 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1240 return LT.first * Entry->Cost;
1241 }
1242 if (Kind == TTI::SK_Select) {
1243 static const CostTblEntry NEONSelShuffleTbl[] = {
1244 // Select shuffle cost table for ARM. Cost is the number of
1245 // instructions
1246 // required to create the shuffled vector.
1247
1248 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1249 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1250 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1251 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1252
1253 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1254 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1255 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1256
1257 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1258
1259 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1260
1261 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1262 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1263 ISD::VECTOR_SHUFFLE, LT.second))
1264 return LT.first * Entry->Cost;
1265 }
1266 }
1267 if (ST->hasMVEIntegerOps()) {
1268 if (Kind == TTI::SK_Broadcast) {
1269 static const CostTblEntry MVEDupTbl[] = {
1270 // VDUP handles these cases.
1271 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1274 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1275 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1276
1277 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1278 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1279 LT.second))
1280 return LT.first * Entry->Cost *
1281 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1282 }
1283
1284 if (!Mask.empty()) {
1285 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1286 if (Mask.size() <= LT.second.getVectorNumElements() &&
1287 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1288 isVREVMask(Mask, LT.second, 64)))
1289 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1290 }
1291 }
1292
1293 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1294 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1295 : 1;
1296 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1297}
1298
1299InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1300 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1301 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1302 TTI::OperandValueProperties Opd1PropInfo,
1303 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1304 const Instruction *CxtI) {
1305 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1306 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1307 // Make operations on i1 relatively expensive as this often involves
1308 // combining predicates. AND and XOR should be easier to handle with IT
1309 // blocks.
1310 switch (ISDOpcode) {
1311 default:
1312 break;
1313 case ISD::AND:
1314 case ISD::XOR:
1315 return 2;
1316 case ISD::OR:
1317 return 3;
1318 }
1319 }
1320
1321 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1322
1323 if (ST->hasNEON()) {
1324 const unsigned FunctionCallDivCost = 20;
1325 const unsigned ReciprocalDivCost = 10;
1326 static const CostTblEntry CostTbl[] = {
1327 // Division.
1328 // These costs are somewhat random. Choose a cost of 20 to indicate that
1329      // vectorizing division (added function call) is going to be very expensive.
1330 // Double registers types.
1331 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1332 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1333 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1334 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1335 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1336 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1337 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1338 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1339 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1340 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1341 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1342 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1343 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1344 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1345 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1346 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1347 // Quad register types.
1348 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1349 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1350 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1351 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1352 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1353 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1354 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1355 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1356 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1357 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1358 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1359 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1360 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1361 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1362 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1363 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1364 // Multiplication.
1365 };
1366
1367 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1368 return LT.first * Entry->Cost;
1369
1370 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1371 Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1372
1373 // This is somewhat of a hack. The problem that we are facing is that SROA
1374 // creates a sequence of shift, and, or instructions to construct values.
1375 // These sequences are recognized by the ISel and have zero-cost. Not so for
1376 // the vectorized code. Because we have support for v2i64 but not i64 those
1377 // sequences look particularly beneficial to vectorize.
1378 // To work around this we increase the cost of v2i64 operations to make them
1379 // seem less beneficial.
1380 if (LT.second == MVT::v2i64 &&
1381 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
1382 Cost += 4;
1383
1384 return Cost;
1385 }
1386
1387 // If this operation is a shift on arm/thumb2, it might well be folded into
1388 // the following instruction, hence having a cost of 0.
1389 auto LooksLikeAFreeShift = [&]() {
1390 if (ST->isThumb1Only() || Ty->isVectorTy())
1391 return false;
1392
1393 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1394 return false;
1395 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1396 return false;
1397
1398 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1399 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1400 case Instruction::Add:
1401 case Instruction::Sub:
1402 case Instruction::And:
1403 case Instruction::Xor:
1404 case Instruction::Or:
1405 case Instruction::ICmp:
1406 return true;
1407 default:
1408 return false;
1409 }
1410 };
1411 if (LooksLikeAFreeShift())
1412 return 0;
1413
1414 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1415 // for "multiple beats" potentially needed by MVE instructions.
1416 int BaseCost = 1;
1417 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1418 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1419
1420 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1421  // without treating floats as more expensive than scalars or increasing the
1422  // costs for custom operations. The result is also multiplied by the
1423 // MVEVectorCostFactor where appropriate.
1424 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1425 return LT.first * BaseCost;
1426
1427 // Else this is expand, assume that we need to scalarize this op.
1428 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1429 unsigned Num = VTy->getNumElements();
1430 InstructionCost Cost =
1431 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1432 // Return the cost of multiple scalar invocation plus the cost of
1433 // inserting and extracting the values.
1434 SmallVector<Type *> Tys(Args.size(), Ty);
1435 return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1436 }
1437
1438 return BaseCost;
1439}
1440
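LooksLikeAFreeShift above encodes the ARM/Thumb-2 shifter operand: when a shift's only user is one of the listed instructions, the shift is normally folded into that instruction's operand and costs nothing extra. A source-level illustration of the distinction, assuming an ARM/Thumb-2 target at -O2 (actual codegen is not guaranteed):

  // The shift feeding the add can use the add's shifter operand, e.g.
  // "add r0, r0, r1, lsl #2", which is why the hook above returns 0 for it.
  unsigned addScaled(unsigned Base, unsigned Idx) { return Base + (Idx << 2); }

  // A shift feeding a multiply has no shifter-operand form, so it is costed
  // as a separate instruction.
  unsigned mulScaled(unsigned Base, unsigned Idx) { return Base * (Idx << 2); }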
1441InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1442 MaybeAlign Alignment,
1443 unsigned AddressSpace,
1444 TTI::TargetCostKind CostKind,
1445 const Instruction *I) {
1446 // TODO: Handle other cost kinds.
1447 if (CostKind != TTI::TCK_RecipThroughput)
1448 return 1;
1449
1450 // Type legalization can't handle structs
1451 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1452 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1453 CostKind);
1454
1455 if (ST->hasNEON() && Src->isVectorTy() &&
1456 (Alignment && *Alignment != Align(16)) &&
1457 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1458 // Unaligned loads/stores are extremely inefficient.
1459    // We need 4 uops for vst.1/vld.1 vs. 1 uop for vldr/vstr.
1460 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1461 return LT.first * 4;
1462 }
1463
1464 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1465 // Same for stores.
1466 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1467 ((Opcode == Instruction::Load && I->hasOneUse() &&
1468 isa<FPExtInst>(*I->user_begin())) ||
1469 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1470 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1471 Type *DstTy =
1472 Opcode == Instruction::Load
1473 ? (*I->user_begin())->getType()
1474 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1475 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1476 DstTy->getScalarType()->isFloatTy())
1477 return ST->getMVEVectorCostFactor(CostKind);
1478 }
1479
1480 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1481 ? ST->getMVEVectorCostFactor(CostKind)
1482 : 1;
1483 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484 CostKind, I);
1485}
1486
1487InstructionCost
1488ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1489 unsigned AddressSpace,
1490 TTI::TargetCostKind CostKind) {
1491 if (ST->hasMVEIntegerOps()) {
1492 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1493 return ST->getMVEVectorCostFactor(CostKind);
1494 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1495 return ST->getMVEVectorCostFactor(CostKind);
1496 }
1497 if (!isa<FixedVectorType>(Src))
1498 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1499 CostKind);
1500  // Scalar cost, which is currently very high due to the inefficiency of the
1501 // generated code.
1502 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1503}
1504
1505InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1506 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1507 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1508 bool UseMaskForCond, bool UseMaskForGaps) {
1509  assert(Factor >= 2 && "Invalid interleave factor");
1510  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1511
1512 // vldN/vstN doesn't support vector types of i64/f64 element.
1513 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1514
1515 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1516 !UseMaskForCond && !UseMaskForGaps) {
1517 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1518 auto *SubVecTy =
1519 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1520
1521 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1522 // Accesses having vector types that are a multiple of 128 bits can be
1523 // matched to more than one vldN/vstN instruction.
1524 int BaseCost =
1525 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1526 if (NumElts % Factor == 0 &&
1527 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1528 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1529
1530 // Some smaller than legal interleaved patterns are cheap as we can make
1531 // use of the vmovn or vrev patterns to interleave a standard load. This is
1532 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1533 // promoted differently). The cost of 2 here is then a load and vrev or
1534 // vmovn.
1535 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1536 VecTy->isIntOrIntVectorTy() &&
1537 DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1538 return 2 * BaseCost;
1539 }
1540
1541 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1542 Alignment, AddressSpace, CostKind,
1543 UseMaskForCond, UseMaskForGaps);
1544}
1545
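The isLegalInterleavedAccessType path above corresponds to loops that walk an array of structures, where VLD2/VLD3/VLD4 (or their MVE equivalents) can load and deinterleave the fields in one access. A hedged C-level illustration of a factor-2 pattern; whether it actually becomes a vld2 depends on the vectorizer and the rest of the cost model:

  struct Pt { short x, y; };          // two interleaved i16 fields (Factor == 2)

  int sumX(const Pt *P, int N) {
    int S = 0;
    for (int i = 0; i < N; ++i)
      S += P[i].x;                    // stride-2 element access: a vld2 candidate
    return S;
  }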
1546InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1547 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1548 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1549 using namespace PatternMatch;
1550 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1551 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1552 Alignment, CostKind, I);
1553
1554  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1555 auto *VTy = cast<FixedVectorType>(DataTy);
1556
1557 // TODO: Splitting, once we do that.
1558
1559 unsigned NumElems = VTy->getNumElements();
1560 unsigned EltSize = VTy->getScalarSizeInBits();
1561 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1562
1563 // For now, it is assumed that for the MVE gather instructions the loads are
1564 // all effectively serialised. This means the cost is the scalar cost
1565 // multiplied by the number of elements being loaded. This is possibly very
1566 // conservative, but even so we still end up vectorising loops because the
1567 // cost per iteration for many loops is lower than for scalar loops.
1568 InstructionCost VectorCost =
1569 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1570 // The scalarization cost should be a lot higher. We use the number of vector
1571 // elements plus the scalarization overhead.
1572 InstructionCost ScalarCost =
1573 NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1574 BaseT::getScalarizationOverhead(VTy, false, true);
1575
1576 if (EltSize < 8 || Alignment < EltSize / 8)
1577 return ScalarCost;
1578
1579 unsigned ExtSize = EltSize;
1580 // Check whether there's a single user that asks for an extended type
1581 if (I != nullptr) {
1582    // Depending on the caller of this function, a gather instruction will
1583    // either have opcode Instruction::Load or be a call to the masked_gather
1584    // intrinsic.
1585 if ((I->getOpcode() == Instruction::Load ||
1586 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1587 I->hasOneUse()) {
1588 const User *Us = *I->users().begin();
1589 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1590 // only allow valid type combinations
1591 unsigned TypeSize =
1592 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1593 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1594 (TypeSize == 16 && EltSize == 8)) &&
1595 TypeSize * NumElems == 128) {
1596 ExtSize = TypeSize;
1597 }
1598 }
1599 }
1600 // Check whether the input data needs to be truncated
1601 TruncInst *T;
1602 if ((I->getOpcode() == Instruction::Store ||
1603 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1604 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1605 // Only allow valid type combinations
1606 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1607 if (((EltSize == 16 && TypeSize == 32) ||
1608 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1609 TypeSize * NumElems == 128)
1610 ExtSize = TypeSize;
1611 }
1612 }
1613
1614 if (ExtSize * NumElems != 128 || NumElems < 4)
1615 return ScalarCost;
1616
1617 // Any (aligned) i32 gather will not need to be scalarised.
1618 if (ExtSize == 32)
1619 return VectorCost;
1620 // For smaller types, we need to ensure that the gep's inputs are correctly
1621 // extended from a small enough value. Other sizes (including i64) are
1622 // scalarized for now.
1623 if (ExtSize != 8 && ExtSize != 16)
1624 return ScalarCost;
1625
1626 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1627 Ptr = BC->getOperand(0);
1628 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1629 if (GEP->getNumOperands() != 2)
1630 return ScalarCost;
1631 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1632 // Scale needs to be correct (which is only relevant for i16s).
1633 if (Scale != 1 && Scale * 8 != ExtSize)
1634 return ScalarCost;
1635 // And we need to zext (not sext) the indexes from a small enough type.
1636 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1637 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1638 return VectorCost;
1639 }
1640 return ScalarCost;
1641 }
1642 return ScalarCost;
1643}
1644
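The comments above define the two candidate costs this function chooses between: VectorCost treats the gather's loads as serialised (NumElems * LT.first * the MVE cost factor), while ScalarCost adds insert/extract overhead on top of the scalar loads. A small sketch with assumed numbers, purely to make the comparison concrete; the real overhead value comes from getScalarizationOverhead:

  #include <cstdio>

  int main() {
    // Assumed example: a v4i32 gather that legalizes to one MVE vector
    // (LT.first = 1) on a core whose MVEVectorCostFactor is 2.
    unsigned NumElems = 4, LTFirst = 1, MVEFactor = 2;
    unsigned InsertExtractOverhead = 8;   // assumption standing in for getScalarizationOverhead

    unsigned VectorCost = NumElems * LTFirst * MVEFactor;             // 8
    unsigned ScalarCost = NumElems * LTFirst + InsertExtractOverhead; // 12

    std::printf("vector=%u scalar=%u -> %s\n", VectorCost, ScalarCost,
                VectorCost < ScalarCost ? "use the gather cost" : "scalarise");
  }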
1645InstructionCost
1646ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1647 Optional<FastMathFlags> FMF,
1648 TTI::TargetCostKind CostKind) {
1649 if (TTI::requiresOrderedReduction(FMF))
1650 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1651
1652 EVT ValVT = TLI->getValueType(DL, ValTy);
1653 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1654 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1655 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1656
1657 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1658
1659 static const CostTblEntry CostTblAdd[]{
1660 {ISD::ADD, MVT::v16i8, 1},
1661 {ISD::ADD, MVT::v8i16, 1},
1662 {ISD::ADD, MVT::v4i32, 1},
1663 };
1664 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1665 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1666
1667 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1668}
1669
1670InstructionCost
1671ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1672 Type *ResTy, VectorType *ValTy,
1673 TTI::TargetCostKind CostKind) {
1674 EVT ValVT = TLI->getValueType(DL, ValTy);
1675 EVT ResVT = TLI->getValueType(DL, ResTy);
1676
1677 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1678 std::pair<InstructionCost, MVT> LT =
1679 TLI->getTypeLegalizationCost(DL, ValTy);
1680
1681 // The legal cases are:
1682 // VADDV u/s 8/16/32
1683 // VMLAV u/s 8/16/32
1684 // VADDLV u/s 32
1685 // VMLALV u/s 16/32
1686 // Codegen currently cannot always handle larger than legal vectors very
1687 // well, especially for predicated reductions where the mask needs to be
1688 // split, so restrict to 128bit or smaller input types.
1689 unsigned RevVTSize = ResVT.getSizeInBits();
1690 if (ValVT.getSizeInBits() <= 128 &&
1691 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1692 (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1693 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1694 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1695 }
1696
1697 return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1698 CostKind);
1699}
1700
1701InstructionCost
1702ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1703 TTI::TargetCostKind CostKind) {
1704 switch (ICA.getID()) {
1705 case Intrinsic::get_active_lane_mask:
1706 // Currently we make a somewhat optimistic assumption that
1707 // active_lane_mask's are always free. In reality it may be freely folded
1708 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1709 // of add/icmp code. We may need to improve this in the future, but being
1710 // able to detect if it is free or not involves looking at a lot of other
1711 // code. We currently assume that the vectorizer inserted these, and knew
1712 // what it was doing in adding one.
1713 if (ST->hasMVEIntegerOps())
1714 return 0;
1715 break;
1716 case Intrinsic::sadd_sat:
1717 case Intrinsic::ssub_sat:
1718 case Intrinsic::uadd_sat:
1719 case Intrinsic::usub_sat: {
1720 if (!ST->hasMVEIntegerOps())
1721 break;
1722 Type *VT = ICA.getReturnType();
1723
1724 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1725 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1726 LT.second == MVT::v16i8) {
1727 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1728 // need to extend the type, as it uses shr(qadd(shl, shl)).
1729 unsigned Instrs =
1730 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1731 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1732 }
1733 break;
1734 }
1735 case Intrinsic::abs:
1736 case Intrinsic::smin:
1737 case Intrinsic::smax:
1738 case Intrinsic::umin:
1739 case Intrinsic::umax: {
1740 if (!ST->hasMVEIntegerOps())
1741 break;
1742 Type *VT = ICA.getReturnType();
1743
1744 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1745 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1746 LT.second == MVT::v16i8)
1747 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1748 break;
1749 }
1750 case Intrinsic::minnum:
1751 case Intrinsic::maxnum: {
1752 if (!ST->hasMVEFloatOps())
1753 break;
1754 Type *VT = ICA.getReturnType();
1755 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1756 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1757 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1758 break;
1759 }
1760 }
1761
1762 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1763}
1764
1765bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1766 if (!F->isIntrinsic())
1767 BaseT::isLoweredToCall(F);
1768
1769 // Assume all Arm-specific intrinsics map to an instruction.
1770 if (F->getName().startswith("llvm.arm"))
1771 return false;
1772
1773 switch (F->getIntrinsicID()) {
1774 default: break;
1775 case Intrinsic::powi:
1776 case Intrinsic::sin:
1777 case Intrinsic::cos:
1778 case Intrinsic::pow:
1779 case Intrinsic::log:
1780 case Intrinsic::log10:
1781 case Intrinsic::log2:
1782 case Intrinsic::exp:
1783 case Intrinsic::exp2:
1784 return true;
1785 case Intrinsic::sqrt:
1786 case Intrinsic::fabs:
1787 case Intrinsic::copysign:
1788 case Intrinsic::floor:
1789 case Intrinsic::ceil:
1790 case Intrinsic::trunc:
1791 case Intrinsic::rint:
1792 case Intrinsic::nearbyint:
1793 case Intrinsic::round:
1794 case Intrinsic::canonicalize:
1795 case Intrinsic::lround:
1796 case Intrinsic::llround:
1797 case Intrinsic::lrint:
1798 case Intrinsic::llrint:
1799 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1800 return true;
1801 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1802 return true;
1803 // Some operations can be handled by vector instructions and assume
1804 // unsupported vectors will be expanded into supported scalar ones.
1805 // TODO Handle scalar operations properly.
1806 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1807 case Intrinsic::masked_store:
1808 case Intrinsic::masked_load:
1809 case Intrinsic::masked_gather:
1810 case Intrinsic::masked_scatter:
1811 return !ST->hasMVEIntegerOps();
1812 case Intrinsic::sadd_with_overflow:
1813 case Intrinsic::uadd_with_overflow:
1814 case Intrinsic::ssub_with_overflow:
1815 case Intrinsic::usub_with_overflow:
1816 case Intrinsic::sadd_sat:
1817 case Intrinsic::uadd_sat:
1818 case Intrinsic::ssub_sat:
1819 case Intrinsic::usub_sat:
1820 return false;
1821 }
1822
1823 return BaseT::isLoweredToCall(F);
1824}
1825
1826bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1827 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1828 EVT VT = TLI->getValueType(DL, I.getType(), true);
1829 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1830 return true;
1831
1832 // Check if an intrinsic will be lowered to a call and assume that any
1833 // other CallInst will generate a bl.
1834 if (auto *Call = dyn_cast<CallInst>(&I)) {
1835 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1836 switch(II->getIntrinsicID()) {
1837 case Intrinsic::memcpy:
1838 case Intrinsic::memset:
1839 case Intrinsic::memmove:
1840 return getNumMemOps(II) == -1;
1841 default:
1842 if (const Function *F = Call->getCalledFunction())
1843 return isLoweredToCall(F);
1844 }
1845 }
1846 return true;
1847 }
1848
1849 // FPv5 provides conversions between integer, double-precision,
1850 // single-precision, and half-precision formats.
1851 switch (I.getOpcode()) {
1852 default:
1853 break;
1854 case Instruction::FPToSI:
1855 case Instruction::FPToUI:
1856 case Instruction::SIToFP:
1857 case Instruction::UIToFP:
1858 case Instruction::FPTrunc:
1859 case Instruction::FPExt:
1860 return !ST->hasFPARMv8Base();
1861 }
1862
1863 // FIXME: Unfortunately the approach of checking the Operation Action does
1864 // not catch all cases of Legalization that use library calls. Our
1865 // Legalization step categorizes some transformations into library calls as
1866 // Custom, Expand or even Legal when doing type legalization. So for now
1867 // we have to special case for instance the SDIV of 64bit integers and the
1868 // use of floating point emulation.
1869 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1870 switch (ISD) {
1871 default:
1872 break;
1873 case ISD::SDIV:
1874 case ISD::UDIV:
1875 case ISD::SREM:
1876 case ISD::UREM:
1877 case ISD::SDIVREM:
1878 case ISD::UDIVREM:
1879 return true;
1880 }
1881 }
1882
1883 // Assume all other non-float operations are supported.
1884 if (!VT.isFloatingPoint())
1885 return false;
1886
1887 // We'll need a library call to handle most floats when using soft.
1888 if (TLI->useSoftFloat()) {
1889 switch (I.getOpcode()) {
1890 default:
1891 return true;
1892 case Instruction::Alloca:
1893 case Instruction::Load:
1894 case Instruction::Store:
1895 case Instruction::Select:
1896 case Instruction::PHI:
1897 return false;
1898 }
1899 }
1900
1901 // We'll need a libcall to perform double precision operations on a single
1902 // precision only FPU.
1903 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1904 return true;
1905
1906 // Likewise for half precision arithmetic.
1907 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1908 return true;
1909
1910 return false;
1911}
1912
1913bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1914 AssumptionCache &AC,
1915 TargetLibraryInfo *LibInfo,
1916 HardwareLoopInfo &HWLoopInfo) {
1917 // Low-overhead branches are only supported in the 'low-overhead branch'
1918 // extension of v8.1-m.
1919 if (!ST->hasLOB() || DisableLowOverheadLoops) {
1920    LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1921 return false;
1922 }
1923
1924 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1925    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1926 return false;
1927 }
1928
1929 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1930 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1931    LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1932 return false;
1933 }
1934
1935 const SCEV *TripCountSCEV =
1936 SE.getAddExpr(BackedgeTakenCount,
1937 SE.getOne(BackedgeTakenCount->getType()));
1938
1939 // We need to store the trip count in LR, a 32-bit register.
1940 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1941    LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1942 return false;
1943 }
1944
1945 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1946 // point in generating a hardware loop if that's going to happen.
1947
1948 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1949 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1950 switch (Call->getIntrinsicID()) {
1951 default:
1952 break;
1953 case Intrinsic::start_loop_iterations:
1954 case Intrinsic::test_start_loop_iterations:
1955 case Intrinsic::loop_decrement:
1956 case Intrinsic::loop_decrement_reg:
1957 return true;
1958 }
1959 }
1960 return false;
1961 };
1962
1963 // Scan the instructions to see if there's any that we know will turn into a
1964 // call or if this loop is already a low-overhead loop or will become a tail
1965 // predicated loop.
1966 bool IsTailPredLoop = false;
1967 auto ScanLoop = [&](Loop *L) {
1968 for (auto *BB : L->getBlocks()) {
1969 for (auto &I : *BB) {
1970 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1971 isa<InlineAsm>(I)) {
1972          LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1973 return false;
1974 }
1975 if (auto *II = dyn_cast<IntrinsicInst>(&I))
1976 IsTailPredLoop |=
1977 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1978 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1979 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1980 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1981 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1982 }
1983 }
1984 return true;
1985 };
1986
1987 // Visit inner loops.
1988 for (auto Inner : *L)
1989 if (!ScanLoop(Inner))
1990 return false;
1991
1992 if (!ScanLoop(L))
1993 return false;
1994
1995 // TODO: Check whether the trip count calculation is expensive. If L is the
1996 // inner loop but we know it has a low trip count, calculating that trip
1997 // count (in the parent loop) may be detrimental.
1998
1999 LLVMContext &C = L->getHeader()->getContext();
2000 HWLoopInfo.CounterInReg = true;
2001 HWLoopInfo.IsNestingLegal = false;
2002 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2003 HWLoopInfo.CountType = Type::getInt32Ty(C);
2004 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2005 return true;
2006}
2007
2008static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2009 // We don't allow icmp's, and because we only look at single block loops,
2010 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2011 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2012 return false;
2013 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2014 // not currently canonical, but soon will be. Code without them uses icmp, and
2015 // so is not tail predicated as per the condition above. In order to get the
2016 // same performance we treat min and max the same as an icmp for tailpred
2017 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2018  // pick more optimal instructions like VQDMULH. They need to be recognized
2019 // directly by the vectorizer).
2020 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2021 if ((II->getIntrinsicID() == Intrinsic::smin ||
2022 II->getIntrinsicID() == Intrinsic::smax ||
2023 II->getIntrinsicID() == Intrinsic::umin ||
2024 II->getIntrinsicID() == Intrinsic::umax) &&
2025 ++ICmpCount > 1)
2026 return false;
2027
2028 if (isa<FCmpInst>(&I))
2029 return false;
2030
2031 // We could allow extending/narrowing FP loads/stores, but codegen is
2032 // too inefficient so reject this for now.
2033 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2034 return false;
2035
2036 // Extends have to be extending-loads
2037 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2038 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2039 return false;
2040
2041 // Truncs have to be narrowing-stores
2042 if (isa<TruncInst>(&I) )
2043 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2044 return false;
2045
2046 return true;
2047}
2048
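In source terms, these rules favour loops whose operations all work on the same element size, with any extend fed directly by a load and any trunc feeding a store. Together with the 32-bit element-size limit enforced in canTailPredicateLoop below, that gives a rough picture of what qualifies. A hedged C illustration; whether either loop is actually tail-predicated also depends on the remaining checks and on the vectorizer:

  // Likely acceptable: uniform i32 elements and a single backedge compare.
  void scale(int *A, const int *B, int N) {
    for (int i = 0; i < N; ++i)
      A[i] = B[i] * 3;
  }

  // Likely rejected: the i64 accumulator exceeds the 32-bit element-size limit
  // checked via getScalarSizeInBits() in canTailPredicateLoop.
  long long sum64(const int *B, int N) {
    long long S = 0;
    for (int i = 0; i < N; ++i)
      S += B[i];
    return S;
  }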
2049// To set up a tail-predicated loop, we need to know the total number of
2050// elements processed by that loop. Thus, we need to determine the element
2051// size and:
2052// 1) it should be uniform for all operations in the vector loop, so we
2053// e.g. don't want any widening/narrowing operations.
2054// 2) it should be smaller than i64s because we don't have vector operations
2055// that work on i64s.
2056// 3) we don't want elements to be reversed or shuffled, to make sure the
2057// tail-predication masks/predicates the right lanes.
2058//
2059static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2060 const DataLayout &DL,
2061 const LoopAccessInfo *LAI) {
2062  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2063
2064 // If there are live-out values, it is probably a reduction. We can predicate
2065 // most reduction operations freely under MVE using a combination of
2066 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2067 // floating point and integer reductions, but don't check for operators
2068 // specifically here. If the value ends up not being a reduction (and so the
2069 // vectorizer cannot tailfold the loop), we should fall back to standard
2070 // vectorization automatically.
2071 SmallVector< Instruction *, 8 > LiveOuts;
2072 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2073 bool ReductionsDisabled =
2074 EnableTailPredication == TailPredication::EnabledNoReductions ||
2075 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2076
2077 for (auto *I : LiveOuts) {
2078 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2079 !I->getType()->isHalfTy()) {
2080      LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2081                           "live-out value\n");
2082 return false;
2083 }
2084 if (ReductionsDisabled) {
2085      LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2086 return false;
2087 }
2088 }
2089
2090 // Next, check that all instructions can be tail-predicated.
2091 PredicatedScalarEvolution PSE = LAI->getPSE();
2092 SmallVector<Instruction *, 16> LoadStores;
2093 int ICmpCount = 0;
2094
2095 for (BasicBlock *BB : L->blocks()) {
2096 for (Instruction &I : BB->instructionsWithoutDebug()) {
2097 if (isa<PHINode>(&I))
2098 continue;
2099 if (!canTailPredicateInstruction(I, ICmpCount)) {
2100        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2101 return false;
2102 }
2103
2104 Type *T = I.getType();
2105 if (T->isPointerTy())
2106 T = T->getPointerElementType();
2107
2108 if (T->getScalarSizeInBits() > 32) {
2109        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2110 return false;
2111 }
2112 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2113 Value *Ptr = getLoadStorePointerOperand(&I);
2114 Type *AccessTy = getLoadStoreType(&I);
2115 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2116 if (NextStride == 1) {
2117 // TODO: for now only allow consecutive strides of 1. We could support
2118 // other strides as long as it is uniform, but let's keep it simple
2119 // for now.
2120 continue;
2121 } else if (NextStride == -1 ||
2122 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2123 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2124          LLVM_DEBUG(dbgs()
2125                     << "Consecutive strides of 2 found, vld2/vstr2 can't "
2126                        "be tail-predicated\n.");
2127 return false;
2128 // TODO: don't tail predicate if there is a reversed load?
2129 } else if (EnableMaskedGatherScatters) {
2130 // Gather/scatters do allow loading from arbitrary strides, at
2131 // least if they are loop invariant.
2132 // TODO: Loop variant strides should in theory work, too, but
2133 // this requires further testing.
2134 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2135 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2136 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2137 if (PSE.getSE()->isLoopInvariant(Step, L))
2138 continue;
2139 }
2140 }
2141        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2142                             "tail-predicate\n.");
2143 return false;
2144 }
2145 }
2146 }
2147
2148  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2149 return true;
2150}
2151
2152bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2153 ScalarEvolution &SE,
2154 AssumptionCache &AC,
2155 TargetLibraryInfo *TLI,
2156 DominatorTree *DT,
2157 const LoopAccessInfo *LAI) {
2158 if (!EnableTailPredication) {
2159    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2160 return false;
2161 }
2162
2163 // Creating a predicated vector loop is the first step for generating a
2164 // tail-predicated hardware loop, for which we need the MVE masked
2165 // load/stores instructions:
2166 if (!ST->hasMVEIntegerOps())
2167 return false;
2168
2169 // For now, restrict this to single block loops.
2170 if (L->getNumBlocks() > 1) {
2171    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2172                         "loop.\n");
2173 return false;
2174 }
2175
2176  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2177
2178 HardwareLoopInfo HWLoopInfo(L);
2179 if (!HWLoopInfo.canAnalyze(*LI)) {
2180    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2181                         "analyzable.\n");
2182 return false;
2183 }
2184
2185 // This checks if we have the low-overhead branch architecture
2186 // extension, and if we will create a hardware-loop:
2187 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2188    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2189                         "profitable.\n");
2190 return false;
2191 }
2192
2193 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2194    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2195                         "a candidate.\n");
2196 return false;
2197 }
2198
2199 return canTailPredicateLoop(L, LI, SE, DL, LAI);
2200}
2201
2202bool ARMTTIImpl::emitGetActiveLaneMask() const {
2203 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2204 return false;
2205
2206 // Intrinsic @llvm.get.active.lane.mask is supported.
2207 // It is used in the MVETailPredication pass, which requires the number of
2208 // elements processed by this vector loop to setup the tail-predicated
2209 // loop.
2210 return true;
2211}
2212void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2213 TTI::UnrollingPreferences &UP,
2214 OptimizationRemarkEmitter *ORE) {
2215  // Enable upper-bound unrolling universally, not dependent upon the conditions
2216 // below.
2217 UP.UpperBound = true;
2218
2219 // Only currently enable these preferences for M-Class cores.
2220 if (!ST->isMClass())
2221 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2222
2223 // Disable loop unrolling for Oz and Os.
2224 UP.OptSizeThreshold = 0;
2225 UP.PartialOptSizeThreshold = 0;
2226 if (L->getHeader()->getParent()->hasOptSize())
2227 return;
2228
2229 SmallVector<BasicBlock*, 4> ExitingBlocks;
2230 L->getExitingBlocks(ExitingBlocks);
2231  LLVM_DEBUG(dbgs() << "Loop has:\n"
2232                    << "Blocks: " << L->getNumBlocks() << "\n"
2233                    << "Exit blocks: " << ExitingBlocks.size() << "\n");
2234
2235 // Only allow another exit other than the latch. This acts as an early exit
2236 // as it mirrors the profitability calculation of the runtime unroller.
2237 if (ExitingBlocks.size() > 2)
2238 return;
2239
2240 // Limit the CFG of the loop body for targets with a branch predictor.
2241 // Allowing 4 blocks permits if-then-else diamonds in the body.
2242 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2243 return;
2244
2245 // Don't unroll vectorized loops, including the remainder loop
2246 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2247 return;
2248
2249 // Scan the loop: don't unroll loops with calls as this could prevent
2250 // inlining.
2251 InstructionCost Cost = 0;
2252 for (auto *BB : L->getBlocks()) {
2253 for (auto &I : *BB) {
2254      // Don't unroll vectorised loops. MVE does not benefit from it as much as
2255 // scalar code.
2256 if (I.getType()->isVectorTy())
2257 return;
2258
2259 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2260 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2261 if (!isLoweredToCall(F))
2262 continue;
2263 }
2264 return;
2265 }
2266
2267 SmallVector<const Value*, 4> Operands(I.operand_values());
2268 Cost +=
2269 getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2270 }
2271 }
2272
2273 // On v6m cores, there are very few registers available. We can easily end up
2274 // spilling and reloading more registers in an unrolled loop. Look at the
2275 // number of LCSSA phis as a rough measure of how many registers will need to
2276 // be live out of the loop, reducing the default unroll count if more than 1
2277 // value is needed. In the long run, all of this should be being learnt by a
2278 // machine.
2279 unsigned UnrollCount = 4;
2280 if (ST->isThumb1Only()) {
2281 unsigned ExitingValues = 0;
2282 SmallVector<BasicBlock *, 4> ExitBlocks;
2283 L->getExitBlocks(ExitBlocks);
2284 for (auto *Exit : ExitBlocks) {
2285 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2286 // only the last is expected to be needed for address operands.
2287 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2288 return PH.getNumOperands() != 1 ||
2289 !isa<GetElementPtrInst>(PH.getOperand(0));
2290 });
2291 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2292 }
2293 if (ExitingValues)
2294 UnrollCount /= ExitingValues;
2295 if (UnrollCount <= 1)
2296 return;
2297 }
2298
2299  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2300  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2301
2302 UP.Partial = true;
2303 UP.Runtime = true;
2304 UP.UnrollRemainder = true;
2305 UP.DefaultUnrollRuntimeCount = UnrollCount;
2306 UP.UnrollAndJam = true;
2307 UP.UnrollAndJamInnerLoopThreshold = 60;
2308
2309  // Force-unrolling small loops can be very useful because of the branch-taken
2310  // cost of the backedge.
2311 if (Cost < 12)
2312 UP.Force = true;
2313}
2314
2315void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2316 TTI::PeelingPreferences &PP) {
2317 BaseT::getPeelingPreferences(L, SE, PP);
2318}
2319
2320bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2321 TTI::ReductionFlags Flags) const {
2322 if (!ST->hasMVEIntegerOps())
2323 return false;
2324
2325 unsigned ScalarBits = Ty->getScalarSizeInBits();
2326 switch (Opcode) {
2327 case Instruction::Add:
2328 return ScalarBits <= 64;
2329 default:
2330 return false;
2331 }
2332}
2333
2334bool ARMTTIImpl::preferPredicatedReductionSelect(
2335 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2336 if (!ST->hasMVEIntegerOps())
2337 return false;
2338 return true;
2339}

/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h

1//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the ARM addressing mode implementation stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
14#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
15
16#include "llvm/ADT/APFloat.h"
17#include "llvm/ADT/APInt.h"
18#include "llvm/ADT/bit.h"
19#include "llvm/Support/ErrorHandling.h"
20#include "llvm/Support/MathExtras.h"
21#include <cassert>
22
23namespace llvm {
24
25/// ARM_AM - ARM Addressing Mode Stuff
26namespace ARM_AM {
27 enum ShiftOpc {
28 no_shift = 0,
29 asr,
30 lsl,
31 lsr,
32 ror,
33 rrx,
34 uxtw
35 };
36
37 enum AddrOpc {
38 sub = 0,
39 add
40 };
41
42 inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
43
44 inline const char *getShiftOpcStr(ShiftOpc Op) {
45 switch (Op) {
46    default: llvm_unreachable("Unknown shift opc!");
47 case ARM_AM::asr: return "asr";
48 case ARM_AM::lsl: return "lsl";
49 case ARM_AM::lsr: return "lsr";
50 case ARM_AM::ror: return "ror";
51 case ARM_AM::rrx: return "rrx";
52 case ARM_AM::uxtw: return "uxtw";
53 }
54 }
55
56 inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
57 switch (Op) {
58    default: llvm_unreachable("Unknown shift opc!");
59 case ARM_AM::asr: return 2;
60 case ARM_AM::lsl: return 0;
61 case ARM_AM::lsr: return 1;
62 case ARM_AM::ror: return 3;
63 }
64 }
65
66 enum AMSubMode {
67 bad_am_submode = 0,
68 ia,
69 ib,
70 da,
71 db
72 };
73
74 inline const char *getAMSubModeStr(AMSubMode Mode) {
75 switch (Mode) {
76    default: llvm_unreachable("Unknown addressing sub-mode!");
77 case ARM_AM::ia: return "ia";
78 case ARM_AM::ib: return "ib";
79 case ARM_AM::da: return "da";
80 case ARM_AM::db: return "db";
81 }
82 }
83
84 /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
85 ///
86 inline unsigned rotr32(unsigned Val, unsigned Amt) {
87    assert(Amt < 32 && "Invalid rotate amount");
88 return (Val >> Amt) | (Val << ((32-Amt)&31));
89 }
90
91 /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
92 ///
93 inline unsigned rotl32(unsigned Val, unsigned Amt) {
94    assert(Amt < 32 && "Invalid rotate amount");
95 return (Val << Amt) | (Val >> ((32-Amt)&31));
96 }
97
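A few concrete values for the two rotate helpers (standalone copies for illustration; the originals additionally assert Amt < 32):

  #include <cassert>

  static unsigned rot_r(unsigned Val, unsigned Amt) {
    return (Val >> Amt) | (Val << ((32 - Amt) & 31));
  }
  static unsigned rot_l(unsigned Val, unsigned Amt) {
    return (Val << Amt) | (Val >> ((32 - Amt) & 31));
  }

  int main() {
    assert(rot_r(0x000000FFu, 8) == 0xFF000000u);  // low byte wraps to the top
    assert(rot_l(0xFF000000u, 8) == 0x000000FFu);  // and back again
    assert(rot_r(0x12345678u, 0) == 0x12345678u);  // Amt == 0 is well defined
  }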
98 //===--------------------------------------------------------------------===//
99 // Addressing Mode #1: shift_operand with registers
100 //===--------------------------------------------------------------------===//
101 //
102 // This 'addressing mode' is used for arithmetic instructions. It can
103 // represent things like:
104 // reg
105 // reg [asr|lsl|lsr|ror|rrx] reg
106 // reg [asr|lsl|lsr|ror|rrx] imm
107 //
108  // This is stored as three operands [rega, regb, opc]. The first is the base
109 // reg, the second is the shift amount (or reg0 if not present or imm). The
110 // third operand encodes the shift opcode and the imm if a reg isn't present.
111 //
112 inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
113 return ShOp | (Imm << 3);
114 }
115 inline unsigned getSORegOffset(unsigned Op) { return Op >> 3; }
116 inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); }
117
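A quick round trip through the three helpers above, using a standalone copy of the encoding: the shift opcode occupies the low three bits and the immediate sits above them.

  #include <cassert>

  enum ShOp { no_shift = 0, asr, lsl, lsr, ror, rrx, uxtw };

  static unsigned encodeSOReg(ShOp Op, unsigned Imm) { return Op | (Imm << 3); }
  static unsigned decodeOffset(unsigned V) { return V >> 3; }
  static ShOp decodeShOp(unsigned V) { return (ShOp)(V & 7); }

  int main() {
    unsigned Enc = encodeSOReg(lsl, 4);   // models "reg, lsl #4"
    assert(decodeShOp(Enc) == lsl);
    assert(decodeOffset(Enc) == 4);
  }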
118 /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
119 /// the 8-bit imm value.
120 inline unsigned getSOImmValImm(unsigned Imm) { return Imm & 0xFF; }
121 /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
122 /// the rotate amount.
123 inline unsigned getSOImmValRot(unsigned Imm) { return (Imm >> 8) * 2; }
124
125 /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
126 /// computing the rotate amount to use. If this immediate value cannot be
127 /// handled with a single shifter-op, determine a good rotate amount that will
128 /// take a maximal chunk of bits out of the immediate.
129 inline unsigned getSOImmValRotate(unsigned Imm) {
130 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
131 // of zero.
132 if ((Imm & ~255U) == 0) return 0;
133
134 // Use CTZ to compute the rotate amount.
135 unsigned TZ = countTrailingZeros(Imm);
136
137 // Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
138 // not 9.
139 unsigned RotAmt = TZ & ~1;
140
141 // If we can handle this spread, return it.
142 if ((rotr32(Imm, RotAmt) & ~255U) == 0)
143 return (32-RotAmt)&31; // HW rotates right, not left.
144
145 // For values like 0xF000000F, we should ignore the low 6 bits, then
146 // retry the hunt.
147 if (Imm & 63U) {
148 unsigned TZ2 = countTrailingZeros(Imm & ~63U);
149 unsigned RotAmt2 = TZ2 & ~1;
150 if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
151 return (32-RotAmt2)&31; // HW rotates right, not left.
152 }
153
154 // Otherwise, we have no way to cover this span of bits with a single
155 // shifter_op immediate. Return a chunk of bits that will be useful to
156 // handle.
157 return (32-RotAmt)&31; // HW rotates right, not left.
158 }
159
160 /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
161 /// into an shifter_operand immediate operand, return the 12-bit encoding for
162 /// it. If not, return -1.
163 inline int getSOImmVal(unsigned Arg) {
164 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
165 // of zero.
166 if ((Arg & ~255U) == 0) return Arg;
167
168 unsigned RotAmt = getSOImmValRotate(Arg);
169
170 // If this cannot be handled with a single shifter_op, bail out.
171 if (rotr32(~255U, RotAmt) & Arg)
172 return -1;
173
174 // Encode this correctly.
175 return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
176 }
177
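A worked value for getSOImmVal: 0x0000FF00 has eight trailing zeros, so getSOImmValRotate picks a hardware right-rotation of 24, and the encoding packs imm8 = 0xFF into the low byte with rot/2 = 12 in bits 8..11, giving 0xCFF. A standalone decode check of that result:

  #include <cassert>

  static unsigned rot_r(unsigned V, unsigned A) { return (V >> A) | (V << ((32 - A) & 31)); }

  int main() {
    // Encoded operand: imm8 in the low 8 bits, rotate/2 in bits 8..11.
    unsigned Enc = 0xCFF;                      // what getSOImmVal(0x0000FF00) produces per the code above
    unsigned Imm8 = Enc & 0xFF;                // 0xFF
    unsigned Rot  = (Enc >> 8) * 2;            // 24
    assert(rot_r(Imm8, Rot) == 0x0000FF00u);   // decoding reproduces the value
  }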
178 /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
179 /// or'ing together two SOImmVal's.
180 inline bool isSOImmTwoPartVal(unsigned V) {
181 // If this can be handled with a single shifter_op, bail out.
182 V = rotr32(~255U, getSOImmValRotate(V)) & V;
183 if (V == 0)
184 return false;
185
186 // If this can be handled with two shifter_op's, accept.
187 V = rotr32(~255U, getSOImmValRotate(V)) & V;
188 return V == 0;
189 }
190
191 /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
192 /// return the first chunk of it.
193 inline unsigned getSOImmTwoPartFirst(unsigned V) {
194 return rotr32(255U, getSOImmValRotate(V)) & V;
195 }
196
197 /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
198 /// return the second chunk of it.
199 inline unsigned getSOImmTwoPartSecond(unsigned V) {
200 // Mask out the first hunk.
201 V = rotr32(~255U, getSOImmValRotate(V)) & V;
202
203 // Take what's left.
204    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
205 return V;
206 }
207
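For example, 0x0000F00F is not a single shifter_operand immediate, but per the helpers above it splits into a first chunk of 0x0000000F and a second chunk of 0x0000F000 (0xF0 rotated right by 24), so isSOImmTwoPartVal accepts it. A small self-contained check of that split:

  #include <cassert>

  int main() {
    unsigned V = 0x0000F00F;
    unsigned First = 0x0000000F;   // getSOImmTwoPartFirst(V) per the code above
    unsigned Second = 0x0000F000;  // getSOImmTwoPartSecond(V) per the code above
    assert((First | Second) == V); // the two chunks rebuild the value
    assert((First & Second) == 0); // and they do not overlap
  }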
208 /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained
209  /// by two SOImmVals such that -V = First + Second.
210 /// "R+V" can be optimized to (sub (sub R, First), Second).
211 /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second).
212 inline bool isSOImmTwoPartValNeg(unsigned V) {
213 unsigned First;
214 if (!isSOImmTwoPartVal(-V))
215 return false;
216 // Return false if ~(-First) is not a SoImmval.
217 First = getSOImmTwoPartFirst(-V);
218 First = ~(-First);
219 return !(rotr32(~255U, getSOImmValRotate(First)) & First);
220 }
221
222  /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
223 /// by a left shift. Returns the shift amount to use.
224 inline unsigned getThumbImmValShift(unsigned Imm) {
225 // 8-bit (or less) immediates are trivially immediate operand with a shift
226 // of zero.
227 if ((Imm & ~255U) == 0) return 0;
25. Assuming the condition is false
26. Taking false branch
228
229 // Use CTZ to compute the shift amount.
230 return countTrailingZeros(Imm);
27. Calling 'countTrailingZeros<unsigned int>'
34. Returning from 'countTrailingZeros<unsigned int>'
35. Returning the value 32
231 }
232
233 /// isThumbImmShiftedVal - Return true if the specified value can be obtained
234  /// by left shifting an 8-bit immediate.
235 inline bool isThumbImmShiftedVal(unsigned V) {
236 // If this can be handled with
237 V = (~255U << getThumbImmValShift(V)) & V;
24. Calling 'getThumbImmValShift'
36. Returning from 'getThumbImmValShift'
37. The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
238 return V == 0;
239 }
240
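Line 237 above is the expression the analyzer is flagging: along the modeled path, countTrailingZeros is taken to return 32 (its documented result for a zero input), and in C++ shifting a 32-bit value left by 32 is undefined behaviour. Whether a zero value can really reach the shift past the guard at line 227 is a separate question; the sketch below only illustrates the shift-width hazard and one caller-side guard that sidesteps it. It is an illustration with a stand-in ctz32 helper, not the upstream fix:

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Stand-in that mirrors the zero-input convention the checker models for
  // countTrailingZeros: a zero input yields the full bit width, 32.
  static unsigned ctz32(uint32_t V) {
    if (V == 0)
      return 32;
    unsigned N = 0;
    while ((V & 1u) == 0) { V >>= 1; ++N; }
    return N;
  }

  static bool isShiftedImm8(uint32_t V) {
    unsigned Shift = ctz32(V);
    if (Shift >= 32)           // illustrative guard: a 32-bit shift by 32 is UB
      return V == 0;
    return ((~255u << Shift) & V) == 0;
  }

  int main() {
    assert(isShiftedImm8(0x0000FF00u));   // 0xFF shifted left by 8
    assert(!isShiftedImm8(0x0001FF00u));  // needs more than 8 payload bits
    assert(isShiftedImm8(0));             // zero handled without the UB shift
    std::puts("ok");
  }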
241 /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
242 /// by a left shift. Returns the shift amount to use.
243 inline unsigned getThumbImm16ValShift(unsigned Imm) {
244 // 16-bit (or less) immediates are trivially immediate operand with a shift
245 // of zero.
246 if ((Imm & ~65535U) == 0) return 0;
247
248 // Use CTZ to compute the shift amount.
249 return countTrailingZeros(Imm);
250 }
251
252 /// isThumbImm16ShiftedVal - Return true if the specified value can be
253 /// obtained by left shifting a 16-bit immediate.
254 inline bool isThumbImm16ShiftedVal(unsigned V) {
255 // If this can be handled with
256 V = (~65535U << getThumbImm16ValShift(V)) & V;
257 return V == 0;
258 }
259
260 /// getThumbImmNonShiftedVal - If V is a value that satisfies
261  /// isThumbImmShiftedVal, return the non-shifted value.
262 inline unsigned getThumbImmNonShiftedVal(unsigned V) {
263 return V >> getThumbImmValShift(V);
264 }
265
266
267 /// getT2SOImmValSplat - Return the 12-bit encoded representation
268 /// if the specified value can be obtained by splatting the low 8 bits
269 /// into every other byte or every byte of a 32-bit value. i.e.,
270 /// 00000000 00000000 00000000 abcdefgh control = 0
271 /// 00000000 abcdefgh 00000000 abcdefgh control = 1
272 /// abcdefgh 00000000 abcdefgh 00000000 control = 2
273 /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
274 /// Return -1 if none of the above apply.
275 /// See ARM Reference Manual A6.3.2.
276 inline int getT2SOImmValSplatVal(unsigned V) {
277 unsigned u, Vs, Imm;
278 // control = 0
279 if ((V & 0xffffff00) == 0)
280 return V;
281
282 // If the value is zeroes in the first byte, just shift those off
283 Vs = ((V & 0xff) == 0) ? V >> 8 : V;
284 // Any passing value only has 8 bits of payload, splatted across the word
285 Imm = Vs & 0xff;
286 // Likewise, any passing values have the payload splatted into the 3rd byte
287 u = Imm | (Imm << 16);
288
289 // control = 1 or 2
290 if (Vs == u)
291 return (((Vs == V) ? 1 : 2) << 8) | Imm;
292
293 // control = 3
294 if (Vs == (u | (u << 8)))
295 return (3 << 8) | Imm;
296
297 return -1;
298 }
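
As a worked illustration of the four control values, using 0xAB as the payload (a hypothetical test with the same includes as the earlier snippet):

  static void t2SplatExample() {
    using namespace llvm::ARM_AM;
    assert(getT2SOImmValSplatVal(0x000000AB) == 0x0AB);   // control = 0
    assert(getT2SOImmValSplatVal(0x00AB00AB) == 0x1AB);   // control = 1
    assert(getT2SOImmValSplatVal(0xAB00AB00) == 0x2AB);   // control = 2
    assert(getT2SOImmValSplatVal(0xABABABAB) == 0x3AB);   // control = 3
    assert(getT2SOImmValSplatVal(0x00AB0000) == -1);      // not one of the four patterns
  }
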
299
300 /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
301 /// specified value is a rotated 8-bit value. Return -1 if no rotation
302 /// encoding is possible.
303 /// See ARM Reference Manual A6.3.2.
304 inline int getT2SOImmValRotateVal(unsigned V) {
305 unsigned RotAmt = countLeadingZeros(V);
306 if (RotAmt >= 24)
307 return -1;
308
309 // If 'Arg' can be handled with a single shifter_op return the value.
310 if ((rotr32(0xff000000U, RotAmt) & V) == V)
311 return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
312
313 return -1;
314 }
315
316 /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
317 /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
318 /// encoding for it. If not, return -1.
319 /// See ARM Reference Manual A6.3.2.
320 inline int getT2SOImmVal(unsigned Arg) {
321 // If 'Arg' is an 8-bit splat, then get the encoded value.
322 int Splat = getT2SOImmValSplatVal(Arg);
323 if (Splat != -1)
324 return Splat;
325
326 // If 'Arg' can be handled with a single shifter_op return the value.
327 int Rot = getT2SOImmValRotateVal(Arg);
328 if (Rot != -1)
329 return Rot;
330
331 return -1;
332 }
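
A few concrete values for getT2SOImmVal (hypothetical test, same includes as above): 0x00AB00AB is accepted through the splat path, 0x0000FF00 through the rotated 8-bit path (0xFF rotated right by 24), and 0x0000ABCD is rejected, so it has to be built in two steps.

  static void t2SOImmExample() {
    using namespace llvm::ARM_AM;
    assert(getT2SOImmVal(0x00AB00AB) == 0x1AB);   // splat form
    assert(getT2SOImmVal(0x0000FF00) == 0xC7F);   // rotated 8-bit form
    assert(getT2SOImmVal(0x0000ABCD) == -1);      // needs a two-part sequence
  }
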
333
334 inline unsigned getT2SOImmValRotate(unsigned V) {
335 if ((V & ~255U) == 0) return 0;
336 // Use CTZ to compute the rotate amount.
337 unsigned RotAmt = countTrailingZeros(V);
338 return (32 - RotAmt) & 31;
339 }
340
341 inline bool isT2SOImmTwoPartVal(unsigned Imm) {
342 unsigned V = Imm;
343 // Passing values can be any combination of splat values and shifter
344 // values. If this can be handled with a single shifter or splat, bail
345 // out. Those should be handled directly, not with a two-part val.
346 if (getT2SOImmValSplatVal(V) != -1)
347 return false;
348 V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
349 if (V == 0)
350 return false;
351
352 // If this can be handled as an immediate, accept.
353 if (getT2SOImmVal(V) != -1) return true;
354
355 // Likewise, try masking out a splat value first.
356 V = Imm;
357 if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
358 V &= ~0xff00ff00U;
359 else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
360 V &= ~0x00ff00ffU;
361 // If what's left can be handled as an immediate, accept.
362 if (getT2SOImmVal(V) != -1) return true;
363
364 // Otherwise, do not accept.
365 return false;
366 }
367
368 inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
369 assert (isT2SOImmTwoPartVal(Imm) &&
370         "Immedate cannot be encoded as two part immediate!");
371 // Try a shifter operand as one part
372 unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
373 // If the rest is encodable as an immediate, then return it.
374 if (getT2SOImmVal(V) != -1) return V;
375
376 // Try masking out a splat value first.
377 if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
378 return Imm & 0xff00ff00U;
379
380 // The other splat is all that's left as an option.
381 assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
382 return Imm & 0x00ff00ffU;
383 }
384
385 inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
386 // Mask out the first hunk
387 Imm ^= getT2SOImmTwoPartFirst(Imm);
388 // Return what's left
389 assert (getT2SOImmVal(Imm) != -1 &&
390         "Unable to encode second part of T2 two part SO immediate");
391 return Imm;
392 }
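
Continuing the 0x0000ABCD example from above (hypothetical test, same includes as the earlier snippets): the two-part helpers peel off a rotated 8-bit hunk first, leave a plain 8-bit remainder, and the two parts OR back together to the original immediate.

  static void t2TwoPartExample() {
    using namespace llvm::ARM_AM;
    unsigned Imm = 0x0000ABCD;
    assert(isT2SOImmTwoPartVal(Imm));
    unsigned First = getT2SOImmTwoPartFirst(Imm);    // 0x0000AB00
    unsigned Second = getT2SOImmTwoPartSecond(Imm);  // 0x000000CD
    assert(First == 0xAB00 && Second == 0xCD && (First | Second) == Imm);
  }
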
393
394
395 //===--------------------------------------------------------------------===//
396 // Addressing Mode #2
397 //===--------------------------------------------------------------------===//
398 //
399 // This is used for most simple load/store instructions.
400 //
401 // addrmode2 := reg +/- reg shop imm
402 // addrmode2 := reg +/- imm12
403 //
404 // The first operand is always a Reg. The second operand is a reg if in
405 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
406 // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
407 // fourth operand 16-17 encodes the index mode.
408 //
409 // If this addressing mode is a frame index (before prolog/epilog insertion
410 // and code rewriting), this operand will have the form: FI#, reg0, <offs>
411 // with no shift amount for the frame offset.
412 //
413 inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
414 unsigned IdxMode = 0) {
415 assert(Imm12 < (1 << 12) && "Imm too large!");
416 bool isSub = Opc == sub;
417 return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
418 }
419 inline unsigned getAM2Offset(unsigned AM2Opc) {
420 return AM2Opc & ((1 << 12)-1);
421 }
422 inline AddrOpc getAM2Op(unsigned AM2Opc) {
423 return ((AM2Opc >> 12) & 1) ? sub : add;
424 }
425 inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
426 return (ShiftOpc)((AM2Opc >> 13) & 7);
427 }
428 inline unsigned getAM2IdxMode(unsigned AM2Opc) { return (AM2Opc >> 16); }
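
A round trip through the addrmode2 packing (hypothetical test, same includes as the earlier snippets; ARM_AM::sub and ARM_AM::lsl are enumerators declared earlier in this header):

  static void am2Example() {
    using namespace llvm::ARM_AM;
    unsigned Opc = getAM2Opc(sub, 20, lsl);          // default IdxMode = 0
    assert(getAM2Offset(Opc) == 20 && getAM2Op(Opc) == sub);
    assert(getAM2ShiftOpc(Opc) == lsl && getAM2IdxMode(Opc) == 0);
  }
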
429
430 //===--------------------------------------------------------------------===//
431 // Addressing Mode #3
432 //===--------------------------------------------------------------------===//
433 //
434 // This is used for sign-extending loads, and load/store-pair instructions.
435 //
436 // addrmode3 := reg +/- reg
437 // addrmode3 := reg +/- imm8
438 //
439 // The first operand is always a Reg. The second operand is a reg if in
440 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
441 // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
442 // index mode.
443
444 /// getAM3Opc - This function encodes the addrmode3 opc field.
445 inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
446 unsigned IdxMode = 0) {
447 bool isSub = Opc == sub;
448 return ((int)isSub << 8) | Offset | (IdxMode << 9);
449 }
450 inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; }
451 inline AddrOpc getAM3Op(unsigned AM3Opc) {
452 return ((AM3Opc >> 8) & 1) ? sub : add;
453 }
454 inline unsigned getAM3IdxMode(unsigned AM3Opc) { return (AM3Opc >> 9); }
455
456 //===--------------------------------------------------------------------===//
457 // Addressing Mode #4
458 //===--------------------------------------------------------------------===//
459 //
460 // This is used for load / store multiple instructions.
461 //
462 // addrmode4 := reg, <mode>
463 //
464 // The four modes are:
465 // IA - Increment after
466 // IB - Increment before
467 // DA - Decrement after
468 // DB - Decrement before
469 // For VFP instructions, only the IA and DB modes are valid.
470
471 inline AMSubMode getAM4SubMode(unsigned Mode) {
472 return (AMSubMode)(Mode & 0x7);
473 }
474
475 inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; }
476
477 //===--------------------------------------------------------------------===//
478 // Addressing Mode #5
479 //===--------------------------------------------------------------------===//
480 //
481 // This is used for coprocessor instructions, such as FP load/stores.
482 //
483 // addrmode5 := reg +/- imm8*4
484 //
485 // The first operand is always a Reg. The second operand encodes the
486 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
487
488 /// getAM5Opc - This function encodes the addrmode5 opc field.
489 inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
490 bool isSub = Opc == sub;
491 return ((int)isSub << 8) | Offset;
492 }
493 inline unsigned char getAM5Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; }
494 inline AddrOpc getAM5Op(unsigned AM5Opc) {
495 return ((AM5Opc >> 8) & 1) ? sub : add;
496 }
497
498 //===--------------------------------------------------------------------===//
499 // Addressing Mode #5 FP16
500 //===--------------------------------------------------------------------===//
501 //
502 // This is used for coprocessor instructions, such as 16-bit FP load/stores.
503 //
504 // addrmode5fp16 := reg +/- imm8*2
505 //
506 // The first operand is always a Reg. The second operand encodes the
507 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
508
509 /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
510 inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
511 bool isSub = Opc == sub;
512 return ((int)isSub << 8) | Offset;
513 }
514 inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
515 return AM5Opc & 0xFF;
516 }
517 inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
518 return ((AM5Opc >> 8) & 1) ? sub : add;
519 }
520
521 //===--------------------------------------------------------------------===//
522 // Addressing Mode #6
523 //===--------------------------------------------------------------------===//
524 //
525 // This is used for NEON load / store instructions.
526 //
527 // addrmode6 := reg with optional alignment
528 //
529 // This is stored in two operands [regaddr, align]. The first is the
530 // address register. The second operand is the value of the alignment
531 // specifier in bytes or zero if no explicit alignment.
532 // Valid alignments depend on the specific instruction.
533
534 //===--------------------------------------------------------------------===//
535 // NEON/MVE Modified Immediates
536 //===--------------------------------------------------------------------===//
537 //
538 // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate"
539 // vector operand, where a small immediate encoded in the instruction
540 // specifies a full NEON vector value. These modified immediates are
541 // represented here as encoded integers. The low 8 bits hold the immediate
542 // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
543 // the "Cmode" field of the instruction. The interfaces below treat the
544 // Op and Cmode values as a single 5-bit value.
545
546 inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
547 return (OpCmode << 8) | Val;
548 }
549 inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
550 return (ModImm >> 8) & 0x1f;
551 }
552 inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }
553
554 /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
555 /// element value and the element size in bits. (If the element size is
556 /// smaller than the vector, it is splatted into all the elements.)
557 inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
558 unsigned OpCmode = getVMOVModImmOpCmode(ModImm);
559 unsigned Imm8 = getVMOVModImmVal(ModImm);
560 uint64_t Val = 0;
561
562 if (OpCmode == 0xe) {
563 // 8-bit vector elements
564 Val = Imm8;
565 EltBits = 8;
566 } else if ((OpCmode & 0xc) == 0x8) {
567 // 16-bit vector elements
568 unsigned ByteNum = (OpCmode & 0x6) >> 1;
569 Val = Imm8 << (8 * ByteNum);
570 EltBits = 16;
571 } else if ((OpCmode & 0x8) == 0) {
572 // 32-bit vector elements, zero with one byte set
573 unsigned ByteNum = (OpCmode & 0x6) >> 1;
574 Val = Imm8 << (8 * ByteNum);
575 EltBits = 32;
576 } else if ((OpCmode & 0xe) == 0xc) {
577 // 32-bit vector elements, one byte with low bits set
578 unsigned ByteNum = 1 + (OpCmode & 0x1);
579 Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
580 EltBits = 32;
581 } else if (OpCmode == 0x1e) {
582 // 64-bit vector elements
583 for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
584 if ((ModImm >> ByteNum) & 1)
585 Val |= (uint64_t)0xff << (8 * ByteNum);
586 }
587 EltBits = 64;
588 } else {
589 llvm_unreachable("Unsupported VMOV immediate");
590 }
591 return Val;
592 }
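
A few decoded values matching the OpCmode cases above (hypothetical test, same includes as the earlier snippets): 0xe is an 8-bit splat, 0x8 a 16-bit element with the payload in the low byte, and 0xc the 32-bit "one byte with low bits set" form.

  static void vmovModImmExample() {
    using namespace llvm::ARM_AM;
    unsigned EltBits = 0;
    assert(decodeVMOVModImm(createVMOVModImm(0xe, 0x55), EltBits) == 0x55 && EltBits == 8);
    assert(decodeVMOVModImm(createVMOVModImm(0x8, 0x55), EltBits) == 0x55 && EltBits == 16);
    assert(decodeVMOVModImm(createVMOVModImm(0xc, 0x55), EltBits) == 0x55FF && EltBits == 32);
  }
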
593
594 // Generic validation for single-byte immediate (0X00, 00X0, etc).
595 inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
596 assert(Size >= 1 && Size <= 4 && "Invalid size");
597 unsigned count = 0;
598 for (unsigned i = 0; i < Size; ++i) {
599 if (Value & 0xff) count++;
600 Value >>= 8;
601 }
602 return count == 1;
603 }
604
605 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
606 inline bool isNEONi16splat(unsigned Value) {
607 if (Value > 0xffff)
608 return false;
609 // i16 value with set bits only in one byte X0 or 0X.
610 return Value == 0 || isNEONBytesplat(Value, 2);
611 }
612
613 // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
614 inline unsigned encodeNEONi16splat(unsigned Value) {
615 assert(isNEONi16splat(Value) && "Invalid NEON splat value");
616 if (Value >= 0x100)
617 Value = (Value >> 8) | 0xa00;
618 else
619 Value |= 0x800;
620 return Value;
621 }
622
623 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
624 inline bool isNEONi32splat(unsigned Value) {
625 // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
626 return Value == 0 || isNEONBytesplat(Value, 4);
627 }
628
629 /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
630 inline unsigned encodeNEONi32splat(unsigned Value) {
631 assert(isNEONi32splat(Value) && "Invalid NEON splat value");
632 if (Value >= 0x100 && Value <= 0xff00)
633 Value = (Value >> 8) | 0x200;
634 else if (Value > 0xffff && Value <= 0xff0000)
635 Value = (Value >> 16) | 0x400;
636 else if (Value > 0xffffff)
637 Value = (Value >> 24) | 0x600;
638 return Value;
639 }
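
Concrete encodings for the splat helpers (hypothetical test, same includes as the earlier snippets): the i16 form sets bit 11 and uses bit 9 to pick the byte, the i32 form records the byte position in bits 10-9.

  static void neonSplatExample() {
    using namespace llvm::ARM_AM;
    assert(isNEONi16splat(0x00AB) && encodeNEONi16splat(0x00AB) == 0x8AB);
    assert(isNEONi16splat(0xAB00) && encodeNEONi16splat(0xAB00) == 0xAAB);
    assert(isNEONi32splat(0x00AB0000) && encodeNEONi32splat(0x00AB0000) == 0x4AB);
    assert(!isNEONi32splat(0x00AB00AB));   // more than one byte set
  }
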
640
641 //===--------------------------------------------------------------------===//
642 // Floating-point Immediates
643 //
644 inline float getFPImmFloat(unsigned Imm) {
645 // We expect an 8-bit binary encoding of a floating-point number here.
646
647 uint8_t Sign = (Imm >> 7) & 0x1;
648 uint8_t Exp = (Imm >> 4) & 0x7;
649 uint8_t Mantissa = Imm & 0xf;
650
651 // 8-bit FP IEEE Float Encoding
652 // abcd efgh aBbbbbbc defgh000 00000000 00000000
653 //
654 // where B = NOT(b);
655 uint32_t I = 0;
656 I |= Sign << 31;
657 I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
658 I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
659 I |= (Exp & 0x3) << 23;
660 I |= Mantissa << 19;
661 return bit_cast<float>(I);
662 }
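
A few sample decodings (hypothetical test, same includes as the earlier snippets): the all-zero encoding is +2.0, 0x70 is +1.0, the low four bits add sixteenths of the significand, and bit 7 flips the sign.

  static void fpImmDecodeExample() {
    using namespace llvm::ARM_AM;
    assert(getFPImmFloat(0x00) == 2.0f);
    assert(getFPImmFloat(0x70) == 1.0f);
    assert(getFPImmFloat(0x78) == 1.5f);    // +8/16 of the significand
    assert(getFPImmFloat(0xF0) == -1.0f);   // sign bit set
  }
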
663
664 /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
665 /// floating-point value. If the value cannot be represented as an 8-bit
666 /// floating-point value, then return -1.
667 inline int getFP16Imm(const APInt &Imm) {
668 uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
669 int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
670 int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
671
672 // We can handle 4 bits of mantissa.
673 // mantissa = (16+UInt(e:f:g:h))/16.
674 if (Mantissa & 0x3f)
675 return -1;
676 Mantissa >>= 6;
677
678 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
679 if (Exp < -3 || Exp > 4)
680 return -1;
681 Exp = ((Exp+3) & 0x7) ^ 4;
682
683 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
684 }
685
686 inline int getFP16Imm(const APFloat &FPImm) {
687 return getFP16Imm(FPImm.bitcastToAPInt());
688 }
689
690 /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding
691 /// for it. Otherwise return -1 like getFP16Imm.
692 inline int getFP32FP16Imm(const APInt &Imm) {
693 if (Imm.getActiveBits() > 16)
694 return -1;
695 return ARM_AM::getFP16Imm(Imm.trunc(16));
696 }
697
698 inline int getFP32FP16Imm(const APFloat &FPImm) {
699 return getFP32FP16Imm(FPImm.bitcastToAPInt());
700 }
701
702 /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
703 /// floating-point value. If the value cannot be represented as an 8-bit
704 /// floating-point value, then return -1.
705 inline int getFP32Imm(const APInt &Imm) {
706 uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
707 int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
708 int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
709
710 // We can handle 4 bits of mantissa.
711 // mantissa = (16+UInt(e:f:g:h))/16.
712 if (Mantissa & 0x7ffff)
713 return -1;
714 Mantissa >>= 19;
715 if ((Mantissa & 0xf) != Mantissa)
716 return -1;
717
718 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
719 if (Exp < -3 || Exp > 4)
720 return -1;
721 Exp = ((Exp+3) & 0x7) ^ 4;
722
723 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
724 }
725
726 inline int getFP32Imm(const APFloat &FPImm) {
727 return getFP32Imm(FPImm.bitcastToAPInt());
728 }
729
730 /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
731 /// floating-point value. If the value cannot be represented as an 8-bit
732 /// floating-point value, then return -1.
733 inline int getFP64Imm(const APInt &Imm) {
734 uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
735 int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
736 uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
737
738 // We can handle 4 bits of mantissa.
739 // mantissa = (16+UInt(e:f:g:h))/16.
740 if (Mantissa & 0xffffffffffffULL)
741 return -1;
742 Mantissa >>= 48;
743 if ((Mantissa & 0xf) != Mantissa)
744 return -1;
745
746 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
747 if (Exp < -3 || Exp > 4)
748 return -1;
749 Exp = ((Exp+3) & 0x7) ^ 4;
750
751 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
752 }
753
754 inline int getFP64Imm(const APFloat &FPImm) {
755 return getFP64Imm(FPImm.bitcastToAPInt());
756 }
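
The encoders invert that mapping (hypothetical test, same includes as the earlier snippets, using raw IEEE bit patterns so no APFloat construction is needed): 1.0f and 0.5f are representable, 0.1f is not because its mantissa does not fit in four bits.

  static void fpImmEncodeExample() {
    using namespace llvm;
    assert(ARM_AM::getFP32Imm(APInt(32, 0x3F800000)) == 0x70);            // 1.0f
    assert(ARM_AM::getFP32Imm(APInt(32, 0x3F000000)) == 0x60);            // 0.5f
    assert(ARM_AM::getFP32Imm(APInt(32, 0x3DCCCCCD)) == -1);              // 0.1f
    assert(ARM_AM::getFP64Imm(APInt(64, 0x3FF0000000000000ULL)) == 0x70); // 1.0
  }
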
757
758} // end namespace ARM_AM
759} // end namespace llvm
760
761#endif
762

/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
28.1. 'ZB' is not equal to ZB_Undefined
29. Assuming 'Val' is equal to 0
30. Taking true branch
117 return 32;
31. Returning the value 32
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
28. Calling 'TrailingZerosCounter::count'
32. Returning from 'TrailingZerosCounter::count'
33. Returning the value 32
161}
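
This is the zero-input behaviour the report leans on (a hypothetical check, not part of the header): with the default ZB_Width a zero input reports the full type width, and that 32 is exactly the value the path above feeds into the shift at ARMAddressingModes.h line 237.

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  static void ctzExample() {
    assert(llvm::countTrailingZeros(0x8000u) == 15u);
    assert(llvm::countTrailingZeros(0u) == 32u);   // ZB_Width: report the full width
  }
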
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
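
The four mask builders for a 32-bit type, with N = 8 (a hypothetical check, not part of the header):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  static void maskExample() {
    assert(llvm::maskTrailingOnes<uint32_t>(8)  == 0x000000FFu);
    assert(llvm::maskLeadingOnes<uint32_t>(8)   == 0xFF000000u);
    assert(llvm::maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u);
    assert(llvm::maskLeadingZeros<uint32_t>(8)  == 0x00FFFFFFu);
  }
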
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
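
Since these predicates are constexpr, their behaviour can be pinned down at compile time (hypothetical checks, not part of the header): 8-bit signed values run from -128 to 127, 12-bit unsigned values go up to 4095, and isShiftedUInt<8, 2> accepts exactly the 8-bit values shifted left by two.

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isInt<8>(-128) && !llvm::isInt<8>(128), "8-bit signed range");
  static_assert(llvm::isUInt<12>(4095) && !llvm::isUInt<12>(4096), "12-bit unsigned range");
  static_assert(llvm::isShiftedUInt<8, 2>(0x3FC) && !llvm::isShiftedUInt<8, 2>(0x3FE),
                "0x3FC is 0xFF << 2; 0x3FE has a bit inside the shifted-out range");
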
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if an signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
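
Likewise for the mask and power-of-two predicates (hypothetical compile-time checks, not part of the header):

  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isMask_32(0x0000FFFFu) && !llvm::isMask_32(0x0000FF01u),
                "ones must start at bit 0 and be contiguous");
  static_assert(llvm::isShiftedMask_32(0x0000FF00u) && !llvm::isShiftedMask_32(0x00FF00FFu),
                "exactly one contiguous run of ones");
  static_assert(llvm::isPowerOf2_32(0x00100000u) && !llvm::isPowerOf2_32(0u),
                "powers of two are nonzero with a single bit set");
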
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value and is
752/// \p Skew mod \p Align. \p Align must be non-zero
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
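
A few values for the rounding helpers, complementing the alignTo examples in the comment above (a hypothetical check, not part of the header):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  static void alignExample() {
    assert(llvm::alignDown(17, 8) == 16);
    assert(llvm::alignDown(17, 8, /*Skew=*/3) == 11);   // 8 * 1 + 3
    assert(llvm::divideCeil(10, 3) == 4);
    assert(llvm::divideNearest(10, 4) == 3);
  }
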
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
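
Sign extension in both the compile-time-width and runtime-width flavours (hypothetical checks, not part of the header):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  static_assert(llvm::SignExtend32<8>(0xFFu) == -1, "0xFF is -1 as an 8-bit value");
  static_assert(llvm::SignExtend64<12>(0xFFFu) == -1, "0xFFF is -1 as a 12-bit value");
  static void signExtendExample() {
    assert(llvm::SignExtend32(0x80u, 8) == -128);
    assert(llvm::SignExtend32(0x7Fu, 8) == 127);
  }
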
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
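
Saturation clamps to the type's maximum and reports the clamp through the optional flag (a hypothetical check, not part of the header):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  static void saturatingExample() {
    bool Ovf = false;
    assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Ovf) == 255 && Ovf);
    assert(llvm::SaturatingMultiply<uint16_t>(300, 300, &Ovf) == 65535 && Ovf);
    assert(llvm::SaturatingMultiplyAdd<uint32_t>(1000, 1000, 5, &Ovf) == 1000005u && !Ovf);
  }
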
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occured.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow ocurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned addition.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow ocurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
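
These helpers return a nonzero value when the truncated two's-complement result overflowed, and zero otherwise (a hypothetical check, not part of the header; note the return type is T rather than bool):

  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>
  static void overflowExample() {
    int32_t R;
    assert(llvm::AddOverflow<int32_t>(INT32_MAX, 1, R) != 0);
    assert(llvm::SubOverflow<int32_t>(INT32_MIN, 1, R) != 0);
    assert(llvm::MulOverflow<int32_t>(1 << 20, 1 << 12, R) != 0 && R == 0);
    assert(llvm::AddOverflow<int32_t>(2, 3, R) == 0 && R == 5);
  }
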
959
960} // End llvm namespace
961
962#endif