Bug Summary

File: llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
Warning: line 223, column 16
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
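For context: the flagged shift is reached through ARM_AM::isThumbImmShiftedVal (step 18 in the path below), and the diagnostic reports an unsigned int being shifted left by 32, a count equal to the type's width, which is undefined behaviour in C++. The sketch below is a minimal, hypothetical reproduction of this class of bug alongside a guarded alternative. It is not the LLVM code itself; the helpers countTrailingZeros32, isShiftedByteMaskUnsafe and isShiftedByteMaskSafe are illustrative names invented for the example.

#include <cstdint>
#include <iostream>

// Simplified stand-in for a count-trailing-zeros helper; defined to return 32
// for a zero input, matching the shift count the diagnostic reports.
static unsigned countTrailingZeros32(uint32_t V) {
  if (V == 0)
    return 32;
  unsigned N = 0;
  while ((V & 1u) == 0) {
    V >>= 1;
    ++N;
  }
  return N;
}

// Unsafe pattern: when Shift is 32, '~255u << Shift' shifts an unsigned int by
// its full width, which is undefined behaviour (the situation the warning describes).
static bool isShiftedByteMaskUnsafe(uint32_t V) {
  unsigned Shift = countTrailingZeros32(V);
  return ((~255u << Shift) & V) == 0;
}

// Guarded alternative: handle the over-wide shift count before shifting.
static bool isShiftedByteMaskSafe(uint32_t V) {
  unsigned Shift = countTrailingZeros32(V);
  if (Shift >= 32)
    return V == 0; // nothing above the low byte to check; avoids the undefined shift
  return ((~255u << Shift) & V) == 0;
}

int main() {
  std::cout << isShiftedByteMaskUnsafe(0x00ff0000u) << '\n'; // 1: an 8-bit value shifted left (nonzero input, no UB)
  std::cout << isShiftedByteMaskSafe(0x01ff0000u) << '\n';   // 0: needs more than 8 bits
  std::cout << isShiftedByteMaskSafe(0u) << '\n';            // 1: zero input, guarded, no undefined shift
  return 0;
}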

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ARMTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/ARM -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/ARM -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/CallSite.h"
20#include "llvm/IR/DataLayout.h"
21#include "llvm/IR/DerivedTypes.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/PatternMatch.h"
26#include "llvm/IR/Type.h"
27#include "llvm/MC/SubtargetFeature.h"
28#include "llvm/Support/Casting.h"
29#include "llvm/Support/MachineValueType.h"
30#include "llvm/Target/TargetMachine.h"
31#include <algorithm>
32#include <cassert>
33#include <cstdint>
34#include <utility>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "armtti"
39
40static cl::opt<bool> EnableMaskedLoadStores(
41 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
42 cl::desc("Enable the generation of masked loads and stores"));
43
44static cl::opt<bool> DisableLowOverheadLoops(
45 "disable-arm-loloops", cl::Hidden, cl::init(false),
46 cl::desc("Disable the generation of low-overhead loops"));
47
48extern cl::opt<bool> DisableTailPredication;
49
50extern cl::opt<bool> EnableMaskedGatherScatters;
51
52bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
53 const Function *Callee) const {
54 const TargetMachine &TM = getTLI()->getTargetMachine();
55 const FeatureBitset &CallerBits =
56 TM.getSubtargetImpl(*Caller)->getFeatureBits();
57 const FeatureBitset &CalleeBits =
58 TM.getSubtargetImpl(*Callee)->getFeatureBits();
59
60 // To inline a callee, all features not in the whitelist must match exactly.
61 bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
62 (CalleeBits & ~InlineFeatureWhitelist);
63 // For features in the whitelist, the callee's features must be a subset of
64 // the callers'.
65 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
66 (CalleeBits & InlineFeatureWhitelist);
67 return MatchExact && MatchSubset;
68}
69
70bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
71 if (L->getHeader()->getParent()->hasOptSize())
72 return false;
73 if (ST->hasMVEIntegerOps())
74 return false;
75 return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
76}
77
78bool ARMTTIImpl::shouldFavorPostInc() const {
79 if (ST->hasMVEIntegerOps())
80 return true;
81 return false;
82}
83
84int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
85 assert(Ty->isIntegerTy());
9
'?' condition is true
86
87 unsigned Bits = Ty->getPrimitiveSizeInBits();
88 if (Bits == 0 || Imm.getActiveBits() >= 64)
10
Assuming 'Bits' is not equal to 0
11
Assuming the condition is false
12
Taking false branch
89 return 4;
90
91 int64_t SImmVal = Imm.getSExtValue();
92 uint64_t ZImmVal = Imm.getZExtValue();
93 if (!ST->isThumb()) {
13
Taking false branch
94 if ((SImmVal >= 0 && SImmVal < 65536) ||
95 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
96 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
97 return 1;
98 return ST->hasV6T2Ops() ? 2 : 3;
99 }
100 if (ST->isThumb2()) {
14
Taking false branch
101 if ((SImmVal >= 0 && SImmVal < 65536) ||
102 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
103 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
104 return 1;
105 return ST->hasV6T2Ops() ? 2 : 3;
106 }
107 // Thumb1, any i8 imm cost 1.
108 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
15
Assuming 'Bits' is not equal to 8
16
Assuming 'SImmVal' is < 0
109 return 1;
110 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
17
Assuming the condition is false
18
Calling 'isThumbImmShiftedVal'
111 return 2;
112 // Load from constantpool.
113 return 3;
114}
115
116// Constants smaller than 256 fit in the immediate field of
117// Thumb1 instructions so we return a zero cost and 1 otherwise.
118int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
119 const APInt &Imm, Type *Ty) {
120 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
121 return 0;
122
123 return 1;
124}
125
126int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
127 Type *Ty) {
128 // Division by a constant can be turned into multiplication, but only if we
129 // know it's constant. So it's not so much that the immediate is cheap (it's
130 // not), but that the alternative is worse.
131 // FIXME: this is probably unneeded with GlobalISel.
132 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
1
Assuming 'Opcode' is not equal to SDiv
2
Assuming 'Opcode' is not equal to UDiv
133 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
3
Assuming 'Opcode' is not equal to SRem
4
Assuming 'Opcode' is not equal to URem
134 Idx == 1)
135 return 0;
136
137 if (Opcode == Instruction::And) {
5
Assuming 'Opcode' is equal to And
6
Taking true branch
138 // UXTB/UXTH
139 if (Imm == 255 || Imm == 65535)
7
Taking false branch
140 return 0;
141 // Conversion to BIC is free, and means we can use ~Imm instead.
142 return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
8
Calling 'ARMTTIImpl::getIntImmCost'
143 }
144
145 if (Opcode == Instruction::Add)
146 // Conversion to SUB is free, and means we can use -Imm instead.
147 return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
148
149 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
150 Ty->getIntegerBitWidth() == 32) {
151 int64_t NegImm = -Imm.getSExtValue();
152 if (ST->isThumb2() && NegImm < 1<<12)
153 // icmp X, #-C -> cmn X, #C
154 return 0;
155 if (ST->isThumb() && NegImm < 1<<8)
156 // icmp X, #-C -> adds X, #C
157 return 0;
158 }
159
160 // xor a, -1 can always be folded to MVN
161 if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
162 return 0;
163
164 return getIntImmCost(Imm, Ty);
165}
166
167int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
168 const Instruction *I) {
169 int ISD = TLI->InstructionOpcodeToISD(Opcode);
170 assert(ISD && "Invalid opcode");
171
172 // Single to/from double precision conversions.
173 static const CostTblEntry NEONFltDblTbl[] = {
174 // Vector fptrunc/fpext conversions.
175 { ISD::FP_ROUND, MVT::v2f64, 2 },
176 { ISD::FP_EXTEND, MVT::v2f32, 2 },
177 { ISD::FP_EXTEND, MVT::v4f32, 4 }
178 };
179
180 if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
181 ISD == ISD::FP_EXTEND)) {
182 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
183 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
184 return LT.first * Entry->Cost;
185 }
186
187 EVT SrcTy = TLI->getValueType(DL, Src);
188 EVT DstTy = TLI->getValueType(DL, Dst);
189
190 if (!SrcTy.isSimple() || !DstTy.isSimple())
191 return BaseT::getCastInstrCost(Opcode, Dst, Src);
192
193 // The extend of a load is free
194 if (I && isa<LoadInst>(I->getOperand(0))) {
195 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
196 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
197 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
198 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
199 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
200 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
201 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
202 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
203 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
204 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
205 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
206 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
207 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
208 };
209 if (const auto *Entry = ConvertCostTableLookup(
210 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
211 return Entry->Cost;
212
213 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
214 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
215 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
216 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
217 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
218 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
219 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
220 };
221 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
222 if (const auto *Entry =
223 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
224 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
225 return Entry->Cost;
226 }
227 }
228
229 // Some arithmetic, load and store operations have specific instructions
230 // to cast up/down their types automatically at no extra cost.
231 // TODO: Get these tables to know at least what the related operations are.
232 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
233 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
234 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
235 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
236 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
237 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
238 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
239
240 // The number of vmovl instructions for the extension.
241 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
242 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
243 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
244 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
245 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
246 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
247 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
248 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
249 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
250 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
251
252 // Operations that we legalize using splitting.
253 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
254 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
255
256 // Vector float <-> i32 conversions.
257 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
258 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
259
260 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
261 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
262 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
263 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
264 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
265 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
266 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
267 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
268 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
269 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
270 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
271 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
272 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
273 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
274 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
275 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
276 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
277 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
278 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
279 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
280
281 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
282 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
283 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
284 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
285 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
286 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
287
288 // Vector double <-> i32 conversions.
289 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
290 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
291
292 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
293 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
294 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
295 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
296 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
297 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
298
299 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
300 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
301 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
302 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
303 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
304 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
305 };
306
307 if (SrcTy.isVector() && ST->hasNEON()) {
308 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
309 DstTy.getSimpleVT(),
310 SrcTy.getSimpleVT()))
311 return Entry->Cost;
312 }
313
314 // Scalar float to integer conversions.
315 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
316 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
317 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
318 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
319 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
320 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
321 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
322 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
323 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
324 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
325 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
326 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
327 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
328 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
329 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
330 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
331 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
332 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
333 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
334 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
335 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
336 };
337 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
338 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
339 DstTy.getSimpleVT(),
340 SrcTy.getSimpleVT()))
341 return Entry->Cost;
342 }
343
344 // Scalar integer to float conversions.
345 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
346 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
347 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
348 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
349 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
350 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
351 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
352 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
353 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
354 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
355 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
356 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
357 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
358 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
359 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
360 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
361 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
362 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
363 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
364 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
365 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
366 };
367
368 if (SrcTy.isInteger() && ST->hasNEON()) {
369 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
370 ISD, DstTy.getSimpleVT(),
371 SrcTy.getSimpleVT()))
372 return Entry->Cost;
373 }
374
375 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
376 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
377 // are linearised so take more.
378 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
379 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
380 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
381 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
382 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
383 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
384 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
385 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
386 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
387 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
388 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
389 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
390 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
391 };
392
393 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
394 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
395 ISD, DstTy.getSimpleVT(),
396 SrcTy.getSimpleVT()))
397 return Entry->Cost * ST->getMVEVectorCostFactor();
398 }
399
400 // Scalar integer conversion costs.
401 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
402 // i16 -> i64 requires two dependent operations.
403 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
404
405 // Truncates on i64 are assumed to be free.
406 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
407 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
408 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
409 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
410 };
411
412 if (SrcTy.isInteger()) {
413 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
414 DstTy.getSimpleVT(),
415 SrcTy.getSimpleVT()))
416 return Entry->Cost;
417 }
418
419 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
420 ? ST->getMVEVectorCostFactor()
421 : 1;
422 return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
423}
424
425int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
426 unsigned Index) {
427 // Penalize inserting into a D-subregister. We end up with a three times
428 // lower estimated throughput on swift.
429 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
430 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
431 return 3;
432
433 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
434 Opcode == Instruction::ExtractElement)) {
435 // Cross-class copies are expensive on many microarchitectures,
436 // so assume they are expensive by default.
437 if (ValTy->getVectorElementType()->isIntegerTy())
438 return 3;
439
440 // Even if it's not a cross class copy, this likely leads to mixing
441 // of NEON and VFP code and should be therefore penalized.
442 if (ValTy->isVectorTy() &&
443 ValTy->getScalarSizeInBits() <= 32)
444 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
445 }
446
447 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
448 Opcode == Instruction::ExtractElement)) {
449 // We say MVE moves costs at least the MVEVectorCostFactor, even though
450 // they are scalar instructions. This helps prevent mixing scalar and
451 // vector, to prevent vectorising where we end up just scalarising the
452 // result anyway.
453 return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
454 ST->getMVEVectorCostFactor()) *
455 ValTy->getVectorNumElements() / 2;
456 }
457
458 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
459}
460
461int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
462 const Instruction *I) {
463 int ISD = TLI->InstructionOpcodeToISD(Opcode);
464 // On NEON a vector select gets lowered to vbsl.
465 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
466 // Lowering of some vector selects is currently far from perfect.
467 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
468 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
469 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
470 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
471 };
472
473 EVT SelCondTy = TLI->getValueType(DL, CondTy);
474 EVT SelValTy = TLI->getValueType(DL, ValTy);
475 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
476 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
477 SelCondTy.getSimpleVT(),
478 SelValTy.getSimpleVT()))
479 return Entry->Cost;
480 }
481
482 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
483 return LT.first;
484 }
485
486 int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
487 ? ST->getMVEVectorCostFactor()
488 : 1;
489 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
490}
491
492int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
493 const SCEV *Ptr) {
494 // Address computations in vectorized code with non-consecutive addresses will
495 // likely result in more instructions compared to scalar code where the
496 // computation can more often be merged into the index mode. The resulting
497 // extra micro-ops can significantly decrease throughput.
498 unsigned NumVectorInstToHideOverhead = 10;
499 int MaxMergeDistance = 64;
500
501 if (ST->hasNEON()) {
502 if (Ty->isVectorTy() && SE &&
503 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
504 return NumVectorInstToHideOverhead;
505
506 // In many cases the address computation is not merged into the instruction
507 // addressing mode.
508 return 1;
509 }
510 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
511}
512
513bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
514 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
515 return false;
516
517 if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
518 // Don't support v2i1 yet.
519 if (VecTy->getNumElements() == 2)
520 return false;
521
522 // We don't support extending fp types.
523 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
524 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
525 return false;
526 }
527
528 unsigned EltWidth = DataTy->getScalarSizeInBits();
529 return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
530 (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
531 (EltWidth == 8);
532}
533
534bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
535 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
536 return false;
537
538 // This method is called in 2 places:
539 // - from the vectorizer with a scalar type, in which case we need to get
540 // this as good as we can with the limited info we have (and rely on the cost
541 // model for the rest).
542 // - from the masked intrinsic lowering pass with the actual vector type.
543 // For MVE, we have a custom lowering pass that will already have custom
544 // legalised any gathers that we can to MVE intrinsics, and want to expand all
545 // the rest. The pass runs before the masked intrinsic lowering pass, so if we
546 // are here, we know we want to expand.
547 if (isa<VectorType>(Ty))
548 return false;
549
550 unsigned EltWidth = Ty->getScalarSizeInBits();
551 return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
552 (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
553}
554
555int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
556 const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
557 assert(MI && "MemcpyInst expected");
558 ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
559
560 // To model the cost of a library call, we assume 1 for the call, and
561 // 3 for the argument setup.
562 const unsigned LibCallCost = 4;
563
564 // If 'size' is not a constant, a library call will be generated.
565 if (!C)
566 return LibCallCost;
567
568 const unsigned Size = C->getValue().getZExtValue();
569 const Align DstAlign = *MI->getDestAlign();
570 const Align SrcAlign = *MI->getSourceAlign();
571 const Function *F = I->getParent()->getParent();
572 const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
573 std::vector<EVT> MemOps;
574
575 // MemOps will be populated with a list of data types that need to be
576 // loaded and stored. That's why we multiply the number of elements by 2 to
577 // get the cost for this memcpy.
578 if (getTLI()->findOptimalMemOpLowering(
579 MemOps, Limit,
580 MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
581 /*IsVolatile*/ true),
582 MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
583 F->getAttributes()))
584 return MemOps.size() * 2;
585
586 // If we can't find an optimal memop lowering, return the default cost
587 return LibCallCost;
588}
589
590int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
591 Type *SubTp) {
592 if (ST->hasNEON()) {
593 if (Kind == TTI::SK_Broadcast) {
594 static const CostTblEntry NEONDupTbl[] = {
595 // VDUP handles these cases.
596 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
597 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
598 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
599 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
600 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
601 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
602
603 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
604 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
605 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
606 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
607
608 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
609
610 if (const auto *Entry =
611 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
612 return LT.first * Entry->Cost;
613 }
614 if (Kind == TTI::SK_Reverse) {
615 static const CostTblEntry NEONShuffleTbl[] = {
616 // Reverse shuffle cost one instruction if we are shuffling within a
617 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
618 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
619 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
620 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
621 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
622 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
623 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
624
625 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
626 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
627 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
628 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
629
630 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
631
632 if (const auto *Entry =
633 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
634 return LT.first * Entry->Cost;
635 }
636 if (Kind == TTI::SK_Select) {
637 static const CostTblEntry NEONSelShuffleTbl[] = {
638 // Select shuffle cost table for ARM. Cost is the number of
639 // instructions
640 // required to create the shuffled vector.
641
642 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
643 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
644 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
645 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
646
647 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
648 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
649 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
650
651 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
652
653 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
654
655 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
656 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
657 ISD::VECTOR_SHUFFLE, LT.second))
658 return LT.first * Entry->Cost;
659 }
660 }
661 if (ST->hasMVEIntegerOps()) {
662 if (Kind == TTI::SK_Broadcast) {
663 static const CostTblEntry MVEDupTbl[] = {
664 // VDUP handles these cases.
665 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
666 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
667 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
668 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
669 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
670
671 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
672
673 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
674 LT.second))
675 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
676 }
677 }
678 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
679 ? ST->getMVEVectorCostFactor()
680 : 1;
681 return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
682}
683
684int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
685 TTI::OperandValueKind Op1Info,
686 TTI::OperandValueKind Op2Info,
687 TTI::OperandValueProperties Opd1PropInfo,
688 TTI::OperandValueProperties Opd2PropInfo,
689 ArrayRef<const Value *> Args,
690 const Instruction *CxtI) {
691 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
692 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
693
694 if (ST->hasNEON()) {
695 const unsigned FunctionCallDivCost = 20;
696 const unsigned ReciprocalDivCost = 10;
697 static const CostTblEntry CostTbl[] = {
698 // Division.
699 // These costs are somewhat random. Choose a cost of 20 to indicate that
700 // vectorizing division (added function call) is going to be very expensive.
701 // Double registers types.
702 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
703 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
704 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
705 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
706 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
707 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
708 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
709 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
710 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
711 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
712 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
713 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
714 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
715 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
716 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
717 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
718 // Quad register types.
719 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
720 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
721 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
722 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
723 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
724 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
725 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
726 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
727 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
728 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
729 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
730 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
731 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
732 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
733 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
734 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
735 // Multiplication.
736 };
737
738 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
739 return LT.first * Entry->Cost;
740
741 int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
742 Opd1PropInfo, Opd2PropInfo);
743
744 // This is somewhat of a hack. The problem that we are facing is that SROA
745 // creates a sequence of shift, and, or instructions to construct values.
746 // These sequences are recognized by the ISel and have zero-cost. Not so for
747 // the vectorized code. Because we have support for v2i64 but not i64 those
748 // sequences look particularly beneficial to vectorize.
749 // To work around this we increase the cost of v2i64 operations to make them
750 // seem less beneficial.
751 if (LT.second == MVT::v2i64 &&
752 Op2Info == TargetTransformInfo::OK_UniformConstantValue)
753 Cost += 4;
754
755 return Cost;
756 }
757
758 // If this operation is a shift on arm/thumb2, it might well be folded into
759 // the following instruction, hence having a cost of 0.
760 auto LooksLikeAFreeShift = [&]() {
761 if (ST->isThumb1Only() || Ty->isVectorTy())
762 return false;
763
764 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
765 return false;
766 if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
767 return false;
768
769 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
770 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
771 case Instruction::Add:
772 case Instruction::Sub:
773 case Instruction::And:
774 case Instruction::Xor:
775 case Instruction::Or:
776 case Instruction::ICmp:
777 return true;
778 default:
779 return false;
780 }
781 };
782 if (LooksLikeAFreeShift())
783 return 0;
784
785 int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
786 ? ST->getMVEVectorCostFactor()
787 : 1;
788
789 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
790 // without treating floats as more expensive than scalars or increasing the
791 // costs for custom operations. The result is also multiplied by the
792 // MVEVectorCostFactor where appropriate.
793 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
794 return LT.first * BaseCost;
795
796 // Else this is expand, assume that we need to scalarize this op.
797 if (Ty->isVectorTy()) {
798 unsigned Num = Ty->getVectorNumElements();
799 unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
800 // Return the cost of multiple scalar invocation plus the cost of
801 // inserting and extracting the values.
802 return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
803 }
804
805 return BaseCost;
806}
807
808int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
809 MaybeAlign Alignment, unsigned AddressSpace,
810 const Instruction *I) {
811 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
812
813 if (ST->hasNEON() && Src->isVectorTy() &&
814 (Alignment && *Alignment != Align(16)) &&
815 Src->getVectorElementType()->isDoubleTy()) {
816 // Unaligned loads/stores are extremely inefficient.
817 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
818 return LT.first * 4;
819 }
820 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
821 ? ST->getMVEVectorCostFactor()
822 : 1;
823 return BaseCost * LT.first;
824}
825
826int ARMTTIImpl::getInterleavedMemoryOpCost(
827 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
828 unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
829 bool UseMaskForGaps) {
830 assert(Factor >= 2 && "Invalid interleave factor");
831 assert(isa<VectorType>(VecTy) && "Expect a vector type");
832
833 // vldN/vstN doesn't support vector types of i64/f64 element.
834 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
835
836 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
837 !UseMaskForCond && !UseMaskForGaps) {
838 unsigned NumElts = VecTy->getVectorNumElements();
839 auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
840
841 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
842 // Accesses having vector types that are a multiple of 128 bits can be
843 // matched to more than one vldN/vstN instruction.
844 int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
845 if (NumElts % Factor == 0 &&
846 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
847 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
848
849 // Some smaller than legal interleaved patterns are cheap as we can make
850 // use of the vmovn or vrev patterns to interleave a standard load. This is
851 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
852 // promoted differently). The cost of 2 here is then a load and vrev or
853 // vmovn.
854 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
855 VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
856 return 2 * BaseCost;
857 }
858
859 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
860 Alignment, AddressSpace,
861 UseMaskForCond, UseMaskForGaps);
862}
863
864unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
865 Value *Ptr, bool VariableMask,
866 unsigned Alignment) {
867 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
868 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
869 Alignment);
870
871 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
872 VectorType *VTy = cast<VectorType>(DataTy);
873
874 // TODO: Splitting, once we do that.
875 // TODO: trunc/sext/zext the result/input
876
877 unsigned NumElems = VTy->getNumElements();
878 unsigned EltSize = VTy->getScalarSizeInBits();
879 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
880
881 // For now, it is assumed that for the MVE gather instructions the loads are
882 // all effectively serialised. This means the cost is the scalar cost
883 // multiplied by the number of elements being loaded. This is possibly very
884 // conservative, but even so we still end up vectorising loops because the
885 // cost per iteration for many loops is lower than for scalar loops.
886 unsigned VectorCost = NumElems * LT.first;
887 // The scalarization cost should be a lot higher. We use the number of vector
888 // elements plus the scalarization overhead.
889 unsigned ScalarCost =
890 NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {});
891
892 // TODO: Cost extended gathers or trunc stores correctly.
893 if (EltSize * NumElems != 128 || NumElems < 4)
894 return ScalarCost;
895 if (Alignment < EltSize / 8)
896 return ScalarCost;
897
898 // Any (aligned) i32 gather will not need to be scalarised.
899 if (EltSize == 32)
900 return VectorCost;
901 // For smaller types, we need to ensure that the gep's inputs are correctly
902 // extended from a small enough value. Other size (including i64) are
903 // scalarized for now.
904 if (EltSize != 8 && EltSize != 16)
905 return ScalarCost;
906
907 if (auto BC = dyn_cast<BitCastInst>(Ptr))
908 Ptr = BC->getOperand(0);
909 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
910 if (GEP->getNumOperands() != 2)
911 return ScalarCost;
912 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
913 // Scale needs to be correct (which is only relevant for i16s).
914 if (Scale != 1 && Scale * 8 != EltSize)
915 return ScalarCost;
916 // And we need to zext (not sext) the indexes from a small enough type.
917 if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1)))
918 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize)
919 return VectorCost;
920 return ScalarCost;
921 }
922 return ScalarCost;
923}
924
925bool ARMTTIImpl::isLoweredToCall(const Function *F) {
926 if (!F->isIntrinsic())
927 BaseT::isLoweredToCall(F);
928
929 // Assume all Arm-specific intrinsics map to an instruction.
930 if (F->getName().startswith("llvm.arm"))
931 return false;
932
933 switch (F->getIntrinsicID()) {
934 default: break;
935 case Intrinsic::powi:
936 case Intrinsic::sin:
937 case Intrinsic::cos:
938 case Intrinsic::pow:
939 case Intrinsic::log:
940 case Intrinsic::log10:
941 case Intrinsic::log2:
942 case Intrinsic::exp:
943 case Intrinsic::exp2:
944 return true;
945 case Intrinsic::sqrt:
946 case Intrinsic::fabs:
947 case Intrinsic::copysign:
948 case Intrinsic::floor:
949 case Intrinsic::ceil:
950 case Intrinsic::trunc:
951 case Intrinsic::rint:
952 case Intrinsic::nearbyint:
953 case Intrinsic::round:
954 case Intrinsic::canonicalize:
955 case Intrinsic::lround:
956 case Intrinsic::llround:
957 case Intrinsic::lrint:
958 case Intrinsic::llrint:
959 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
960 return true;
961 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
962 return true;
963 // Some operations can be handled by vector instructions and assume
964 // unsupported vectors will be expanded into supported scalar ones.
965 // TODO Handle scalar operations properly.
966 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
967 case Intrinsic::masked_store:
968 case Intrinsic::masked_load:
969 case Intrinsic::masked_gather:
970 case Intrinsic::masked_scatter:
971 return !ST->hasMVEIntegerOps();
972 case Intrinsic::sadd_with_overflow:
973 case Intrinsic::uadd_with_overflow:
974 case Intrinsic::ssub_with_overflow:
975 case Intrinsic::usub_with_overflow:
976 case Intrinsic::sadd_sat:
977 case Intrinsic::uadd_sat:
978 case Intrinsic::ssub_sat:
979 case Intrinsic::usub_sat:
980 return false;
981 }
982
983 return BaseT::isLoweredToCall(F);
984}
985
986bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
987 AssumptionCache &AC,
988 TargetLibraryInfo *LibInfo,
989 HardwareLoopInfo &HWLoopInfo) {
990 // Low-overhead branches are only supported in the 'low-overhead branch'
991 // extension of v8.1-m.
992 if (!ST->hasLOB() || DisableLowOverheadLoops)
993 return false;
994
995 if (!SE.hasLoopInvariantBackedgeTakenCount(L))
996 return false;
997
998 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
999 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1000 return false;
1001
1002 const SCEV *TripCountSCEV =
1003 SE.getAddExpr(BackedgeTakenCount,
1004 SE.getOne(BackedgeTakenCount->getType()));
1005
1006 // We need to store the trip count in LR, a 32-bit register.
1007 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
1008 return false;
1009
1010 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1011 // point in generating a hardware loop if that's going to happen.
1012 auto MaybeCall = [this](Instruction &I) {
1013 const ARMTargetLowering *TLI = getTLI();
1014 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1015 EVT VT = TLI->getValueType(DL, I.getType(), true);
1016 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1017 return true;
1018
1019 // Check if an intrinsic will be lowered to a call and assume that any
1020 // other CallInst will generate a bl.
1021 if (auto *Call = dyn_cast<CallInst>(&I)) {
1022 if (isa<IntrinsicInst>(Call)) {
1023 if (const Function *F = Call->getCalledFunction())
1024 return isLoweredToCall(F);
1025 }
1026 return true;
1027 }
1028
1029 // FPv5 provides conversions between integer, double-precision,
1030 // single-precision, and half-precision formats.
1031 switch (I.getOpcode()) {
1032 default:
1033 break;
1034 case Instruction::FPToSI:
1035 case Instruction::FPToUI:
1036 case Instruction::SIToFP:
1037 case Instruction::UIToFP:
1038 case Instruction::FPTrunc:
1039 case Instruction::FPExt:
1040 return !ST->hasFPARMv8Base();
1041 }
1042
1043 // FIXME: Unfortunately the approach of checking the Operation Action does
1044 // not catch all cases of Legalization that use library calls. Our
1045 // Legalization step categorizes some transformations into library calls as
1046 // Custom, Expand or even Legal when doing type legalization. So for now
1047 // we have to special case for instance the SDIV of 64bit integers and the
1048 // use of floating point emulation.
1049 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1050 switch (ISD) {
1051 default:
1052 break;
1053 case ISD::SDIV:
1054 case ISD::UDIV:
1055 case ISD::SREM:
1056 case ISD::UREM:
1057 case ISD::SDIVREM:
1058 case ISD::UDIVREM:
1059 return true;
1060 }
1061 }
1062
1063 // Assume all other non-float operations are supported.
1064 if (!VT.isFloatingPoint())
1065 return false;
1066
1067 // We'll need a library call to handle most floats when using soft.
1068 if (TLI->useSoftFloat()) {
1069 switch (I.getOpcode()) {
1070 default:
1071 return true;
1072 case Instruction::Alloca:
1073 case Instruction::Load:
1074 case Instruction::Store:
1075 case Instruction::Select:
1076 case Instruction::PHI:
1077 return false;
1078 }
1079 }
1080
1081 // We'll need a libcall to perform double precision operations on a single
1082 // precision only FPU.
1083 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1084 return true;
1085
1086 // Likewise for half precision arithmetic.
1087 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1088 return true;
1089
1090 return false;
1091 };
1092
1093 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1094 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1095 switch (Call->getIntrinsicID()) {
1096 default:
1097 break;
1098 case Intrinsic::set_loop_iterations:
1099 case Intrinsic::test_set_loop_iterations:
1100 case Intrinsic::loop_decrement:
1101 case Intrinsic::loop_decrement_reg:
1102 return true;
1103 }
1104 }
1105 return false;
1106 };
1107
1108 // Scan the instructions to see if there's any that we know will turn into a
1109 // call or if this loop is already a low-overhead loop.
1110 auto ScanLoop = [&](Loop *L) {
1111 for (auto *BB : L->getBlocks()) {
1112 for (auto &I : *BB) {
1113 if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
1114 return false;
1115 }
1116 }
1117 return true;
1118 };
1119
1120 // Visit inner loops.
1121 for (auto Inner : *L)
1122 if (!ScanLoop(Inner))
1123 return false;
1124
1125 if (!ScanLoop(L))
1126 return false;
1127
1128 // TODO: Check whether the trip count calculation is expensive. If L is the
1129 // inner loop but we know it has a low trip count, calculating that trip
1130 // count (in the parent loop) may be detrimental.
1131
1132 LLVMContext &C = L->getHeader()->getContext();
1133 HWLoopInfo.CounterInReg = true;
1134 HWLoopInfo.IsNestingLegal = false;
1135 HWLoopInfo.PerformEntryTest = true;
1136 HWLoopInfo.CountType = Type::getInt32Ty(C);
1137 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1138 return true;
1139}
1140
1141static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1142 // We don't allow icmp's, and because we only look at single block loops,
1143 // we simply count the icmps, i.e. there should only be 1 for the backedge.
1144 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1145 return false;
1146
1147 if (isa<FCmpInst>(&I))
1148 return false;
1149
1150 // We could allow extending/narrowing FP loads/stores, but codegen is
1151 // too inefficient so reject this for now.
1152 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1153 return false;
1154
1155 // Extends have to be extending-loads
1156 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1157 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1158 return false;
1159
1160 // Truncs have to be narrowing-stores
1161 if (isa<TruncInst>(&I) )
1162 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1163 return false;
1164
1165 return true;
1166}
1167
1168// To set up a tail-predicated loop, we need to know the total number of
1169// elements processed by that loop. Thus, we need to determine the element
1170// size and:
1171// 1) it should be uniform for all operations in the vector loop, so we
1172// e.g. don't want any widening/narrowing operations.
1173// 2) it should be smaller than i64s because we don't have vector operations
1174// that work on i64s.
1175// 3) we don't want elements to be reversed or shuffled, to make sure the
1176// tail-predication masks/predicates the right lanes.
1177//
1178static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1179 const DataLayout &DL,
1180 const LoopAccessInfo *LAI) {
1181 PredicatedScalarEvolution PSE = LAI->getPSE();
1182 int ICmpCount = 0;
1183 int Stride = 0;
1184
1185 LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
1186 SmallVector<Instruction *, 16> LoadStores;
1187 for (BasicBlock *BB : L->blocks()) {
1188 for (Instruction &I : BB->instructionsWithoutDebug()) {
1189 if (isa<PHINode>(&I))
1190 continue;
1191 if (!canTailPredicateInstruction(I, ICmpCount)) {
1192 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
1193 return false;
1194 }
1195
1196 Type *T = I.getType();
1197 if (T->isPointerTy())
1198 T = T->getPointerElementType();
1199
1200 if (T->getScalarSizeInBits() > 32) {
1201 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
1202 return false;
1203 }
1204
1205 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
1206 Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
1207 int64_t NextStride = getPtrStride(PSE, Ptr, L);
1208 // TODO: for now only allow consecutive strides of 1. We could support
1209 // other strides as long as it is uniform, but let's keep it simple for
1210 // now.
1211 if (Stride == 0 && NextStride == 1) {
1212 Stride = NextStride;
1213 continue;
1214 }
1215 if (Stride != NextStride) {
1216 LLVM_DEBUG(dbgs() << "Different strides found, can't "
1217 "tail-predicate\n.");
1218 return false;
1219 }
1220 }
1221 }
1222 }
1223
1224 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
1225 return true;
1226}
1227
1228bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
1229 ScalarEvolution &SE,
1230 AssumptionCache &AC,
1231 TargetLibraryInfo *TLI,
1232 DominatorTree *DT,
1233 const LoopAccessInfo *LAI) {
1234 if (DisableTailPredication)
1235 return false;
1236
1237 // Creating a predicated vector loop is the first step for generating a
1238 // tail-predicated hardware loop, for which we need the MVE masked
1239 // load/stores instructions:
1240 if (!ST->hasMVEIntegerOps())
1241 return false;
1242
1243 // For now, restrict this to single block loops.
1244 if (L->getNumBlocks() > 1) {
1245 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
1246 "loop.\n");
1247 return false;
1248 }
1249
1250 assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
1251
1252 HardwareLoopInfo HWLoopInfo(L);
1253 if (!HWLoopInfo.canAnalyze(*LI)) {
1254 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1255 "analyzable.\n");
1256 return false;
1257 }
1258
1259 // This checks if we have the low-overhead branch architecture
1260 // extension, and if we will create a hardware-loop:
1261 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
1262 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1263 "profitable.\n");
1264 return false;
1265 }
1266
1267 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
1268 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
1269 "a candidate.\n");
1270 return false;
1271 }
1272
1273 return canTailPredicateLoop(L, LI, SE, DL, LAI);
1274}
1275
1276
1277void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1278 TTI::UnrollingPreferences &UP) {
1279 // Only currently enable these preferences for M-Class cores.
1280 if (!ST->isMClass())
1281 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
1282
1283 // Disable loop unrolling for Oz and Os.
1284 UP.OptSizeThreshold = 0;
1285 UP.PartialOptSizeThreshold = 0;
1286 if (L->getHeader()->getParent()->hasOptSize())
1287 return;
1288
1289 // Only enable on Thumb-2 targets.
1290 if (!ST->isThumb2())
1291 return;
1292
1293 SmallVector<BasicBlock*, 4> ExitingBlocks;
1294 L->getExitingBlocks(ExitingBlocks);
1295 LLVM_DEBUG(dbgs() << "Loop has:\n"
1296 << "Blocks: " << L->getNumBlocks() << "\n"
1297 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1298
1299 // Only allow another exit other than the latch. This acts as an early exit
1300 // as it mirrors the profitability calculation of the runtime unroller.
1301 if (ExitingBlocks.size() > 2)
1302 return;
1303
1304 // Limit the CFG of the loop body for targets with a branch predictor.
1305 // Allowing 4 blocks permits if-then-else diamonds in the body.
1306 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
1307 return;
1308
1309 // Scan the loop: don't unroll loops with calls as this could prevent
1310 // inlining.
1311 unsigned Cost = 0;
1312 for (auto *BB : L->getBlocks()) {
1313 for (auto &I : *BB) {
1314 // Don't unroll vectorised loops. MVE does not benefit from it as much as
1315 // scalar code.
1316 if (I.getType()->isVectorTy())
1317 return;
1318
1319 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1320 ImmutableCallSite CS(&I);
1321 if (const Function *F = CS.getCalledFunction()) {
1322 if (!isLoweredToCall(F))
1323 continue;
1324 }
1325 return;
1326 }
1327
1328 SmallVector<const Value*, 4> Operands(I.value_op_begin(),
1329 I.value_op_end());
1330 Cost += getUserCost(&I, Operands);
1331 }
1332 }
1333
1334 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1335
1336 UP.Partial = true;
1337 UP.Runtime = true;
1338 UP.UpperBound = true;
1339 UP.UnrollRemainder = true;
1340 UP.DefaultUnrollRuntimeCount = 4;
1341 UP.UnrollAndJam = true;
1342 UP.UnrollAndJamInnerLoopThreshold = 60;
1343
1344 // Force-unrolling small loops can be very useful because of the branch
1345 // taken cost of the backedge.
1346 if (Cost < 12)
1347 UP.Force = true;
1348}
1349
1350bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
1351 TTI::ReductionFlags Flags) const {
1352 assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
1353 unsigned ScalarBits = Ty->getScalarSizeInBits();
1354 if (!ST->hasMVEIntegerOps())
1355 return false;
1356
1357 switch (Opcode) {
1358 case Instruction::FAdd:
1359 case Instruction::FMul:
1360 case Instruction::And:
1361 case Instruction::Or:
1362 case Instruction::Xor:
1363 case Instruction::Mul:
1364 case Instruction::FCmp:
1365 return false;
1366 case Instruction::ICmp:
1367 case Instruction::Add:
1368 return ScalarBits < 64 &&
1369 (ScalarBits * Ty->getVectorNumElements()) % 128 == 0;
1370 default:
1371 llvm_unreachable("Unhandled reduction opcode");
1372 }
1373 return false;
1374}

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h

1//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the ARM addressing mode implementation stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
14#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
15
16#include "llvm/ADT/APFloat.h"
17#include "llvm/ADT/APInt.h"
18#include "llvm/ADT/bit.h"
19#include "llvm/Support/ErrorHandling.h"
20#include "llvm/Support/MathExtras.h"
21#include <cassert>
22
23namespace llvm {
24
25/// ARM_AM - ARM Addressing Mode Stuff
26namespace ARM_AM {
27 enum ShiftOpc {
28 no_shift = 0,
29 asr,
30 lsl,
31 lsr,
32 ror,
33 rrx,
34 uxtw
35 };
36
37 enum AddrOpc {
38 sub = 0,
39 add
40 };
41
42 inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
43
44 inline const char *getShiftOpcStr(ShiftOpc Op) {
45 switch (Op) {
46 default: llvm_unreachable("Unknown shift opc!");
47 case ARM_AM::asr: return "asr";
48 case ARM_AM::lsl: return "lsl";
49 case ARM_AM::lsr: return "lsr";
50 case ARM_AM::ror: return "ror";
51 case ARM_AM::rrx: return "rrx";
52 case ARM_AM::uxtw: return "uxtw";
53 }
54 }
55
56 inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
57 switch (Op) {
58 default: llvm_unreachable("Unknown shift opc!");
59 case ARM_AM::asr: return 2;
60 case ARM_AM::lsl: return 0;
61 case ARM_AM::lsr: return 1;
62 case ARM_AM::ror: return 3;
63 }
64 }
65
66 enum AMSubMode {
67 bad_am_submode = 0,
68 ia,
69 ib,
70 da,
71 db
72 };
73
74 inline const char *getAMSubModeStr(AMSubMode Mode) {
75 switch (Mode) {
76 default: llvm_unreachable("Unknown addressing sub-mode!");
77 case ARM_AM::ia: return "ia";
78 case ARM_AM::ib: return "ib";
79 case ARM_AM::da: return "da";
80 case ARM_AM::db: return "db";
81 }
82 }
83
84 /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
85 ///
86 inline unsigned rotr32(unsigned Val, unsigned Amt) {
87 assert(Amt < 32 && "Invalid rotate amount");
88 return (Val >> Amt) | (Val << ((32-Amt)&31));
89 }
90
91 /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
92 ///
93 inline unsigned rotl32(unsigned Val, unsigned Amt) {
94 assert(Amt < 32 && "Invalid rotate amount");
95 return (Val << Amt) | (Val >> ((32-Amt)&31));
96 }
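Both rotate helpers mask the shift amount with "& 31", so the shifts they perform always stay below the 32-bit width. A quick illustration of the expected behaviour (assumed usage, not part of the header):

  // Rotating by 8 bits in either direction (illustrative sketch only).
  assert(ARM_AM::rotr32(0x000000FFU, 8) == 0xFF000000U);
  assert(ARM_AM::rotl32(0xFF000000U, 8) == 0x000000FFU);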
97
98 //===--------------------------------------------------------------------===//
99 // Addressing Mode #1: shift_operand with registers
100 //===--------------------------------------------------------------------===//
101 //
102 // This 'addressing mode' is used for arithmetic instructions. It can
103 // represent things like:
104 // reg
105 // reg [asr|lsl|lsr|ror|rrx] reg
106 // reg [asr|lsl|lsr|ror|rrx] imm
107 //
108 // This is stored as three operands [rega, regb, opc]. The first is the base
109 // reg, the second is the shift amount (or reg0 if not present or imm). The
110 // third operand encodes the shift opcode and the imm if a reg isn't present.
111 //
112 inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
113 return ShOp | (Imm << 3);
114 }
115 inline unsigned getSORegOffset(unsigned Op) { return Op >> 3; }
116 inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); }
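The three helpers above pack the shift opcode into the low 3 bits and the immediate (or register marker) into the remaining bits of a single unsigned value. A round-trip sketch (assumed usage, not part of the header):

  // Encode "lsl #4" and read the fields back (illustrative sketch only).
  unsigned Opc = ARM_AM::getSORegOpc(ARM_AM::lsl, 4);
  assert(ARM_AM::getSORegShOp(Opc) == ARM_AM::lsl);   // low 3 bits
  assert(ARM_AM::getSORegOffset(Opc) == 4);           // bits above bit 2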
117
118 /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
119 /// the 8-bit imm value.
120 inline unsigned getSOImmValImm(unsigned Imm) { return Imm & 0xFF; }
121 /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
122 /// the rotate amount.
123 inline unsigned getSOImmValRot(unsigned Imm) { return (Imm >> 8) * 2; }
124
125 /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
126 /// computing the rotate amount to use. If this immediate value cannot be
127 /// handled with a single shifter-op, determine a good rotate amount that will
128 /// take a maximal chunk of bits out of the immediate.
129 inline unsigned getSOImmValRotate(unsigned Imm) {
130 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
131 // of zero.
132 if ((Imm & ~255U) == 0) return 0;
133
134 // Use CTZ to compute the rotate amount.
135 unsigned TZ = countTrailingZeros(Imm);
136
137 // Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
138 // not 9.
139 unsigned RotAmt = TZ & ~1;
140
141 // If we can handle this spread, return it.
142 if ((rotr32(Imm, RotAmt) & ~255U) == 0)
143 return (32-RotAmt)&31; // HW rotates right, not left.
144
145 // For values like 0xF000000F, we should ignore the low 6 bits, then
146 // retry the hunt.
147 if (Imm & 63U) {
148 unsigned TZ2 = countTrailingZeros(Imm & ~63U);
149 unsigned RotAmt2 = TZ2 & ~1;
150 if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
151 return (32-RotAmt2)&31; // HW rotates right, not left.
152 }
153
154 // Otherwise, we have no way to cover this span of bits with a single
155 // shifter_op immediate. Return a chunk of bits that will be useful to
156 // handle.
157 return (32-RotAmt)&31; // HW rotates right, not left.
158 }
159
160 /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
161/// into a shifter_operand immediate operand, return the 12-bit encoding for
162 /// it. If not, return -1.
163 inline int getSOImmVal(unsigned Arg) {
164 // 8-bit (or less) immediates are trivially shifter_operands with a rotate
165 // of zero.
166 if ((Arg & ~255U) == 0) return Arg;
167
168 unsigned RotAmt = getSOImmValRotate(Arg);
169
170 // If this cannot be handled with a single shifter_op, bail out.
171 if (rotr32(~255U, RotAmt) & Arg)
172 return -1;
173
174 // Encode this correctly.
175 return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
176 }
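To make the 12-bit shifter_operand encoding concrete, here is one worked value (illustrative only; the field in bits 11-8 holds half the rotate-right amount):

  // 0xFF000000 is 0xFF rotated right by 8, so it encodes as rot=4, imm8=0xFF.
  int Enc = ARM_AM::getSOImmVal(0xFF000000U);          // 0x4FF
  unsigned Imm8 = ARM_AM::getSOImmValImm(Enc);         // 0xFF
  unsigned Rot  = ARM_AM::getSOImmValRot(Enc);         // 8 (rotate-right amount)
  assert(ARM_AM::rotr32(Imm8, Rot) == 0xFF000000U);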
177
178 /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
179 /// or'ing together two SOImmVal's.
180 inline bool isSOImmTwoPartVal(unsigned V) {
181 // If this can be handled with a single shifter_op, bail out.
182 V = rotr32(~255U, getSOImmValRotate(V)) & V;
183 if (V == 0)
184 return false;
185
186 // If this can be handled with two shifter_op's, accept.
187 V = rotr32(~255U, getSOImmValRotate(V)) & V;
188 return V == 0;
189 }
190
191 /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
192 /// return the first chunk of it.
193 inline unsigned getSOImmTwoPartFirst(unsigned V) {
194 return rotr32(255U, getSOImmValRotate(V)) & V;
195 }
196
197 /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
198 /// return the second chunk of it.
199 inline unsigned getSOImmTwoPartSecond(unsigned V) {
200 // Mask out the first hunk.
201 V = rotr32(~255U, getSOImmValRotate(V)) & V;
202
203 // Take what's left.
204 assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
205 return V;
206 }
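A value that is not a single shifter_operand immediate may still split into two of them, and the pair of helpers above returns the two chunks. A sketch of the intended usage (assumed example value, not from the report):

  unsigned V = 0xFF00FF00U;                            // not a single SO immediate
  assert(ARM_AM::isSOImmTwoPartVal(V));
  unsigned First  = ARM_AM::getSOImmTwoPartFirst(V);   // 0x0000FF00
  unsigned Second = ARM_AM::getSOImmTwoPartSecond(V);  // 0xFF000000
  assert((First | Second) == V);                       // OR of two SO immediates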
207
208 /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
209 /// by a left shift. Returns the shift amount to use.
210 inline unsigned getThumbImmValShift(unsigned Imm) {
211 // 8-bit (or less) immediates are trivially immediate operand with a shift
212 // of zero.
213 if ((Imm & ~255U) == 0) return 0;
20
Assuming the condition is false
21
Taking false branch
214
215 // Use CTZ to compute the shift amount.
216 return countTrailingZeros(Imm);
22
Calling 'countTrailingZeros<unsigned int>'
29
Returning from 'countTrailingZeros<unsigned int>'
30
Returning the value 32
217 }
218
219 /// isThumbImmShiftedVal - Return true if the specified value can be obtained
220 /// by left shifting an 8-bit immediate.
221 inline bool isThumbImmShiftedVal(unsigned V) {
222 // If this can be handled with
223 V = (~255U << getThumbImmValShift(V)) & V;
19
Calling 'getThumbImmValShift'
31
Returning from 'getThumbImmValShift'
32
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
224 return V == 0;
225 }
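On the reported path, getThumbImmValShift returns 32 (countTrailingZeros of a value the analyzer assumes to be zero), so the expression "~255U << 32" at line 223 is undefined for a 32-bit unsigned int. Note that the path also assumes at line 213 that the value has bits above 0xFF set, which contradicts it being zero, so this may be a false positive of cross-function path modelling. Even so, a defensively guarded variant would keep the shift defined. A minimal sketch, using a hypothetical helper name (this is not the in-tree code or an official fix):

  // Hypothetical guarded variant of isThumbImmShiftedVal (sketch only).
  inline bool isThumbImmShiftedValGuarded(unsigned V) {
    unsigned Shift = getThumbImmValShift(V);
    if (Shift >= 32)            // defensive: countTrailingZeros returns 32 only for 0
      return true;              // V == 0 is trivially a shifted 8-bit value
    return ((~255U << Shift) & V) == 0;  // Shift < 32, so the shift is well defined
  }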
226
227 /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
228 /// by a left shift. Returns the shift amount to use.
229 inline unsigned getThumbImm16ValShift(unsigned Imm) {
230 // 16-bit (or less) immediates are trivially immediate operand with a shift
231 // of zero.
232 if ((Imm & ~65535U) == 0) return 0;
233
234 // Use CTZ to compute the shift amount.
235 return countTrailingZeros(Imm);
236 }
237
238 /// isThumbImm16ShiftedVal - Return true if the specified value can be
239 /// obtained by left shifting a 16-bit immediate.
240 inline bool isThumbImm16ShiftedVal(unsigned V) {
241 // If this can be handled with
242 V = (~65535U << getThumbImm16ValShift(V)) & V;
243 return V == 0;
244 }
245
246 /// getThumbImmNonShiftedVal - If V is a value that satisfies
247 /// isThumbImmShiftedVal, return the non-shifted value.
248 inline unsigned getThumbImmNonShiftedVal(unsigned V) {
249 return V >> getThumbImmValShift(V);
250 }
251
252
253 /// getT2SOImmValSplat - Return the 12-bit encoded representation
254 /// if the specified value can be obtained by splatting the low 8 bits
255 /// into every other byte or every byte of a 32-bit value. i.e.,
256 /// 00000000 00000000 00000000 abcdefgh control = 0
257 /// 00000000 abcdefgh 00000000 abcdefgh control = 1
258 /// abcdefgh 00000000 abcdefgh 00000000 control = 2
259 /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3
260 /// Return -1 if none of the above apply.
261 /// See ARM Reference Manual A6.3.2.
262 inline int getT2SOImmValSplatVal(unsigned V) {
263 unsigned u, Vs, Imm;
264 // control = 0
265 if ((V & 0xffffff00) == 0)
266 return V;
267
268 // If the value is zeroes in the first byte, just shift those off
269 Vs = ((V & 0xff) == 0) ? V >> 8 : V;
270 // Any passing value only has 8 bits of payload, splatted across the word
271 Imm = Vs & 0xff;
272 // Likewise, any passing values have the payload splatted into the 3rd byte
273 u = Imm | (Imm << 16);
274
275 // control = 1 or 2
276 if (Vs == u)
277 return (((Vs == V) ? 1 : 2) << 8) | Imm;
278
279 // control = 3
280 if (Vs == (u | (u << 8)))
281 return (3 << 8) | Imm;
282
283 return -1;
284 }
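Two quick examples of the splat forms this helper recognises (illustrative only):

  int E1 = ARM_AM::getT2SOImmValSplatVal(0x00AB00ABU); // 0x1AB: control=1, imm=0xAB
  int E2 = ARM_AM::getT2SOImmValSplatVal(0xABABABABU); // 0x3AB: control=3, imm=0xAB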
285
286 /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
287 /// specified value is a rotated 8-bit value. Return -1 if no rotation
288 /// encoding is possible.
289 /// See ARM Reference Manual A6.3.2.
290 inline int getT2SOImmValRotateVal(unsigned V) {
291 unsigned RotAmt = countLeadingZeros(V);
292 if (RotAmt >= 24)
293 return -1;
294
295 // If 'Arg' can be handled with a single shifter_op return the value.
296 if ((rotr32(0xff000000U, RotAmt) & V) == V)
297 return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
298
299 return -1;
300 }
301
302 /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
303 /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
304 /// encoding for it. If not, return -1.
305 /// See ARM Reference Manual A6.3.2.
306 inline int getT2SOImmVal(unsigned Arg) {
307 // If 'Arg' is an 8-bit splat, then get the encoded value.
308 int Splat = getT2SOImmValSplatVal(Arg);
309 if (Splat != -1)
310 return Splat;
311
312 // If 'Arg' can be handled with a single shifter_op return the value.
313 int Rot = getT2SOImmValRotateVal(Arg);
314 if (Rot != -1)
315 return Rot;
316
317 return -1;
318 }
319
320 inline unsigned getT2SOImmValRotate(unsigned V) {
321 if ((V & ~255U) == 0) return 0;
322 // Use CTZ to compute the rotate amount.
323 unsigned RotAmt = countTrailingZeros(V);
324 return (32 - RotAmt) & 31;
325 }
326
327 inline bool isT2SOImmTwoPartVal(unsigned Imm) {
328 unsigned V = Imm;
329 // Passing values can be any combination of splat values and shifter
330 // values. If this can be handled with a single shifter or splat, bail
331 // out. Those should be handled directly, not with a two-part val.
332 if (getT2SOImmValSplatVal(V) != -1)
333 return false;
334 V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
335 if (V == 0)
336 return false;
337
338 // If this can be handled as an immediate, accept.
339 if (getT2SOImmVal(V) != -1) return true;
340
341 // Likewise, try masking out a splat value first.
342 V = Imm;
343 if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
344 V &= ~0xff00ff00U;
345 else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
346 V &= ~0x00ff00ffU;
347 // If what's left can be handled as an immediate, accept.
348 if (getT2SOImmVal(V) != -1) return true;
349
350 // Otherwise, do not accept.
351 return false;
352 }
353
354 inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
355 assert (isT2SOImmTwoPartVal(Imm) &&
356 "Immedate cannot be encoded as two part immediate!");
357 // Try a shifter operand as one part
358 unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
359 // If the rest is encodable as an immediate, then return it.
360 if (getT2SOImmVal(V) != -1) return V;
361
362 // Try masking out a splat value first.
363 if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
364 return Imm & 0xff00ff00U;
365
366 // The other splat is all that's left as an option.
367 assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
368 return Imm & 0x00ff00ffU;
369 }
370
371 inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
372 // Mask out the first hunk
373 Imm ^= getT2SOImmTwoPartFirst(Imm);
374 // Return what's left
375 assert (getT2SOImmVal(Imm) != -1 &&
376 "Unable to encode second part of T2 two part SO immediate");
377 return Imm;
378 }
379
380
381 //===--------------------------------------------------------------------===//
382 // Addressing Mode #2
383 //===--------------------------------------------------------------------===//
384 //
385 // This is used for most simple load/store instructions.
386 //
387 // addrmode2 := reg +/- reg shop imm
388 // addrmode2 := reg +/- imm12
389 //
390 // The first operand is always a Reg. The second operand is a reg if in
391 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
392 // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
393 // fourth operand 16-17 encodes the index mode.
394 //
395 // If this addressing mode is a frame index (before prolog/epilog insertion
396 // and code rewriting), this operand will have the form: FI#, reg0, <offs>
397 // with no shift amount for the frame offset.
398 //
399 inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
400 unsigned IdxMode = 0) {
401 assert(Imm12 < (1 << 12) && "Imm too large!");
402 bool isSub = Opc == sub;
403 return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
404 }
405 inline unsigned getAM2Offset(unsigned AM2Opc) {
406 return AM2Opc & ((1 << 12)-1);
407 }
408 inline AddrOpc getAM2Op(unsigned AM2Opc) {
409 return ((AM2Opc >> 12) & 1) ? sub : add;
410 }
411 inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
412 return (ShiftOpc)((AM2Opc >> 13) & 7);
413 }
414 inline unsigned getAM2IdxMode(unsigned AM2Opc) { return (AM2Opc >> 16); }
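The four accessors above undo the packing performed by getAM2Opc. A round-trip sketch (assumed usage, not part of the header):

  // Pack "subtract, offset 100, lsl" and read each field back (illustrative only).
  unsigned AM2 = ARM_AM::getAM2Opc(ARM_AM::sub, 100, ARM_AM::lsl);
  assert(ARM_AM::getAM2Offset(AM2) == 100);            // bits 0-11
  assert(ARM_AM::getAM2Op(AM2) == ARM_AM::sub);        // bit 12
  assert(ARM_AM::getAM2ShiftOpc(AM2) == ARM_AM::lsl);  // bits 13-15
  assert(ARM_AM::getAM2IdxMode(AM2) == 0);             // bits 16-17 (default)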
415
416 //===--------------------------------------------------------------------===//
417 // Addressing Mode #3
418 //===--------------------------------------------------------------------===//
419 //
420 // This is used for sign-extending loads, and load/store-pair instructions.
421 //
422 // addrmode3 := reg +/- reg
423 // addrmode3 := reg +/- imm8
424 //
425 // The first operand is always a Reg. The second operand is a reg if in
426 // reg/reg form, otherwise it's reg#0. The third field encodes the operation
427 // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
428 // index mode.
429
430 /// getAM3Opc - This function encodes the addrmode3 opc field.
431 inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
432 unsigned IdxMode = 0) {
433 bool isSub = Opc == sub;
434 return ((int)isSub << 8) | Offset | (IdxMode << 9);
435 }
436 inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; }
437 inline AddrOpc getAM3Op(unsigned AM3Opc) {
438 return ((AM3Opc >> 8) & 1) ? sub : add;
439 }
440 inline unsigned getAM3IdxMode(unsigned AM3Opc) { return (AM3Opc >> 9); }
441
442 //===--------------------------------------------------------------------===//
443 // Addressing Mode #4
444 //===--------------------------------------------------------------------===//
445 //
446 // This is used for load / store multiple instructions.
447 //
448 // addrmode4 := reg, <mode>
449 //
450 // The four modes are:
451 // IA - Increment after
452 // IB - Increment before
453 // DA - Decrement after
454 // DB - Decrement before
455 // For VFP instructions, only the IA and DB modes are valid.
456
457 inline AMSubMode getAM4SubMode(unsigned Mode) {
458 return (AMSubMode)(Mode & 0x7);
459 }
460
461 inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; }
462
463 //===--------------------------------------------------------------------===//
464 // Addressing Mode #5
465 //===--------------------------------------------------------------------===//
466 //
467 // This is used for coprocessor instructions, such as FP load/stores.
468 //
469 // addrmode5 := reg +/- imm8*4
470 //
471 // The first operand is always a Reg. The second operand encodes the
472 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
473
474 /// getAM5Opc - This function encodes the addrmode5 opc field.
475 inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
476 bool isSub = Opc == sub;
477 return ((int)isSub << 8) | Offset;
478 }
479 inline unsigned char getAM5Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; }
480 inline AddrOpc getAM5Op(unsigned AM5Opc) {
481 return ((AM5Opc >> 8) & 1) ? sub : add;
482 }
483
484 //===--------------------------------------------------------------------===//
485 // Addressing Mode #5 FP16
486 //===--------------------------------------------------------------------===//
487 //
488 // This is used for coprocessor instructions, such as 16-bit FP load/stores.
489 //
490 // addrmode5fp16 := reg +/- imm8*2
491 //
492 // The first operand is always a Reg. The second operand encodes the
493 // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
494
495 /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
496 inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
497 bool isSub = Opc == sub;
498 return ((int)isSub << 8) | Offset;
499 }
500 inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
501 return AM5Opc & 0xFF;
502 }
503 inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
504 return ((AM5Opc >> 8) & 1) ? sub : add;
505 }
506
507 //===--------------------------------------------------------------------===//
508 // Addressing Mode #6
509 //===--------------------------------------------------------------------===//
510 //
511 // This is used for NEON load / store instructions.
512 //
513 // addrmode6 := reg with optional alignment
514 //
515 // This is stored in two operands [regaddr, align]. The first is the
516 // address register. The second operand is the value of the alignment
517 // specifier in bytes or zero if no explicit alignment.
518 // Valid alignments depend on the specific instruction.
519
520 //===--------------------------------------------------------------------===//
521 // NEON/MVE Modified Immediates
522 //===--------------------------------------------------------------------===//
523 //
524 // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate"
525 // vector operand, where a small immediate encoded in the instruction
526 // specifies a full NEON vector value. These modified immediates are
527 // represented here as encoded integers. The low 8 bits hold the immediate
528 // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
529 // the "Cmode" field of the instruction. The interfaces below treat the
530 // Op and Cmode values as a single 5-bit value.
531
532 inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
533 return (OpCmode << 8) | Val;
534 }
535 inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
536 return (ModImm >> 8) & 0x1f;
537 }
538 inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }
539
540 /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
541 /// element value and the element size in bits. (If the element size is
542 /// smaller than the vector, it is splatted into all the elements.)
543 inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
544 unsigned OpCmode = getVMOVModImmOpCmode(ModImm);
545 unsigned Imm8 = getVMOVModImmVal(ModImm);
546 uint64_t Val = 0;
547
548 if (OpCmode == 0xe) {
549 // 8-bit vector elements
550 Val = Imm8;
551 EltBits = 8;
552 } else if ((OpCmode & 0xc) == 0x8) {
553 // 16-bit vector elements
554 unsigned ByteNum = (OpCmode & 0x6) >> 1;
555 Val = Imm8 << (8 * ByteNum);
556 EltBits = 16;
557 } else if ((OpCmode & 0x8) == 0) {
558 // 32-bit vector elements, zero with one byte set
559 unsigned ByteNum = (OpCmode & 0x6) >> 1;
560 Val = Imm8 << (8 * ByteNum);
561 EltBits = 32;
562 } else if ((OpCmode & 0xe) == 0xc) {
563 // 32-bit vector elements, one byte with low bits set
564 unsigned ByteNum = 1 + (OpCmode & 0x1);
565 Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
566 EltBits = 32;
567 } else if (OpCmode == 0x1e) {
568 // 64-bit vector elements
569 for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
570 if ((ModImm >> ByteNum) & 1)
571 Val |= (uint64_t)0xff << (8 * ByteNum);
572 }
573 EltBits = 64;
574 } else {
575 llvm_unreachable("Unsupported VMOV immediate");
576 }
577 return Val;
578 }
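As a concrete decoding example (illustrative only; the OpCmode value is chosen to hit the 16-bit-element case in the switch above):

  unsigned ModImm = ARM_AM::createVMOVModImm(0xa, 0x42); // OpCmode=0xa, imm8=0x42
  unsigned EltBits;
  uint64_t Elt = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
  // Elt == 0x4200 and EltBits == 16: 0x42 placed in byte 1 of each 16-bit element.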
579
580 // Generic validation for single-byte immediate (0X00, 00X0, etc).
581 inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
582 assert(Size >= 1 && Size <= 4 && "Invalid size");
583 unsigned count = 0;
584 for (unsigned i = 0; i < Size; ++i) {
585 if (Value & 0xff) count++;
586 Value >>= 8;
587 }
588 return count == 1;
589 }
590
591 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
592 inline bool isNEONi16splat(unsigned Value) {
593 if (Value > 0xffff)
594 return false;
595 // i16 value with set bits only in one byte X0 or 0X.
596 return Value == 0 || isNEONBytesplat(Value, 2);
597 }
598
599 // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
600 inline unsigned encodeNEONi16splat(unsigned Value) {
601 assert(isNEONi16splat(Value) && "Invalid NEON splat value");
602 if (Value >= 0x100)
603 Value = (Value >> 8) | 0xa00;
604 else
605 Value |= 0x800;
606 return Value;
607 }
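For instance, a 16-bit value with payload only in its high byte passes the check and is re-encoded with the byte position folded into the control bits (illustrative only):

  assert(ARM_AM::isNEONi16splat(0x1200));            // set bits only in byte 1
  unsigned Enc = ARM_AM::encodeNEONi16splat(0x1200); // 0xA12: payload 0x12, high-byte form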
608
609 /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
610 inline bool isNEONi32splat(unsigned Value) {
611 // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
612 return Value == 0 || isNEONBytesplat(Value, 4);
613 }
614
615 /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
616 inline unsigned encodeNEONi32splat(unsigned Value) {
617 assert(isNEONi32splat(Value) && "Invalid NEON splat value");
618 if (Value >= 0x100 && Value <= 0xff00)
619 Value = (Value >> 8) | 0x200;
620 else if (Value > 0xffff && Value <= 0xff0000)
621 Value = (Value >> 16) | 0x400;
622 else if (Value > 0xffffff)
623 Value = (Value >> 24) | 0x600;
624 return Value;
625 }
626
627 //===--------------------------------------------------------------------===//
628 // Floating-point Immediates
629 //
630 inline float getFPImmFloat(unsigned Imm) {
631 // We expect an 8-bit binary encoding of a floating-point number here.
632
633 uint8_t Sign = (Imm >> 7) & 0x1;
634 uint8_t Exp = (Imm >> 4) & 0x7;
635 uint8_t Mantissa = Imm & 0xf;
636
637 // 8-bit FP IEEE Float Encoding
638 // abcd efgh aBbbbbbc defgh000 00000000 00000000
639 //
640 // where B = NOT(b);
641 uint32_t I = 0;
642 I |= Sign << 31;
643 I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
644 I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
645 I |= (Exp & 0x3) << 23;
646 I |= Mantissa << 19;
647 return bit_cast<float>(I);
648 }
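For example, the 8-bit immediate 0x70 (sign 0, exponent field 0b111, mantissa 0) expands to the IEEE-754 pattern 0x3F800000, i.e. 1.0f, and the APFloat-based encoders below invert that mapping (illustrative only):

  float F  = ARM_AM::getFPImmFloat(0x70);             // 1.0f
  int  Enc = ARM_AM::getFP32Imm(llvm::APFloat(1.0f)); // 0x70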
649
650 /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
651 /// floating-point value. If the value cannot be represented as an 8-bit
652 /// floating-point value, then return -1.
653 inline int getFP16Imm(const APInt &Imm) {
654 uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
655 int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
656 int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
657
658 // We can handle 4 bits of mantissa.
659 // mantissa = (16+UInt(e:f:g:h))/16.
660 if (Mantissa & 0x3f)
661 return -1;
662 Mantissa >>= 6;
663
664 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
665 if (Exp < -3 || Exp > 4)
666 return -1;
667 Exp = ((Exp+3) & 0x7) ^ 4;
668
669 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
670 }
671
672 inline int getFP16Imm(const APFloat &FPImm) {
673 return getFP16Imm(FPImm.bitcastToAPInt());
674 }
675
676 /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
677 /// floating-point value. If the value cannot be represented as an 8-bit
678 /// floating-point value, then return -1.
679 inline int getFP32Imm(const APInt &Imm) {
680 uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
681 int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
682 int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
683
684 // We can handle 4 bits of mantissa.
685 // mantissa = (16+UInt(e:f:g:h))/16.
686 if (Mantissa & 0x7ffff)
687 return -1;
688 Mantissa >>= 19;
689 if ((Mantissa & 0xf) != Mantissa)
690 return -1;
691
692 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
693 if (Exp < -3 || Exp > 4)
694 return -1;
695 Exp = ((Exp+3) & 0x7) ^ 4;
696
697 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
698 }
699
700 inline int getFP32Imm(const APFloat &FPImm) {
701 return getFP32Imm(FPImm.bitcastToAPInt());
702 }
703
704 /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
705 /// floating-point value. If the value cannot be represented as an 8-bit
706 /// floating-point value, then return -1.
707 inline int getFP64Imm(const APInt &Imm) {
708 uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
709 int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
710 uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
711
712 // We can handle 4 bits of mantissa.
713 // mantissa = (16+UInt(e:f:g:h))/16.
714 if (Mantissa & 0xffffffffffffULL)
715 return -1;
716 Mantissa >>= 48;
717 if ((Mantissa & 0xf) != Mantissa)
718 return -1;
719
720 // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
721 if (Exp < -3 || Exp > 4)
722 return -1;
723 Exp = ((Exp+3) & 0x7) ^ 4;
724
725 return ((int)Sign << 7) | (Exp << 4) | Mantissa;
726 }
727
728 inline int getFP64Imm(const APFloat &FPImm) {
729 return getFP64Imm(FPImm.bitcastToAPInt());
730 }
731
732} // end namespace ARM_AM
733} // end namespace llvm
734
735#endif
736

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <algorithm>
18#include <cassert>
19#include <climits>
20#include <cmath>
21#include <cstdint>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43
44/// The behavior an operation has on an input of 0.
45enum ZeroBehavior {
46 /// The returned value is undefined.
47 ZB_Undefined,
48 /// The returned value is numeric_limits<T>::max()
49 ZB_Max,
50 /// The returned value is numeric_limits<T>::digits
51 ZB_Width
52};
53
54/// Mathematical constants.
55namespace numbers {
56// TODO: Track C++20 std::numbers.
57// TODO: Favor using the hexadecimal FP constants (requires C++17).
58constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
59 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
60 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
61 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
62 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
63 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
64 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
65 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
66 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
67 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
68 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
69 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
70 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
71 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
72 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
73constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
74 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
75 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
76 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
77 log2ef = 1.44269504F, // (0x1.715476P+0)
78 log10ef = .434294482F, // (0x1.bcb7b2P-2)
79 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
80 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
81 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
82 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
83 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
84 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
85 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
86 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
87 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
88} // namespace numbers
89
90namespace detail {
91template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
92 static unsigned count(T Val, ZeroBehavior) {
93 if (!Val)
94 return std::numeric_limits<T>::digits;
95 if (Val & 0x1)
96 return 0;
97
98 // Bisection method.
99 unsigned ZeroBits = 0;
100 T Shift = std::numeric_limits<T>::digits >> 1;
101 T Mask = std::numeric_limits<T>::max() >> Shift;
102 while (Shift) {
103 if ((Val & Mask) == 0) {
104 Val >>= Shift;
105 ZeroBits |= Shift;
106 }
107 Shift >>= 1;
108 Mask >>= Shift;
109 }
110 return ZeroBits;
111 }
112};
113
114#if defined(__GNUC__) || defined(_MSC_VER)
115template <typename T> struct TrailingZerosCounter<T, 4> {
116 static unsigned count(T Val, ZeroBehavior ZB) {
117 if (ZB != ZB_Undefined && Val == 0)
23.1
'ZB' is not equal to ZB_Undefined
24
Assuming 'Val' is equal to 0
25
Taking true branch
118 return 32;
26
Returning the value 32
119
120#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
121 return __builtin_ctz(Val);
122#elif defined(_MSC_VER)
123 unsigned long Index;
124 _BitScanForward(&Index, Val);
125 return Index;
126#endif
127 }
128};
129
130#if !defined(_MSC_VER) || defined(_M_X64)
131template <typename T> struct TrailingZerosCounter<T, 8> {
132 static unsigned count(T Val, ZeroBehavior ZB) {
133 if (ZB != ZB_Undefined && Val == 0)
134 return 64;
135
136#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
137 return __builtin_ctzll(Val);
138#elif defined(_MSC_VER)
139 unsigned long Index;
140 _BitScanForward64(&Index, Val);
141 return Index;
142#endif
143 }
144};
145#endif
146#endif
147} // namespace detail
148
149/// Count number of 0's from the least significant bit to the most
150/// stopping at the first 1.
151///
152/// Only unsigned integral types are allowed.
153///
154/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
155/// valid arguments.
156template <typename T>
157unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
158 static_assert(std::numeric_limits<T>::is_integer &&
159 !std::numeric_limits<T>::is_signed,
160 "Only unsigned integral types are allowed.");
161 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
23
Calling 'TrailingZerosCounter::count'
27
Returning from 'TrailingZerosCounter::count'
28
Returning the value 32
162}
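This default ZeroBehavior of ZB_Width is what feeds the value 32 into the shift flagged at ARMAddressingModes.h:223. A short illustration (assumed usage, not part of the header):

  unsigned A = llvm::countTrailingZeros(0x100u); // 8: eight trailing zero bits
  unsigned B = llvm::countTrailingZeros(0u);     // 32: ZB_Width returns the type width
  // Using B directly as a left-shift amount on a 32-bit value is undefined,
  // which is exactly the condition the analyzer reports.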
163
164namespace detail {
165template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
166 static unsigned count(T Val, ZeroBehavior) {
167 if (!Val)
168 return std::numeric_limits<T>::digits;
169
170 // Bisection method.
171 unsigned ZeroBits = 0;
172 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
173 T Tmp = Val >> Shift;
174 if (Tmp)
175 Val = Tmp;
176 else
177 ZeroBits |= Shift;
178 }
179 return ZeroBits;
180 }
181};
182
183#if defined(__GNUC__) || defined(_MSC_VER)
184template <typename T> struct LeadingZerosCounter<T, 4> {
185 static unsigned count(T Val, ZeroBehavior ZB) {
186 if (ZB != ZB_Undefined && Val == 0)
187 return 32;
188
189#if __has_builtin(__builtin_clz) || defined(__GNUC__)
190 return __builtin_clz(Val);
191#elif defined(_MSC_VER)
192 unsigned long Index;
193 _BitScanReverse(&Index, Val);
194 return Index ^ 31;
195#endif
196 }
197};
198
199#if !defined(_MSC_VER) || defined(_M_X64)
200template <typename T> struct LeadingZerosCounter<T, 8> {
201 static unsigned count(T Val, ZeroBehavior ZB) {
202 if (ZB != ZB_Undefined && Val == 0)
203 return 64;
204
205#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
206 return __builtin_clzll(Val);
207#elif defined(_MSC_VER)
208 unsigned long Index;
209 _BitScanReverse64(&Index, Val);
210 return Index ^ 63;
211#endif
212 }
213};
214#endif
215#endif
216} // namespace detail
217
218/// Count number of 0's from the most significant bit to the least
219/// stopping at the first 1.
220///
221/// Only unsigned integral types are allowed.
222///
223/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
224/// valid arguments.
225template <typename T>
226unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
227 static_assert(std::numeric_limits<T>::is_integer &&
228 !std::numeric_limits<T>::is_signed,
229 "Only unsigned integral types are allowed.");
230 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
231}
232
233/// Get the index of the first set bit starting from the least
234/// significant bit.
235///
236/// Only unsigned integral types are allowed.
237///
238/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
239/// valid arguments.
240template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
241 if (ZB == ZB_Max && Val == 0)
242 return std::numeric_limits<T>::max();
243
244 return countTrailingZeros(Val, ZB_Undefined);
245}
246
247/// Create a bitmask with the N right-most bits set to 1, and all other
248/// bits set to 0. Only unsigned types are allowed.
249template <typename T> T maskTrailingOnes(unsigned N) {
250 static_assert(std::is_unsigned<T>::value, "Invalid type!");
251 const unsigned Bits = CHAR_BIT * sizeof(T);
252 assert(N <= Bits && "Invalid bit index");
253 return N == 0 ? 0 : (T(-1) >> (Bits - N));
254}
255
256/// Create a bitmask with the N left-most bits set to 1, and all other
257/// bits set to 0. Only unsigned types are allowed.
258template <typename T> T maskLeadingOnes(unsigned N) {
259 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
260}
261
262/// Create a bitmask with the N right-most bits set to 0, and all other
263/// bits set to 1. Only unsigned types are allowed.
264template <typename T> T maskTrailingZeros(unsigned N) {
265 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
266}
267
268/// Create a bitmask with the N left-most bits set to 0, and all other
269/// bits set to 1. Only unsigned types are allowed.
270template <typename T> T maskLeadingZeros(unsigned N) {
271 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
272}
273
274/// Get the index of the last set bit starting from the least
275/// significant bit.
276///
277/// Only unsigned integral types are allowed.
278///
279/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
280/// valid arguments.
281template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
282 if (ZB == ZB_Max && Val == 0)
283 return std::numeric_limits<T>::max();
284
285 // Use ^ instead of - because both gcc and llvm can remove the associated ^
286 // in the __builtin_clz intrinsic on x86.
287 return countLeadingZeros(Val, ZB_Undefined) ^
288 (std::numeric_limits<T>::digits - 1);
289}
290
291/// Macro compressed bit reversal table for 256 bits.
292///
293/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
294static const unsigned char BitReverseTable256[256] = {
295#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
296#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
297#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
298 R6(0), R6(2), R6(1), R6(3)
299#undef R2
300#undef R4
301#undef R6
302};
303
304/// Reverse the bits in \p Val.
305template <typename T>
306T reverseBits(T Val) {
307 unsigned char in[sizeof(Val)];
308 unsigned char out[sizeof(Val)];
309 std::memcpy(in, &Val, sizeof(Val));
310 for (unsigned i = 0; i < sizeof(Val); ++i)
311 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
312 std::memcpy(&Val, out, sizeof(Val));
313 return Val;
314}
315
316// NOTE: The following support functions use the _32/_64 extensions instead of
317// type overloading so that signed and unsigned integers can be used without
318// ambiguity.
319
320/// Return the high 32 bits of a 64 bit value.
321constexpr inline uint32_t Hi_32(uint64_t Value) {
322 return static_cast<uint32_t>(Value >> 32);
323}
324
325/// Return the low 32 bits of a 64 bit value.
326constexpr inline uint32_t Lo_32(uint64_t Value) {
327 return static_cast<uint32_t>(Value);
328}
329
330/// Make a 64-bit integer from a high / low pair of 32-bit integers.
331constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
332 return ((uint64_t)High << 32) | (uint64_t)Low;
333}
334
335/// Checks if an integer fits into the given bit width.
336template <unsigned N> constexpr inline bool isInt(int64_t x) {
337 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
338}
339// Template specializations to get better code for common cases.
340template <> constexpr inline bool isInt<8>(int64_t x) {
341 return static_cast<int8_t>(x) == x;
342}
343template <> constexpr inline bool isInt<16>(int64_t x) {
344 return static_cast<int16_t>(x) == x;
345}
346template <> constexpr inline bool isInt<32>(int64_t x) {
347 return static_cast<int32_t>(x) == x;
348}
349
350/// Checks if a signed integer is an N bit number shifted left by S.
351template <unsigned N, unsigned S>
352constexpr inline bool isShiftedInt(int64_t x) {
353 static_assert(
354 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
355 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
356 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
357}
358
359/// Checks if an unsigned integer fits into the given bit width.
360///
361/// This is written as two functions rather than as simply
362///
363/// return N >= 64 || X < (UINT64_C(1) << N);
364///
365/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
366/// left too many places.
367template <unsigned N>
368constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t X) {
374 return true;
375}
376
377// Template specializations to get better code for common cases.
378template <> constexpr inline bool isUInt<8>(uint64_t x) {
379 return static_cast<uint8_t>(x) == x;
380}
381template <> constexpr inline bool isUInt<16>(uint64_t x) {
382 return static_cast<uint16_t>(x) == x;
383}
384template <> constexpr inline bool isUInt<32>(uint64_t x) {
385 return static_cast<uint32_t>(x) == x;
386}
387
388/// Checks if an unsigned integer is an N bit number shifted left by S.
389template <unsigned N, unsigned S>
390constexpr inline bool isShiftedUInt(uint64_t x) {
391 static_assert(
392 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
393 static_assert(N + S <= 64,
394 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
395 // Per the two static_asserts above, S must be strictly less than 64. So
396 // 1 << S is not undefined behavior.
397 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
398}
399
400/// Gets the maximum value for an N-bit unsigned integer.
401inline uint64_t maxUIntN(uint64_t N) {
402 assert(N > 0 && N <= 64 && "integer width out of range");
403
404 // uint64_t(1) << 64 is undefined behavior, so we can't do
405 // (uint64_t(1) << N) - 1
406 // without checking first that N != 64. But this works and doesn't have a
407 // branch.
408 return UINT64_MAX >> (64 - N);
409}
410
411/// Gets the minimum value for an N-bit signed integer.
412inline int64_t minIntN(int64_t N) {
413 assert(N > 0 && N <= 64 && "integer width out of range");
414
415 return -(UINT64_C(1)<<(N-1));
416}
417
418/// Gets the maximum value for a N-bit signed integer.
419inline int64_t maxIntN(int64_t N) {
420 assert(N > 0 && N <= 64 && "integer width out of range");
421
422 // This relies on two's complement wraparound when N == 64, so we convert to
423 // int64_t only at the very end to avoid UB.
424 return (UINT64_C(1) << (N - 1)) - 1;
425}
426
427/// Checks if an unsigned integer fits into the given (dynamic) bit width.
428inline bool isUIntN(unsigned N, uint64_t x) {
429 return N >= 64 || x <= maxUIntN(N);
430}
431
432/// Checks if an signed integer fits into the given (dynamic) bit width.
433inline bool isIntN(unsigned N, int64_t x) {
434 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
435}
436
437/// Return true if the argument is a non-empty sequence of ones starting at the
438/// least significant bit with the remainder zero (32 bit version).
439/// Ex. isMask_32(0x0000FFFFU) == true.
440constexpr inline bool isMask_32(uint32_t Value) {
441 return Value && ((Value + 1) & Value) == 0;
442}
443
444/// Return true if the argument is a non-empty sequence of ones starting at the
445/// least significant bit with the remainder zero (64 bit version).
446constexpr inline bool isMask_64(uint64_t Value) {
447 return Value && ((Value + 1) & Value) == 0;
448}
449
450/// Return true if the argument contains a non-empty sequence of ones with the
451/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
452constexpr inline bool isShiftedMask_32(uint32_t Value) {
453 return Value && isMask_32((Value - 1) | Value);
454}
455
456/// Return true if the argument contains a non-empty sequence of ones with the
457/// remainder zero (64 bit version.)
458constexpr inline bool isShiftedMask_64(uint64_t Value) {
459 return Value && isMask_64((Value - 1) | Value);
460}
461
462/// Return true if the argument is a power of two > 0.
463/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
464constexpr inline bool isPowerOf2_32(uint32_t Value) {
465 return Value && !(Value & (Value - 1));
466}
467
468/// Return true if the argument is a power of two > 0 (64 bit edition.)
469constexpr inline bool isPowerOf2_64(uint64_t Value) {
470 return Value && !(Value & (Value - 1));
471}
472
473/// Count the number of ones from the most significant bit to the first
474/// zero bit.
475///
476/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
477/// Only unsigned integral types are allowed.
478///
479/// \param ZB the behavior on an input of all ones. Only ZB_Width and
480/// ZB_Undefined are valid arguments.
481template <typename T>
482unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
483 static_assert(std::numeric_limits<T>::is_integer &&
484 !std::numeric_limits<T>::is_signed,
485 "Only unsigned integral types are allowed.");
486 return countLeadingZeros<T>(~Value, ZB);
487}
488
489/// Count the number of ones from the least significant bit to the first
490/// zero bit.
491///
492/// Ex. countTrailingOnes(0x00FF00FF) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countTrailingZeros<T>(~Value, ZB);
503}
504
505namespace detail {
506template <typename T, std::size_t SizeOfT> struct PopulationCounter {
507 static unsigned count(T Value) {
508 // Generic version, forward to 32 bits.
509 static_assert(SizeOfT <= 4, "Not implemented!");
510#if defined(__GNUC__)
511 return __builtin_popcount(Value);
512#else
513 uint32_t v = Value;
514 v = v - ((v >> 1) & 0x55555555);
515 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
516 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
517#endif
518 }
519};
520
521template <typename T> struct PopulationCounter<T, 8> {
522 static unsigned count(T Value) {
523#if defined(__GNUC__)
524 return __builtin_popcountll(Value);
525#else
526 uint64_t v = Value;
527 v = v - ((v >> 1) & 0x5555555555555555ULL);
528 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
529 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
530 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
531#endif
532 }
533};
534} // namespace detail
535
536/// Count the number of set bits in a value.
537/// Ex. countPopulation(0xF000F000) = 8
538/// Returns 0 if the word is zero.
539template <typename T>
540inline unsigned countPopulation(T Value) {
541 static_assert(std::numeric_limits<T>::is_integer &&
542 !std::numeric_limits<T>::is_signed,
543 "Only unsigned integral types are allowed.");
544 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
545}
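// [Editorial sketch; not part of the original header.] When no builtin is
// available, PopulationCounter falls back to the classic SWAR reduction: sum
// bit pairs, then nibbles, then let one multiply accumulate all byte counts
// into the top byte. exampleCountPopulation is a hypothetical helper used only
// for illustration.
inline void exampleCountPopulation() {
  assert(countPopulation(0xF000F000u) == 8u);
  assert(countPopulation(std::numeric_limits<uint64_t>::max()) == 64u);
}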
546
547/// Compile time Log2.
548/// Valid only for positive powers of two.
549template <size_t kValue> constexpr inline size_t CTLog2() {
550 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
551 "Value is not a valid power of 2");
552 return 1 + CTLog2<kValue / 2>();
553}
554
555template <> constexpr inline size_t CTLog2<1>() { return 0; }
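// [Editorial sketch; not part of the original header.] The recursion bottoms
// out at the CTLog2<1>() specialization, and because every step is constexpr
// the result can feed further compile-time checks:
static_assert(CTLog2<1>() == 0, "log2(1) == 0");
static_assert(CTLog2<64>() == 6, "64 == 2^6");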
556
557/// Return the log base 2 of the specified value.
558inline double Log2(double Value) {
559#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
560 return __builtin_log(Value) / __builtin_log(2.0);
561#else
562 return log2(Value);
563#endif
564}
565
566/// Return the floor log base 2 of the specified value, -1 if the value is zero.
567/// (32 bit edition.)
568/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
569inline unsigned Log2_32(uint32_t Value) {
570 return 31 - countLeadingZeros(Value);
571}
572
573/// Return the floor log base 2 of the specified value, -1 if the value is zero.
574/// (64 bit edition.)
575inline unsigned Log2_64(uint64_t Value) {
576 return 63 - countLeadingZeros(Value);
577}
578
579/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
580/// (32 bit edition).
581/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
582inline unsigned Log2_32_Ceil(uint32_t Value) {
583 return 32 - countLeadingZeros(Value - 1);
584}
585
586/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
587/// (64 bit edition.)
588inline unsigned Log2_64_Ceil(uint64_t Value) {
589 return 64 - countLeadingZeros(Value - 1);
590}
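// [Editorial sketch; not part of the original header.] Floor and ceil agree on
// exact powers of two and differ by one everywhere else; the ceil variants get
// this by counting from Value - 1. exampleLog2 is a hypothetical helper used
// only for illustration.
inline void exampleLog2() {
  assert(Log2_32(6) == 2u && Log2_32_Ceil(6) == 3u);
  assert(Log2_64(32) == 5u && Log2_64_Ceil(32) == 5u);
}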
591
592/// Return the greatest common divisor of the values using Euclid's algorithm.
593template <typename T>
594inline T greatestCommonDivisor(T A, T B) {
595 while (B) {
596 T Tmp = B;
597 B = A % B;
598 A = Tmp;
599 }
600 return A;
601}
602
603inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
604 return greatestCommonDivisor<uint64_t>(A, B);
605}
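// [Editorial sketch; not part of the original header.] Each Euclid iteration
// replaces (A, B) with (B, A mod B) until B reaches zero, at which point A
// holds the gcd. exampleGcd is a hypothetical helper used only for
// illustration.
inline void exampleGcd() {
  assert(greatestCommonDivisor(12u, 18u) == 6u);
  assert(GreatestCommonDivisor64(0, 42) == 42u); // gcd(0, n) == n
}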
606
607/// This function takes a 64-bit integer and returns the bit equivalent double.
608inline double BitsToDouble(uint64_t Bits) {
609 double D;
610 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
611 memcpy(&D, &Bits, sizeof(Bits));
612 return D;
613}
614
615/// This function takes a 32-bit integer and returns the bit equivalent float.
616inline float BitsToFloat(uint32_t Bits) {
617 float F;
618 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
619 memcpy(&F, &Bits, sizeof(Bits));
620 return F;
621}
622
623/// This function takes a double and returns the bit equivalent 64-bit integer.
624/// Note that copying doubles around changes the bits of NaNs on some hosts,
625/// notably x86, so this routine cannot be used if these bits are needed.
626inline uint64_t DoubleToBits(double Double) {
627 uint64_t Bits;
628 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
629 memcpy(&Bits, &Double, sizeof(Double));
630 return Bits;
631}
632
633/// This function takes a float and returns the bit equivalent 32-bit integer.
634/// Note that copying floats around changes the bits of NaNs on some hosts,
635/// notably x86, so this routine cannot be used if these bits are needed.
636inline uint32_t FloatToBits(float Float) {
637 uint32_t Bits;
638 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
639 memcpy(&Bits, &Float, sizeof(Float));
640 return Bits;
641}
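// [Editorial sketch; not part of the original header.] memcpy is the portable
// way to reinterpret the representation without violating strict aliasing, and
// the two directions round-trip bit-for-bit (subject to the NaN caveat above).
// exampleBitCast is a hypothetical helper used only for illustration.
inline void exampleBitCast() {
  assert(DoubleToBits(1.0) == 0x3FF0000000000000ULL); // IEEE-754 encoding of 1.0
  assert(BitsToFloat(FloatToBits(1.5f)) == 1.5f);
}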
642
643/// A and B are either alignments or offsets. Return the minimum alignment that
644/// may be assumed after adding the two together.
645constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
646 // The largest power of 2 that divides both A and B.
647 //
648 // Replace "-Value" by "1+~Value" in the following commented code to avoid
649 // MSVC warning C4146
650 // return (A | B) & -(A | B);
651 return (A | B) & (1 + ~(A | B));
652}
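// [Editorial sketch; not part of the original header.] (A | B) & -(A | B)
// isolates the lowest set bit of A | B, which is exactly the largest power of
// two dividing both inputs:
static_assert(MinAlign(8, 12) == 4, "common power-of-two factor");
static_assert(MinAlign(16, 16) == 16, "equal alignments are kept");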
653
654/// Returns the next power of two (in 64-bits) that is strictly greater than A.
655/// Returns zero on overflow.
656inline uint64_t NextPowerOf2(uint64_t A) {
657 A |= (A >> 1);
658 A |= (A >> 2);
659 A |= (A >> 4);
660 A |= (A >> 8);
661 A |= (A >> 16);
662 A |= (A >> 32);
663 return A + 1;
664}
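// [Editorial sketch; not part of the original header.] The shift/or cascade
// smears the highest set bit into every lower position, turning A into
// 2^k - 1, where 2^k is the smallest power of two strictly greater than A; the
// final + 1 then yields that power. exampleNextPowerOf2 is a hypothetical
// helper used only for illustration.
inline void exampleNextPowerOf2() {
  assert(NextPowerOf2(5) == 8u);
  assert(NextPowerOf2(8) == 16u); // strictly greater than the input
  assert(NextPowerOf2(0) == 1u);
}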
665
666/// Returns the power of two which is less than or equal to the given value.
667/// Essentially, it is a floor operation across the domain of powers of two.
668inline uint64_t PowerOf2Floor(uint64_t A) {
669 if (!A) return 0;
670 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
671}
672
673/// Returns the power of two which is greater than or equal to the given value.
674/// Essentially, it is a ceil operation across the domain of powers of two.
675inline uint64_t PowerOf2Ceil(uint64_t A) {
676 if (!A)
677 return 0;
678 return NextPowerOf2(A - 1);
679}
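// [Editorial sketch; not part of the original header.] Floor keeps only the
// highest set bit; Ceil defers to NextPowerOf2(A - 1) so that exact powers of
// two map to themselves. examplePowerOf2Rounding is a hypothetical helper used
// only for illustration.
inline void examplePowerOf2Rounding() {
  assert(PowerOf2Floor(20) == 16u && PowerOf2Ceil(20) == 32u);
  assert(PowerOf2Floor(32) == 32u && PowerOf2Ceil(32) == 32u);
}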
680
681/// Returns the next integer (mod 2**64) that is greater than or equal to
682/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
683///
684/// If non-zero \p Skew is specified, the return value will be a minimal
685/// integer that is greater than or equal to \p Value and equal to
686/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
687/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
688///
689/// Examples:
690/// \code
691/// alignTo(5, 8) = 8
692/// alignTo(17, 8) = 24
693/// alignTo(~0LL, 8) = 0
694/// alignTo(321, 255) = 510
695///
696/// alignTo(5, 8, 7) = 7
697/// alignTo(17, 8, 1) = 17
698/// alignTo(~0LL, 8, 3) = 3
699/// alignTo(321, 255, 42) = 552
700/// \endcode
701inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
702 assert(Align != 0u && "Align can't be 0.");
703 Skew %= Align;
704 return (Value + Align - 1 - Skew) / Align * Align + Skew;
705}
706
707/// Returns the next integer (mod 2**64) that is greater than or equal to
708/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
709template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
710 static_assert(Align != 0u, "Align must be non-zero");
711 return (Value + Align - 1) / Align * Align;
712}
713
714/// Returns the integer ceil(Numerator / Denominator).
715inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
716 return alignTo(Numerator, Denominator) / Denominator;
717}
718
719/// Returns the integer nearest(Numerator / Denominator).
720inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
721 return (Numerator + (Denominator / 2)) / Denominator;
722}
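// [Editorial sketch; not part of the original header.] Both helpers stay in
// integer arithmetic: divideCeil rounds up via alignTo, and divideNearest
// rounds half-way cases up by pre-adding half the denominator.
// exampleIntegerDivision is a hypothetical helper used only for illustration.
inline void exampleIntegerDivision() {
  assert(divideCeil(7, 2) == 4u);
  assert(divideNearest(7, 2) == 4u && divideNearest(5, 3) == 2u);
}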
723
724/// Returns the largest uint64_t less than or equal to \p Value and is
725/// \p Skew mod \p Align. \p Align must be non-zero
726inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
727 assert(Align != 0u && "Align can't be 0.");
728 Skew %= Align;
729 return (Value - Skew) / Align * Align + Skew;
730}
731
732/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
733/// Requires 0 < B <= 32.
734template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
735 static_assert(B > 0, "Bit width can't be 0.");
736 static_assert(B <= 32, "Bit width out of range.");
737 return int32_t(X << (32 - B)) >> (32 - B);
738}
739
740/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
741/// Requires 0 < B <= 32.
742inline int32_t SignExtend32(uint32_t X, unsigned B) {
743 assert(B > 0 && "Bit width can't be 0.");
744 assert(B <= 32 && "Bit width out of range.");
745 return int32_t(X << (32 - B)) >> (32 - B);
746}
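// [Editorial sketch; not part of the original header.] Shifting the B-bit
// field to the top of the word and arithmetically shifting it back replicates
// the field's sign bit through the upper bits. exampleSignExtend is a
// hypothetical helper used only for illustration.
inline void exampleSignExtend() {
  assert(SignExtend32<8>(0xFF) == -1);  // 0xFF is -1 as a signed 8-bit field
  assert(SignExtend32(0x80u, 8) == -128);
  assert(SignExtend32<8>(0x7F) == 127); // sign bit clear, value unchanged
}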
747
748/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
749/// Requires 0 < B <= 64.
750template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
751 static_assert(B > 0, "Bit width can't be 0.");
752 static_assert(B <= 64, "Bit width out of range.");
753 return int64_t(x << (64 - B)) >> (64 - B);
754}
755
756/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
757/// Requires 0 < B <= 64.
758inline int64_t SignExtend64(uint64_t X, unsigned B) {
759 assert(B > 0 && "Bit width can't be 0.");
760 assert(B <= 64 && "Bit width out of range.");
761 return int64_t(X << (64 - B)) >> (64 - B);
762}
763
764/// Subtract two unsigned integers, X and Y, of type T and return the absolute
765/// value of the result.
766template <typename T>
767std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
768 return std::max(X, Y) - std::min(X, Y);
769}
770
771/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
772/// maximum representable value of T on overflow. ResultOverflowed indicates if
773/// the result is larger than the maximum representable value of type T.
774template <typename T>
775std::enable_if_t<std::is_unsigned<T>::value, T>
776SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
777 bool Dummy;
778 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
779 // Hacker's Delight, p. 29
780 T Z = X + Y;
781 Overflowed = (Z < X || Z < Y);
782 if (Overflowed)
783 return std::numeric_limits<T>::max();
784 else
785 return Z;
786}
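// [Editorial sketch; not part of the original header.] Unsigned addition wraps
// modulo 2^n, so an overflowed sum is necessarily smaller than both operands,
// which is all the check needs. exampleSaturatingAdd is a hypothetical helper
// used only for illustration.
inline void exampleSaturatingAdd() {
  bool Overflowed = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  assert(SaturatingAdd<uint8_t>(100, 100) == 200);
}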
787
788/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792std::enable_if_t<std::is_unsigned<T>::value, T>
793SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796
797 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
798 // because it fails for uint16_t (where multiplication can have undefined
799 // behavior due to promotion to int), and requires a division in addition
800 // to the multiplication.
801
802 Overflowed = false;
803
804 // Log2(Z) would be either Log2Z or Log2Z + 1.
805 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
806 // will necessarily be less than Log2Max as desired.
807 int Log2Z = Log2_64(X) + Log2_64(Y);
808 const T Max = std::numeric_limits<T>::max();
809 int Log2Max = Log2_64(Max);
810 if (Log2Z < Log2Max) {
811 return X * Y;
812 }
813 if (Log2Z > Log2Max) {
814 Overflowed = true;
815 return Max;
816 }
817
818 // We're going to use the top bit, and maybe overflow one
819 // bit past it. Multiply all but the bottom bit then add
820 // that on at the end.
821 T Z = (X >> 1) * Y;
822 if (Z & ~(Max >> 1)) {
823 Overflowed = true;
824 return Max;
825 }
826 Z <<= 1;
827 if (X & 1)
828 return SaturatingAdd(Z, Y, ResultOverflowed);
829
830 return Z;
831}
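// [Editorial sketch; not part of the original header.] Comparing Log2(X) +
// Log2(Y) against Log2(Max) settles the clear-cut cases without a full
// multiply; only when they meet at the boundary do the halved multiply and the
// final SaturatingAdd run. exampleSaturatingMultiply is a hypothetical helper
// used only for illustration.
inline void exampleSaturatingMultiply() {
  bool Overflowed = false;
  assert(SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed);
  assert(SaturatingMultiply<uint8_t>(15, 15) == 225);
}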
832
833/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
834/// the product. Clamp the result to the maximum representable value of T on
835/// overflow. ResultOverflowed indicates if the result is larger than the
836/// maximum representable value of type T.
837template <typename T>
838std::enable_if_t<std::is_unsigned<T>::value, T>
839SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
840 bool Dummy;
841 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
842
843 T Product = SaturatingMultiply(X, Y, &Overflowed);
844 if (Overflowed)
845 return Product;
846
847 return SaturatingAdd(A, Product, &Overflowed);
848}
849
850/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
851extern const float huge_valf;
852
853
854/// Add two signed integers, computing the two's complement truncated result,
855/// returning true if overflow occurred.
856template <typename T>
857std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
858#if __has_builtin(__builtin_add_overflow)
859 return __builtin_add_overflow(X, Y, &Result);
860#else
861 // Perform the unsigned addition.
862 using U = std::make_unsigned_t<T>;
863 const U UX = static_cast<U>(X);
864 const U UY = static_cast<U>(Y);
865 const U UResult = UX + UY;
866
867 // Convert to signed.
868 Result = static_cast<T>(UResult);
869
870 // Adding two positive numbers should result in a positive number.
871 if (X > 0 && Y > 0)
872 return Result <= 0;
873 // Adding two negatives should result in a negative number.
874 if (X < 0 && Y < 0)
875 return Result >= 0;
876 return false;
877#endif
878}
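// [Editorial sketch; not part of the original header.] The portable fallback
// adds in the unsigned domain (well defined on wrap) and then flags overflow
// only when both operands share a sign that the truncated result fails to
// preserve. exampleAddOverflow is a hypothetical helper used only for
// illustration; the return value is nonzero exactly when overflow occurs.
inline void exampleAddOverflow() {
  int32_t R;
  assert(!AddOverflow<int32_t>(1, 2, R) && R == 3);
  assert(AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1, R));
}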
879
880/// Subtract two signed integers, computing the two's complement truncated
881/// result, returning true if an overflow occurred.
882template <typename T>
883std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
884#if __has_builtin(__builtin_sub_overflow)
885 return __builtin_sub_overflow(X, Y, &Result);
886#else
887 // Perform the unsigned subtraction.
888 using U = std::make_unsigned_t<T>;
889 const U UX = static_cast<U>(X);
890 const U UY = static_cast<U>(Y);
891 const U UResult = UX - UY;
892
893 // Convert to signed.
894 Result = static_cast<T>(UResult);
895
896 // Subtracting a positive number from a negative results in a negative number.
897 if (X <= 0 && Y > 0)
898 return Result >= 0;
899 // Subtracting a negative number from a positive results in a positive number.
900 if (X >= 0 && Y < 0)
901 return Result <= 0;
902 return false;
903#endif
904}
905
906/// Multiply two signed integers, computing the two's complement truncated
907/// result, returning true if an overflow occurred.
908template <typename T>
909std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
910 // Perform the unsigned multiplication on absolute values.
911 using U = std::make_unsigned_t<T>;
912 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
913 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
914 const U UResult = UX * UY;
915
916 // Convert to signed.
917 const bool IsNegative = (X < 0) ^ (Y < 0);
918 Result = IsNegative ? (0 - UResult) : UResult;
919
920 // If any of the args was 0, result is 0 and no overflow occurs.
921 if (UX == 0 || UY == 0)
922 return false;
923
924 // UX and UY are in [1, 2^n], where n is the number of digits.
925 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
926 // positive) divided by an argument compares to the other.
927 if (IsNegative)
928 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
929 else
930 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
931}
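// [Editorial sketch; not part of the original header.] Working with absolute
// values lets a single unsigned division against the relevant magnitude bound
// decide overflow without needing a wider type. exampleMulOverflow is a
// hypothetical helper used only for illustration.
inline void exampleMulOverflow() {
  int32_t R;
  assert(!MulOverflow<int32_t>(-4, 8, R) && R == -32);
  assert(MulOverflow<int32_t>(1 << 20, 1 << 20, R)); // 2^40 does not fit in 32 bits
}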
932
933} // End llvm namespace
934
935#endif