File: | llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h |
Warning: | line 237, column 16 The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | |||||||
9 | #include "ARMTargetTransformInfo.h" | ||||||
10 | #include "ARMSubtarget.h" | ||||||
11 | #include "MCTargetDesc/ARMAddressingModes.h" | ||||||
12 | #include "llvm/ADT/APInt.h" | ||||||
13 | #include "llvm/ADT/SmallVector.h" | ||||||
14 | #include "llvm/Analysis/LoopInfo.h" | ||||||
15 | #include "llvm/CodeGen/CostTable.h" | ||||||
16 | #include "llvm/CodeGen/ISDOpcodes.h" | ||||||
17 | #include "llvm/CodeGen/ValueTypes.h" | ||||||
18 | #include "llvm/IR/BasicBlock.h" | ||||||
19 | #include "llvm/IR/DataLayout.h" | ||||||
20 | #include "llvm/IR/DerivedTypes.h" | ||||||
21 | #include "llvm/IR/Instruction.h" | ||||||
22 | #include "llvm/IR/Instructions.h" | ||||||
23 | #include "llvm/IR/Intrinsics.h" | ||||||
24 | #include "llvm/IR/IntrinsicInst.h" | ||||||
25 | #include "llvm/IR/IntrinsicsARM.h" | ||||||
26 | #include "llvm/IR/PatternMatch.h" | ||||||
27 | #include "llvm/IR/Type.h" | ||||||
28 | #include "llvm/MC/SubtargetFeature.h" | ||||||
29 | #include "llvm/Support/Casting.h" | ||||||
30 | #include "llvm/Support/KnownBits.h" | ||||||
31 | #include "llvm/Support/MachineValueType.h" | ||||||
32 | #include "llvm/Target/TargetMachine.h" | ||||||
33 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | ||||||
34 | #include "llvm/Transforms/Utils/Local.h" | ||||||
35 | #include "llvm/Transforms/Utils/LoopUtils.h" | ||||||
36 | #include <algorithm> | ||||||
37 | #include <cassert> | ||||||
38 | #include <cstdint> | ||||||
39 | #include <utility> | ||||||
40 | |||||||
41 | using namespace llvm; | ||||||
42 | |||||||
// Debug-output category for this file; the exported source had the macro
// garbled (name fused with its value and the value duplicated).
#define DEBUG_TYPE "armtti"
44 | |||||||
// Command-line switch: allow the target to claim support for masked
// loads/stores (on by default).
static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

// Command-line switch: globally disable low-overhead (LOL) loop formation.
static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

// Command-line switch: allow while-loop-start (WLS) loop generation.
static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

// Options defined in other ARM backend files and consulted here.
extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 | |||||||
63 | /// Convert a vector load intrinsic into a simple llvm load instruction. | ||||||
64 | /// This is beneficial when the underlying object being addressed comes | ||||||
65 | /// from a constant, since we get constant-folding for free. | ||||||
66 | static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, | ||||||
67 | InstCombiner::BuilderTy &Builder) { | ||||||
68 | auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); | ||||||
69 | |||||||
70 | if (!IntrAlign) | ||||||
71 | return nullptr; | ||||||
72 | |||||||
73 | unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign | ||||||
74 | ? MemAlign | ||||||
75 | : IntrAlign->getLimitedValue(); | ||||||
76 | |||||||
77 | if (!isPowerOf2_32(Alignment)) | ||||||
78 | return nullptr; | ||||||
79 | |||||||
80 | auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), | ||||||
81 | PointerType::get(II.getType(), 0)); | ||||||
82 | return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); | ||||||
83 | } | ||||||
84 | |||||||
85 | bool ARMTTIImpl::areInlineCompatible(const Function *Caller, | ||||||
86 | const Function *Callee) const { | ||||||
87 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
88 | const FeatureBitset &CallerBits = | ||||||
89 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | ||||||
90 | const FeatureBitset &CalleeBits = | ||||||
91 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | ||||||
92 | |||||||
93 | // To inline a callee, all features not in the allowed list must match exactly. | ||||||
94 | bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == | ||||||
95 | (CalleeBits & ~InlineFeaturesAllowed); | ||||||
96 | // For features in the allowed list, the callee's features must be a subset of | ||||||
97 | // the callers'. | ||||||
98 | bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == | ||||||
99 | (CalleeBits & InlineFeaturesAllowed); | ||||||
100 | return MatchExact && MatchSubset; | ||||||
101 | } | ||||||
102 | |||||||
103 | TTI::AddressingModeKind | ||||||
104 | ARMTTIImpl::getPreferredAddressingMode(const Loop *L, | ||||||
105 | ScalarEvolution *SE) const { | ||||||
106 | if (ST->hasMVEIntegerOps()) | ||||||
107 | return TTI::AMK_PostIndexed; | ||||||
108 | |||||||
109 | if (L->getHeader()->getParent()->hasOptSize()) | ||||||
110 | return TTI::AMK_None; | ||||||
111 | |||||||
112 | if (ST->isMClass() && ST->isThumb2() && | ||||||
113 | L->getNumBlocks() == 1) | ||||||
114 | return TTI::AMK_PreIndexed; | ||||||
115 | |||||||
116 | return TTI::AMK_None; | ||||||
117 | } | ||||||
118 | |||||||
119 | Optional<Instruction *> | ||||||
120 | ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { | ||||||
121 | using namespace PatternMatch; | ||||||
122 | Intrinsic::ID IID = II.getIntrinsicID(); | ||||||
123 | switch (IID) { | ||||||
124 | default: | ||||||
125 | break; | ||||||
126 | case Intrinsic::arm_neon_vld1: { | ||||||
127 | Align MemAlign = | ||||||
128 | getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, | ||||||
129 | &IC.getAssumptionCache(), &IC.getDominatorTree()); | ||||||
130 | if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { | ||||||
131 | return IC.replaceInstUsesWith(II, V); | ||||||
132 | } | ||||||
133 | break; | ||||||
134 | } | ||||||
135 | |||||||
136 | case Intrinsic::arm_neon_vld2: | ||||||
137 | case Intrinsic::arm_neon_vld3: | ||||||
138 | case Intrinsic::arm_neon_vld4: | ||||||
139 | case Intrinsic::arm_neon_vld2lane: | ||||||
140 | case Intrinsic::arm_neon_vld3lane: | ||||||
141 | case Intrinsic::arm_neon_vld4lane: | ||||||
142 | case Intrinsic::arm_neon_vst1: | ||||||
143 | case Intrinsic::arm_neon_vst2: | ||||||
144 | case Intrinsic::arm_neon_vst3: | ||||||
145 | case Intrinsic::arm_neon_vst4: | ||||||
146 | case Intrinsic::arm_neon_vst2lane: | ||||||
147 | case Intrinsic::arm_neon_vst3lane: | ||||||
148 | case Intrinsic::arm_neon_vst4lane: { | ||||||
149 | Align MemAlign = | ||||||
150 | getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, | ||||||
151 | &IC.getAssumptionCache(), &IC.getDominatorTree()); | ||||||
152 | unsigned AlignArg = II.arg_size() - 1; | ||||||
153 | Value *AlignArgOp = II.getArgOperand(AlignArg); | ||||||
154 | MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); | ||||||
155 | if (Align && *Align < MemAlign) { | ||||||
156 | return IC.replaceOperand( | ||||||
157 | II, AlignArg, | ||||||
158 | ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), | ||||||
159 | false)); | ||||||
160 | } | ||||||
161 | break; | ||||||
162 | } | ||||||
163 | |||||||
164 | case Intrinsic::arm_mve_pred_i2v: { | ||||||
165 | Value *Arg = II.getArgOperand(0); | ||||||
166 | Value *ArgArg; | ||||||
167 | if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( | ||||||
168 | PatternMatch::m_Value(ArgArg))) && | ||||||
169 | II.getType() == ArgArg->getType()) { | ||||||
170 | return IC.replaceInstUsesWith(II, ArgArg); | ||||||
171 | } | ||||||
172 | Constant *XorMask; | ||||||
173 | if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( | ||||||
174 | PatternMatch::m_Value(ArgArg)), | ||||||
175 | PatternMatch::m_Constant(XorMask))) && | ||||||
176 | II.getType() == ArgArg->getType()) { | ||||||
177 | if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { | ||||||
178 | if (CI->getValue().trunc(16).isAllOnes()) { | ||||||
179 | auto TrueVector = IC.Builder.CreateVectorSplat( | ||||||
180 | cast<FixedVectorType>(II.getType())->getNumElements(), | ||||||
181 | IC.Builder.getTrue()); | ||||||
182 | return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); | ||||||
183 | } | ||||||
184 | } | ||||||
185 | } | ||||||
186 | KnownBits ScalarKnown(32); | ||||||
187 | if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), | ||||||
188 | ScalarKnown, 0)) { | ||||||
189 | return &II; | ||||||
190 | } | ||||||
191 | break; | ||||||
192 | } | ||||||
193 | case Intrinsic::arm_mve_pred_v2i: { | ||||||
194 | Value *Arg = II.getArgOperand(0); | ||||||
195 | Value *ArgArg; | ||||||
196 | if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( | ||||||
197 | PatternMatch::m_Value(ArgArg)))) { | ||||||
198 | return IC.replaceInstUsesWith(II, ArgArg); | ||||||
199 | } | ||||||
200 | if (!II.getMetadata(LLVMContext::MD_range)) { | ||||||
201 | Type *IntTy32 = Type::getInt32Ty(II.getContext()); | ||||||
202 | Metadata *M[] = { | ||||||
203 | ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), | ||||||
204 | ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))}; | ||||||
205 | II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); | ||||||
206 | return &II; | ||||||
207 | } | ||||||
208 | break; | ||||||
209 | } | ||||||
210 | case Intrinsic::arm_mve_vadc: | ||||||
211 | case Intrinsic::arm_mve_vadc_predicated: { | ||||||
212 | unsigned CarryOp = | ||||||
213 | (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; | ||||||
214 | assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&(static_cast <bool> (II.getArgOperand(CarryOp)->getType ()->getScalarSizeInBits() == 32 && "Bad type for intrinsic!" ) ? void (0) : __assert_fail ("II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && \"Bad type for intrinsic!\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 215, __extension__ __PRETTY_FUNCTION__)) | ||||||
215 | "Bad type for intrinsic!")(static_cast <bool> (II.getArgOperand(CarryOp)->getType ()->getScalarSizeInBits() == 32 && "Bad type for intrinsic!" ) ? void (0) : __assert_fail ("II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && \"Bad type for intrinsic!\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 215, __extension__ __PRETTY_FUNCTION__)); | ||||||
216 | |||||||
217 | KnownBits CarryKnown(32); | ||||||
218 | if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), | ||||||
219 | CarryKnown)) { | ||||||
220 | return &II; | ||||||
221 | } | ||||||
222 | break; | ||||||
223 | } | ||||||
224 | case Intrinsic::arm_mve_vmldava: { | ||||||
225 | Instruction *I = cast<Instruction>(&II); | ||||||
226 | if (I->hasOneUse()) { | ||||||
227 | auto *User = cast<Instruction>(*I->user_begin()); | ||||||
228 | Value *OpZ; | ||||||
229 | if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && | ||||||
230 | match(I->getOperand(3), m_Zero())) { | ||||||
231 | Value *OpX = I->getOperand(4); | ||||||
232 | Value *OpY = I->getOperand(5); | ||||||
233 | Type *OpTy = OpX->getType(); | ||||||
234 | |||||||
235 | IC.Builder.SetInsertPoint(User); | ||||||
236 | Value *V = | ||||||
237 | IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, | ||||||
238 | {I->getOperand(0), I->getOperand(1), | ||||||
239 | I->getOperand(2), OpZ, OpX, OpY}); | ||||||
240 | |||||||
241 | IC.replaceInstUsesWith(*User, V); | ||||||
242 | return IC.eraseInstFromFunction(*User); | ||||||
243 | } | ||||||
244 | } | ||||||
245 | return None; | ||||||
246 | } | ||||||
247 | } | ||||||
248 | return None; | ||||||
249 | } | ||||||
250 | |||||||
/// Target hook: for narrowing MVE "top/bottom" intrinsics, only every other
/// lane of operand 0 is demanded; shrink the demanded-elements mask and the
/// undef mask accordingly. Returns None (no full replacement is produced).
Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
  // opcode specifying a Top/Bottom instruction, which can change between
  // instructions.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // The only odd/even lanes of operand 0 will only be demanded depending
    // on whether this is a top/bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
                                                 : APInt::getHighBitsSet(2, 1));
    return None;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    // Top/bottom flag is operand 2 for vcvt_narrow.
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    // Top/bottom flag is operand 4 for vqmovn.
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    // Top/bottom flag is operand 7 for vshrn.
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return None;
}
292 | |||||||
293 | InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | ||||||
294 | TTI::TargetCostKind CostKind) { | ||||||
295 | assert(Ty->isIntegerTy())(static_cast <bool> (Ty->isIntegerTy()) ? void (0) : __assert_fail ("Ty->isIntegerTy()", "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 295, __extension__ __PRETTY_FUNCTION__)); | ||||||
296 | |||||||
297 | unsigned Bits = Ty->getPrimitiveSizeInBits(); | ||||||
298 | if (Bits == 0 || Imm.getActiveBits() >= 64) | ||||||
299 | return 4; | ||||||
300 | |||||||
301 | int64_t SImmVal = Imm.getSExtValue(); | ||||||
302 | uint64_t ZImmVal = Imm.getZExtValue(); | ||||||
303 | if (!ST->isThumb()) { | ||||||
304 | if ((SImmVal >= 0 && SImmVal < 65536) || | ||||||
305 | (ARM_AM::getSOImmVal(ZImmVal) != -1) || | ||||||
306 | (ARM_AM::getSOImmVal(~ZImmVal) != -1)) | ||||||
307 | return 1; | ||||||
308 | return ST->hasV6T2Ops() ? 2 : 3; | ||||||
309 | } | ||||||
310 | if (ST->isThumb2()) { | ||||||
311 | if ((SImmVal >= 0 && SImmVal < 65536) || | ||||||
312 | (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || | ||||||
313 | (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) | ||||||
314 | return 1; | ||||||
315 | return ST->hasV6T2Ops() ? 2 : 3; | ||||||
316 | } | ||||||
317 | // Thumb1, any i8 imm cost 1. | ||||||
318 | if (Bits == 8 || (SImmVal
| ||||||
319 | return 1; | ||||||
320 | if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) | ||||||
321 | return 2; | ||||||
322 | // Load from constantpool. | ||||||
323 | return 3; | ||||||
324 | } | ||||||
325 | |||||||
326 | // Constants smaller than 256 fit in the immediate field of | ||||||
327 | // Thumb1 instructions so we return a zero cost and 1 otherwise. | ||||||
328 | InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, | ||||||
329 | const APInt &Imm, Type *Ty) { | ||||||
330 | if (Imm.isNonNegative() && Imm.getLimitedValue() < 256) | ||||||
331 | return 0; | ||||||
332 | |||||||
333 | return 1; | ||||||
334 | } | ||||||
335 | |||||||
// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Imm is Inst's constant operand;
// the pattern requires Imm to be a negative power of two (-2^n, the lower
// saturation bound) with a matching smin bound of (2^n)-1.
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  // Require Inst to be smax(x, Imm) with Imm == -2^n.
  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {

    // True when MinInst is the matching smin(y, (-Imm)-1), i.e. the upper
    // bound of the signed saturation range.
    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    // The smin can be an operand of the smax (max(min(...))) or one of the
    // smax's (at most two) users (min(max(...))).
    if (isSSatMin(Inst->getOperand(1)) ||
        (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
                               isSSatMin(*(++Inst->user_begin())))))
      return true;
  }
  return false;
}
368 | |||||||
369 | InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | ||||||
370 | const APInt &Imm, Type *Ty, | ||||||
371 | TTI::TargetCostKind CostKind, | ||||||
372 | Instruction *Inst) { | ||||||
373 | // Division by a constant can be turned into multiplication, but only if we | ||||||
374 | // know it's constant. So it's not so much that the immediate is cheap (it's | ||||||
375 | // not), but that the alternative is worse. | ||||||
376 | // FIXME: this is probably unneeded with GlobalISel. | ||||||
377 | if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || | ||||||
| |||||||
378 | Opcode == Instruction::SRem || Opcode == Instruction::URem) && | ||||||
379 | Idx == 1) | ||||||
380 | return 0; | ||||||
381 | |||||||
382 | // Leave any gep offsets for the CodeGenPrepare, which will do a better job at | ||||||
383 | // splitting any large offsets. | ||||||
384 | if (Opcode == Instruction::GetElementPtr && Idx != 0) | ||||||
385 | return 0; | ||||||
386 | |||||||
387 | if (Opcode == Instruction::And) { | ||||||
388 | // UXTB/UXTH | ||||||
389 | if (Imm == 255 || Imm == 65535) | ||||||
390 | return 0; | ||||||
391 | // Conversion to BIC is free, and means we can use ~Imm instead. | ||||||
392 | return std::min(getIntImmCost(Imm, Ty, CostKind), | ||||||
393 | getIntImmCost(~Imm, Ty, CostKind)); | ||||||
394 | } | ||||||
395 | |||||||
396 | if (Opcode == Instruction::Add) | ||||||
397 | // Conversion to SUB is free, and means we can use -Imm instead. | ||||||
398 | return std::min(getIntImmCost(Imm, Ty, CostKind), | ||||||
399 | getIntImmCost(-Imm, Ty, CostKind)); | ||||||
400 | |||||||
401 | if (Opcode == Instruction::ICmp && Imm.isNegative() && | ||||||
402 | Ty->getIntegerBitWidth() == 32) { | ||||||
403 | int64_t NegImm = -Imm.getSExtValue(); | ||||||
404 | if (ST->isThumb2() && NegImm < 1<<12) | ||||||
405 | // icmp X, #-C -> cmn X, #C | ||||||
406 | return 0; | ||||||
407 | if (ST->isThumb() && NegImm < 1<<8) | ||||||
408 | // icmp X, #-C -> adds X, #C | ||||||
409 | return 0; | ||||||
410 | } | ||||||
411 | |||||||
412 | // xor a, -1 can always be folded to MVN | ||||||
413 | if (Opcode == Instruction::Xor && Imm.isAllOnes()) | ||||||
414 | return 0; | ||||||
415 | |||||||
416 | // Ensures negative constant of min(max()) or max(min()) patterns that | ||||||
417 | // match to SSAT instructions don't get hoisted | ||||||
418 | if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && | ||||||
419 | Ty->getIntegerBitWidth() <= 32) { | ||||||
420 | if (isSSATMinMaxPattern(Inst, Imm) || | ||||||
421 | (isa<ICmpInst>(Inst) && Inst->hasOneUse() && | ||||||
422 | isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) | ||||||
423 | return 0; | ||||||
424 | } | ||||||
425 | |||||||
426 | // We can convert <= -1 to < 0, which is generally quite cheap. | ||||||
427 | if (Inst
| ||||||
428 | ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate(); | ||||||
429 | if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) | ||||||
430 | return std::min(getIntImmCost(Imm, Ty, CostKind), | ||||||
431 | getIntImmCost(Imm + 1, Ty, CostKind)); | ||||||
432 | } | ||||||
433 | |||||||
434 | return getIntImmCost(Imm, Ty, CostKind); | ||||||
435 | } | ||||||
436 | |||||||
437 | InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode, | ||||||
438 | TTI::TargetCostKind CostKind, | ||||||
439 | const Instruction *I) { | ||||||
440 | if (CostKind == TTI::TCK_RecipThroughput && | ||||||
441 | (ST->hasNEON() || ST->hasMVEIntegerOps())) { | ||||||
442 | // FIXME: The vectorizer is highly sensistive to the cost of these | ||||||
443 | // instructions, which suggests that it may be using the costs incorrectly. | ||||||
444 | // But, for now, just make them free to avoid performance regressions for | ||||||
445 | // vector targets. | ||||||
446 | return 0; | ||||||
447 | } | ||||||
448 | return BaseT::getCFInstrCost(Opcode, CostKind, I); | ||||||
449 | } | ||||||
450 | |||||||
451 | InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | ||||||
452 | Type *Src, | ||||||
453 | TTI::CastContextHint CCH, | ||||||
454 | TTI::TargetCostKind CostKind, | ||||||
455 | const Instruction *I) { | ||||||
456 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
457 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 457, __extension__ __PRETTY_FUNCTION__)); | ||||||
458 | |||||||
459 | // TODO: Allow non-throughput costs that aren't binary. | ||||||
460 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | ||||||
461 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
462 | return Cost == 0 ? 0 : 1; | ||||||
463 | return Cost; | ||||||
464 | }; | ||||||
465 | auto IsLegalFPType = [this](EVT VT) { | ||||||
466 | EVT EltVT = VT.getScalarType(); | ||||||
467 | return (EltVT == MVT::f32 && ST->hasVFP2Base()) || | ||||||
468 | (EltVT == MVT::f64 && ST->hasFP64()) || | ||||||
469 | (EltVT == MVT::f16 && ST->hasFullFP16()); | ||||||
470 | }; | ||||||
471 | |||||||
472 | EVT SrcTy = TLI->getValueType(DL, Src); | ||||||
473 | EVT DstTy = TLI->getValueType(DL, Dst); | ||||||
474 | |||||||
475 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | ||||||
476 | return AdjustCost( | ||||||
477 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | ||||||
478 | |||||||
479 | // Extending masked load/Truncating masked stores is expensive because we | ||||||
480 | // currently don't split them. This means that we'll likely end up | ||||||
481 | // loading/storing each element individually (hence the high cost). | ||||||
482 | if ((ST->hasMVEIntegerOps() && | ||||||
483 | (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || | ||||||
484 | Opcode == Instruction::SExt)) || | ||||||
485 | (ST->hasMVEFloatOps() && | ||||||
486 | (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && | ||||||
487 | IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) | ||||||
488 | if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) | ||||||
489 | return 2 * DstTy.getVectorNumElements() * | ||||||
490 | ST->getMVEVectorCostFactor(CostKind); | ||||||
491 | |||||||
492 | // The extend of other kinds of load is free | ||||||
493 | if (CCH == TTI::CastContextHint::Normal || | ||||||
494 | CCH == TTI::CastContextHint::Masked) { | ||||||
495 | static const TypeConversionCostTblEntry LoadConversionTbl[] = { | ||||||
496 | {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, | ||||||
497 | {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, | ||||||
498 | {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, | ||||||
499 | {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, | ||||||
500 | {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, | ||||||
501 | {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, | ||||||
502 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, | ||||||
503 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, | ||||||
504 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, | ||||||
505 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, | ||||||
506 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, | ||||||
507 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, | ||||||
508 | }; | ||||||
509 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
510 | LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | ||||||
511 | return AdjustCost(Entry->Cost); | ||||||
512 | |||||||
513 | static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { | ||||||
514 | {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, | ||||||
515 | {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, | ||||||
516 | {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, | ||||||
517 | {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, | ||||||
518 | {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, | ||||||
519 | {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, | ||||||
520 | // The following extend from a legal type to an illegal type, so need to | ||||||
521 | // split the load. This introduced an extra load operation, but the | ||||||
522 | // extend is still "free". | ||||||
523 | {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, | ||||||
524 | {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, | ||||||
525 | {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, | ||||||
526 | {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, | ||||||
527 | {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, | ||||||
528 | {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, | ||||||
529 | }; | ||||||
530 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | ||||||
531 | if (const auto *Entry = | ||||||
532 | ConvertCostTableLookup(MVELoadConversionTbl, ISD, | ||||||
533 | DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | ||||||
534 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | ||||||
535 | } | ||||||
536 | |||||||
537 | static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { | ||||||
538 | // FPExtends are similar but also require the VCVT instructions. | ||||||
539 | {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, | ||||||
540 | {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, | ||||||
541 | }; | ||||||
542 | if (SrcTy.isVector() && ST->hasMVEFloatOps()) { | ||||||
543 | if (const auto *Entry = | ||||||
544 | ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, | ||||||
545 | DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | ||||||
546 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | ||||||
547 | } | ||||||
548 | |||||||
549 | // The truncate of a store is free. This is the mirror of extends above. | ||||||
550 | static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { | ||||||
551 | {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, | ||||||
552 | {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, | ||||||
553 | {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, | ||||||
554 | {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, | ||||||
555 | {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, | ||||||
556 | {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, | ||||||
557 | {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, | ||||||
558 | }; | ||||||
559 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | ||||||
560 | if (const auto *Entry = | ||||||
561 | ConvertCostTableLookup(MVEStoreConversionTbl, ISD, | ||||||
562 | SrcTy.getSimpleVT(), DstTy.getSimpleVT())) | ||||||
563 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | ||||||
564 | } | ||||||
565 | |||||||
566 | static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { | ||||||
567 | {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, | ||||||
568 | {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, | ||||||
569 | }; | ||||||
570 | if (SrcTy.isVector() && ST->hasMVEFloatOps()) { | ||||||
571 | if (const auto *Entry = | ||||||
572 | ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, | ||||||
573 | SrcTy.getSimpleVT(), DstTy.getSimpleVT())) | ||||||
574 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | ||||||
575 | } | ||||||
576 | } | ||||||
577 | |||||||
578 | // NEON vector operations that can extend their inputs. | ||||||
579 | if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && | ||||||
580 | I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { | ||||||
581 | static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { | ||||||
582 | // vaddl | ||||||
583 | { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, | ||||||
584 | { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, | ||||||
585 | // vsubl | ||||||
586 | { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, | ||||||
587 | { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, | ||||||
588 | // vmull | ||||||
589 | { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, | ||||||
590 | { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, | ||||||
591 | // vshll | ||||||
592 | { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, | ||||||
593 | { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, | ||||||
594 | }; | ||||||
595 | |||||||
596 | auto *User = cast<Instruction>(*I->user_begin()); | ||||||
597 | int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); | ||||||
598 | if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, | ||||||
599 | DstTy.getSimpleVT(), | ||||||
600 | SrcTy.getSimpleVT())) { | ||||||
601 | return AdjustCost(Entry->Cost); | ||||||
602 | } | ||||||
603 | } | ||||||
604 | |||||||
605 | // Single to/from double precision conversions. | ||||||
606 | if (Src->isVectorTy() && ST->hasNEON() && | ||||||
607 | ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && | ||||||
608 | DstTy.getScalarType() == MVT::f32) || | ||||||
609 | (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && | ||||||
610 | DstTy.getScalarType() == MVT::f64))) { | ||||||
611 | static const CostTblEntry NEONFltDblTbl[] = { | ||||||
612 | // Vector fptrunc/fpext conversions. | ||||||
613 | {ISD::FP_ROUND, MVT::v2f64, 2}, | ||||||
614 | {ISD::FP_EXTEND, MVT::v2f32, 2}, | ||||||
615 | {ISD::FP_EXTEND, MVT::v4f32, 4}}; | ||||||
616 | |||||||
617 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | ||||||
618 | if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) | ||||||
619 | return AdjustCost(LT.first * Entry->Cost); | ||||||
620 | } | ||||||
621 | |||||||
622 | // Some arithmetic, load and store operations have specific instructions | ||||||
623 | // to cast up/down their types automatically at no extra cost. | ||||||
624 | // TODO: Get these tables to know at least what the related operations are. | ||||||
625 | static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { | ||||||
626 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | ||||||
627 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | ||||||
628 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, | ||||||
629 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, | ||||||
630 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, | ||||||
631 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | ||||||
632 | |||||||
633 | // The number of vmovl instructions for the extension. | ||||||
634 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | ||||||
635 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | ||||||
636 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | ||||||
637 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | ||||||
638 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, | ||||||
639 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, | ||||||
640 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | ||||||
641 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | ||||||
642 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | ||||||
643 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | ||||||
644 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | ||||||
645 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | ||||||
646 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | ||||||
647 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | ||||||
648 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | ||||||
649 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | ||||||
650 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | ||||||
651 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | ||||||
652 | |||||||
653 | // Operations that we legalize using splitting. | ||||||
654 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | ||||||
655 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | ||||||
656 | |||||||
657 | // Vector float <-> i32 conversions. | ||||||
658 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
659 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
660 | |||||||
661 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | ||||||
662 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | ||||||
663 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, | ||||||
664 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, | ||||||
665 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | ||||||
666 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | ||||||
667 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | ||||||
668 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | ||||||
669 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | ||||||
670 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | ||||||
671 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | ||||||
672 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | ||||||
673 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
674 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
675 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
676 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
677 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, | ||||||
678 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, | ||||||
679 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, | ||||||
680 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, | ||||||
681 | |||||||
682 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
683 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
684 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, | ||||||
685 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, | ||||||
686 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | ||||||
687 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | ||||||
688 | |||||||
689 | // Vector double <-> i32 conversions. | ||||||
690 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | ||||||
691 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | ||||||
692 | |||||||
693 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | ||||||
694 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | ||||||
695 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, | ||||||
696 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, | ||||||
697 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | ||||||
698 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | ||||||
699 | |||||||
700 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | ||||||
701 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | ||||||
702 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, | ||||||
703 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, | ||||||
704 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, | ||||||
705 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } | ||||||
706 | }; | ||||||
707 | |||||||
708 | if (SrcTy.isVector() && ST->hasNEON()) { | ||||||
709 | if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, | ||||||
710 | DstTy.getSimpleVT(), | ||||||
711 | SrcTy.getSimpleVT())) | ||||||
712 | return AdjustCost(Entry->Cost); | ||||||
713 | } | ||||||
714 | |||||||
715 | // Scalar float to integer conversions. | ||||||
716 | static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { | ||||||
717 | { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, | ||||||
718 | { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, | ||||||
719 | { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, | ||||||
720 | { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, | ||||||
721 | { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, | ||||||
722 | { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, | ||||||
723 | { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, | ||||||
724 | { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, | ||||||
725 | { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, | ||||||
726 | { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, | ||||||
727 | { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, | ||||||
728 | { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, | ||||||
729 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, | ||||||
730 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, | ||||||
731 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, | ||||||
732 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, | ||||||
733 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, | ||||||
734 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, | ||||||
735 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, | ||||||
736 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } | ||||||
737 | }; | ||||||
738 | if (SrcTy.isFloatingPoint() && ST->hasNEON()) { | ||||||
739 | if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, | ||||||
740 | DstTy.getSimpleVT(), | ||||||
741 | SrcTy.getSimpleVT())) | ||||||
742 | return AdjustCost(Entry->Cost); | ||||||
743 | } | ||||||
744 | |||||||
745 | // Scalar integer to float conversions. | ||||||
746 | static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { | ||||||
747 | { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, | ||||||
748 | { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, | ||||||
749 | { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, | ||||||
750 | { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, | ||||||
751 | { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, | ||||||
752 | { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, | ||||||
753 | { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, | ||||||
754 | { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, | ||||||
755 | { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, | ||||||
756 | { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, | ||||||
757 | { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, | ||||||
758 | { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, | ||||||
759 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, | ||||||
760 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, | ||||||
761 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, | ||||||
762 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, | ||||||
763 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, | ||||||
764 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, | ||||||
765 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, | ||||||
766 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } | ||||||
767 | }; | ||||||
768 | |||||||
769 | if (SrcTy.isInteger() && ST->hasNEON()) { | ||||||
770 | if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, | ||||||
771 | ISD, DstTy.getSimpleVT(), | ||||||
772 | SrcTy.getSimpleVT())) | ||||||
773 | return AdjustCost(Entry->Cost); | ||||||
774 | } | ||||||
775 | |||||||
776 | // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one | ||||||
777 | // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext | ||||||
778 | // are linearised so take more. | ||||||
779 | static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { | ||||||
780 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | ||||||
781 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | ||||||
782 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | ||||||
783 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | ||||||
784 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, | ||||||
785 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, | ||||||
786 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | ||||||
787 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | ||||||
788 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, | ||||||
789 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | ||||||
790 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, | ||||||
791 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, | ||||||
792 | }; | ||||||
793 | |||||||
794 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | ||||||
795 | if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, | ||||||
796 | ISD, DstTy.getSimpleVT(), | ||||||
797 | SrcTy.getSimpleVT())) | ||||||
798 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | ||||||
799 | } | ||||||
800 | |||||||
801 | if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) { | ||||||
802 | // As general rule, fp converts that were not matched above are scalarized | ||||||
803 | // and cost 1 vcvt for each lane, so long as the instruction is available. | ||||||
804 | // If not it will become a series of function calls. | ||||||
805 | const InstructionCost CallCost = | ||||||
806 | getCallInstrCost(nullptr, Dst, {Src}, CostKind); | ||||||
807 | int Lanes = 1; | ||||||
808 | if (SrcTy.isFixedLengthVector()) | ||||||
809 | Lanes = SrcTy.getVectorNumElements(); | ||||||
810 | |||||||
811 | if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) | ||||||
812 | return Lanes; | ||||||
813 | else | ||||||
814 | return Lanes * CallCost; | ||||||
815 | } | ||||||
816 | |||||||
817 | if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && | ||||||
818 | SrcTy.isFixedLengthVector()) { | ||||||
819 | // Treat a truncate with larger than legal source (128bits for MVE) as | ||||||
820 | // expensive, 2 instructions per lane. | ||||||
821 | if ((SrcTy.getScalarType() == MVT::i8 || | ||||||
822 | SrcTy.getScalarType() == MVT::i16 || | ||||||
823 | SrcTy.getScalarType() == MVT::i32) && | ||||||
824 | SrcTy.getSizeInBits() > 128 && | ||||||
825 | SrcTy.getSizeInBits() > DstTy.getSizeInBits()) | ||||||
826 | return SrcTy.getVectorNumElements() * 2; | ||||||
827 | } | ||||||
828 | |||||||
829 | // Scalar integer conversion costs. | ||||||
830 | static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { | ||||||
831 | // i16 -> i64 requires two dependent operations. | ||||||
832 | { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, | ||||||
833 | |||||||
834 | // Truncates on i64 are assumed to be free. | ||||||
835 | { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, | ||||||
836 | { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, | ||||||
837 | { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, | ||||||
838 | { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } | ||||||
839 | }; | ||||||
840 | |||||||
841 | if (SrcTy.isInteger()) { | ||||||
842 | if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, | ||||||
843 | DstTy.getSimpleVT(), | ||||||
844 | SrcTy.getSimpleVT())) | ||||||
845 | return AdjustCost(Entry->Cost); | ||||||
846 | } | ||||||
847 | |||||||
848 | int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() | ||||||
849 | ? ST->getMVEVectorCostFactor(CostKind) | ||||||
850 | : 1; | ||||||
851 | return AdjustCost( | ||||||
852 | BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | ||||||
853 | } | ||||||
854 | |||||||
855 | InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, | ||||||
856 | unsigned Index) { | ||||||
857 | // Penalize inserting into an D-subregister. We end up with a three times | ||||||
858 | // lower estimated throughput on swift. | ||||||
859 | if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && | ||||||
860 | ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) | ||||||
861 | return 3; | ||||||
862 | |||||||
863 | if (ST->hasNEON() && (Opcode == Instruction::InsertElement || | ||||||
864 | Opcode == Instruction::ExtractElement)) { | ||||||
865 | // Cross-class copies are expensive on many microarchitectures, | ||||||
866 | // so assume they are expensive by default. | ||||||
867 | if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) | ||||||
868 | return 3; | ||||||
869 | |||||||
870 | // Even if it's not a cross class copy, this likely leads to mixing | ||||||
871 | // of NEON and VFP code and should be therefore penalized. | ||||||
872 | if (ValTy->isVectorTy() && | ||||||
873 | ValTy->getScalarSizeInBits() <= 32) | ||||||
874 | return std::max<InstructionCost>( | ||||||
875 | BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); | ||||||
876 | } | ||||||
877 | |||||||
878 | if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || | ||||||
879 | Opcode == Instruction::ExtractElement)) { | ||||||
880 | // Integer cross-lane moves are more expensive than float, which can | ||||||
881 | // sometimes just be vmovs. Integer involve being passes to GPR registers, | ||||||
882 | // causing more of a delay. | ||||||
883 | std::pair<InstructionCost, MVT> LT = | ||||||
884 | getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType()); | ||||||
885 | return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1); | ||||||
886 | } | ||||||
887 | |||||||
888 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); | ||||||
889 | } | ||||||
890 | |||||||
// Cost model for compare and select instructions. Handles several ARM
// specific cases (Thumb scalar selects, min/max patterns, NEON vbsl, MVE
// vector compares) before deferring to the generic implementation.
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  // When costing the compare of a one-use cmp+select pair, look through to
  // the select so the pattern matcher below sees the whole idiom.
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    // Map the recognized select pattern onto the equivalent intrinsic ID.
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    // Otherwise assume one vbsl per legalized vector.
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
1036 | |||||||
1037 | InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, | ||||||
1038 | ScalarEvolution *SE, | ||||||
1039 | const SCEV *Ptr) { | ||||||
1040 | // Address computations in vectorized code with non-consecutive addresses will | ||||||
1041 | // likely result in more instructions compared to scalar code where the | ||||||
1042 | // computation can more often be merged into the index mode. The resulting | ||||||
1043 | // extra micro-ops can significantly decrease throughput. | ||||||
1044 | unsigned NumVectorInstToHideOverhead = 10; | ||||||
1045 | int MaxMergeDistance = 64; | ||||||
1046 | |||||||
1047 | if (ST->hasNEON()) { | ||||||
1048 | if (Ty->isVectorTy() && SE && | ||||||
1049 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | ||||||
1050 | return NumVectorInstToHideOverhead; | ||||||
1051 | |||||||
1052 | // In many cases the address computation is not merged into the instruction | ||||||
1053 | // addressing mode. | ||||||
1054 | return 1; | ||||||
1055 | } | ||||||
1056 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | ||||||
1057 | } | ||||||
1058 | |||||||
1059 | bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { | ||||||
1060 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { | ||||||
1061 | // If a VCTP is part of a chain, it's already profitable and shouldn't be | ||||||
1062 | // optimized, else LSR may block tail-predication. | ||||||
1063 | switch (II->getIntrinsicID()) { | ||||||
1064 | case Intrinsic::arm_mve_vctp8: | ||||||
1065 | case Intrinsic::arm_mve_vctp16: | ||||||
1066 | case Intrinsic::arm_mve_vctp32: | ||||||
1067 | case Intrinsic::arm_mve_vctp64: | ||||||
1068 | return true; | ||||||
1069 | default: | ||||||
1070 | break; | ||||||
1071 | } | ||||||
1072 | } | ||||||
1073 | return false; | ||||||
1074 | } | ||||||
1075 | |||||||
1076 | bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | ||||||
1077 | if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) | ||||||
1078 | return false; | ||||||
1079 | |||||||
1080 | if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { | ||||||
1081 | // Don't support v2i1 yet. | ||||||
1082 | if (VecTy->getNumElements() == 2) | ||||||
1083 | return false; | ||||||
1084 | |||||||
1085 | // We don't support extending fp types. | ||||||
1086 | unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); | ||||||
1087 | if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) | ||||||
1088 | return false; | ||||||
1089 | } | ||||||
1090 | |||||||
1091 | unsigned EltWidth = DataTy->getScalarSizeInBits(); | ||||||
1092 | return (EltWidth == 32 && Alignment >= 4) || | ||||||
1093 | (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); | ||||||
1094 | } | ||||||
1095 | |||||||
1096 | bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { | ||||||
1097 | if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) | ||||||
1098 | return false; | ||||||
1099 | |||||||
1100 | // This method is called in 2 places: | ||||||
1101 | // - from the vectorizer with a scalar type, in which case we need to get | ||||||
1102 | // this as good as we can with the limited info we have (and rely on the cost | ||||||
1103 | // model for the rest). | ||||||
1104 | // - from the masked intrinsic lowering pass with the actual vector type. | ||||||
1105 | // For MVE, we have a custom lowering pass that will already have custom | ||||||
1106 | // legalised any gathers that we can to MVE intrinsics, and want to expand all | ||||||
1107 | // the rest. The pass runs before the masked intrinsic lowering pass, so if we | ||||||
1108 | // are here, we know we want to expand. | ||||||
1109 | if (isa<VectorType>(Ty)) | ||||||
1110 | return false; | ||||||
1111 | |||||||
1112 | unsigned EltWidth = Ty->getScalarSizeInBits(); | ||||||
1113 | return ((EltWidth == 32 && Alignment >= 4) || | ||||||
1114 | (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); | ||||||
1115 | } | ||||||
1116 | |||||||
/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
/// call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  // ~0u marks the address spaces as "unset" until filled in below.
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  }
  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  }
  else
    llvm_unreachable("Expected a memcpy/move or memset!");

  // Factor accounts for each selected type needing both a load and a store
  // for memcpy/memmove; memset only stores, so its Factor is 1.
  unsigned Limit, Factor = 2;
  switch(I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that needs to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost
  return -1;
}
1185 | |||||||
1186 | InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { | ||||||
1187 | int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); | ||||||
1188 | |||||||
1189 | // To model the cost of a library call, we assume 1 for the call, and | ||||||
1190 | // 3 for the argument setup. | ||||||
1191 | if (NumOps == -1) | ||||||
1192 | return 4; | ||||||
1193 | return NumOps; | ||||||
1194 | } | ||||||
1195 | |||||||
// Shuffle cost model. Uses per-subtarget cost tables for the shuffle kinds
// that map directly onto single NEON/MVE instructions (vdup, vrev, vbsl/vext)
// and falls back to the generic implementation otherwise.
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  // The mask may describe a more specific (cheaper) shuffle than the caller's
  // classification; refine the kind before consulting the tables.
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions
          // required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    // A mask matching a vrev pattern of any supported lane width is a single
    // VREV instruction on MVE.
    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  // Otherwise scale the generic cost by the MVE "beats" factor for vectors.
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
1298 | |||||||
/// Compute the cost of a scalar or vector arithmetic instruction on ARM.
/// Applies ARM-specific heuristics in order: Thumb i1 predicate costs, NEON
/// division cost tables, shifts that fold into their consumer for free, and
/// MVE "beat" scaling, before deferring to the target-independent model.
InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
    // Make operations on i1 relatively expensive as this often involves
    // combining predicates. AND and XOR should be easier to handle with IT
    // blocks.
    switch (ISDOpcode) {
    default:
      break;
    case ISD::AND:
    case ISD::XOR:
      return 2;
    case ISD::OR:
      return 3;
    }
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      // v4i16/v8i8 div can use a reciprocal estimate sequence instead of a
      // per-lane libcall, hence the cheaper ReciprocalDivCost.
      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    // Thumb1 has no flexible second operand; vector shifts are real work.
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
  // without treating floats as more expensive that scalars or increasing the
  // costs for custom operations. The results is also multiplied by the
  // MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
    SmallVector<Type *> Tys(Args.size(), Ty);
    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
  }

  return BaseCost;
}
1440 | |||||||
1441 | InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | ||||||
1442 | MaybeAlign Alignment, | ||||||
1443 | unsigned AddressSpace, | ||||||
1444 | TTI::TargetCostKind CostKind, | ||||||
1445 | const Instruction *I) { | ||||||
1446 | // TODO: Handle other cost kinds. | ||||||
1447 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
1448 | return 1; | ||||||
1449 | |||||||
1450 | // Type legalization can't handle structs | ||||||
1451 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | ||||||
1452 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
1453 | CostKind); | ||||||
1454 | |||||||
1455 | if (ST->hasNEON() && Src->isVectorTy() && | ||||||
1456 | (Alignment && *Alignment != Align(16)) && | ||||||
1457 | cast<VectorType>(Src)->getElementType()->isDoubleTy()) { | ||||||
1458 | // Unaligned loads/stores are extremely inefficient. | ||||||
1459 | // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. | ||||||
1460 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | ||||||
1461 | return LT.first * 4; | ||||||
1462 | } | ||||||
1463 | |||||||
1464 | // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. | ||||||
1465 | // Same for stores. | ||||||
1466 | if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && | ||||||
1467 | ((Opcode == Instruction::Load && I->hasOneUse() && | ||||||
1468 | isa<FPExtInst>(*I->user_begin())) || | ||||||
1469 | (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { | ||||||
1470 | FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); | ||||||
1471 | Type *DstTy = | ||||||
1472 | Opcode == Instruction::Load | ||||||
1473 | ? (*I->user_begin())->getType() | ||||||
1474 | : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); | ||||||
1475 | if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && | ||||||
1476 | DstTy->getScalarType()->isFloatTy()) | ||||||
1477 | return ST->getMVEVectorCostFactor(CostKind); | ||||||
1478 | } | ||||||
1479 | |||||||
1480 | int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() | ||||||
1481 | ? ST->getMVEVectorCostFactor(CostKind) | ||||||
1482 | : 1; | ||||||
1483 | return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
1484 | CostKind, I); | ||||||
1485 | } | ||||||
1486 | |||||||
1487 | InstructionCost | ||||||
1488 | ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, | ||||||
1489 | unsigned AddressSpace, | ||||||
1490 | TTI::TargetCostKind CostKind) { | ||||||
1491 | if (ST->hasMVEIntegerOps()) { | ||||||
1492 | if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) | ||||||
1493 | return ST->getMVEVectorCostFactor(CostKind); | ||||||
1494 | if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) | ||||||
1495 | return ST->getMVEVectorCostFactor(CostKind); | ||||||
1496 | } | ||||||
1497 | if (!isa<FixedVectorType>(Src)) | ||||||
1498 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
1499 | CostKind); | ||||||
1500 | // Scalar cost, which is currently very high due to the efficiency of the | ||||||
1501 | // generated code. | ||||||
1502 | return cast<FixedVectorType>(Src)->getNumElements() * 8; | ||||||
1503 | } | ||||||
1504 | |||||||
1505 | InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( | ||||||
1506 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | ||||||
1507 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | ||||||
1508 | bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
1509 | assert(Factor >= 2 && "Invalid interleave factor")(static_cast <bool> (Factor >= 2 && "Invalid interleave factor" ) ? void (0) : __assert_fail ("Factor >= 2 && \"Invalid interleave factor\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 1509, __extension__ __PRETTY_FUNCTION__)); | ||||||
1510 | assert(isa<VectorType>(VecTy) && "Expect a vector type")(static_cast <bool> (isa<VectorType>(VecTy) && "Expect a vector type") ? void (0) : __assert_fail ("isa<VectorType>(VecTy) && \"Expect a vector type\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp" , 1510, __extension__ __PRETTY_FUNCTION__)); | ||||||
1511 | |||||||
1512 | // vldN/vstN doesn't support vector types of i64/f64 element. | ||||||
1513 | bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; | ||||||
1514 | |||||||
1515 | if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && | ||||||
1516 | !UseMaskForCond && !UseMaskForGaps) { | ||||||
1517 | unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); | ||||||
1518 | auto *SubVecTy = | ||||||
1519 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | ||||||
1520 | |||||||
1521 | // vldN/vstN only support legal vector types of size 64 or 128 in bits. | ||||||
1522 | // Accesses having vector types that are a multiple of 128 bits can be | ||||||
1523 | // matched to more than one vldN/vstN instruction. | ||||||
1524 | int BaseCost = | ||||||
1525 | ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1; | ||||||
1526 | if (NumElts % Factor == 0 && | ||||||
1527 | TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) | ||||||
1528 | return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL); | ||||||
1529 | |||||||
1530 | // Some smaller than legal interleaved patterns are cheap as we can make | ||||||
1531 | // use of the vmovn or vrev patterns to interleave a standard load. This is | ||||||
1532 | // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is | ||||||
1533 | // promoted differently). The cost of 2 here is then a load and vrev or | ||||||
1534 | // vmovn. | ||||||
1535 | if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && | ||||||
1536 | VecTy->isIntOrIntVectorTy() && | ||||||
1537 | DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) | ||||||
1538 | return 2 * BaseCost; | ||||||
1539 | } | ||||||
1540 | |||||||
1541 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
1542 | Alignment, AddressSpace, CostKind, | ||||||
1543 | UseMaskForCond, UseMaskForGaps); | ||||||
1544 | } | ||||||
1545 | |||||||
/// Cost of a gather (masked load from a vector of pointers) or scatter.
/// MVE gathers are modelled as effectively serialised lane loads; the
/// function chooses between that VectorCost and a much higher ScalarCost
/// based on element size, alignment, extend/truncate users and GEP shape.
InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  using namespace PatternMatch;
  // Only cost MVE gathers/scatters when the target has them and the option
  // to form them is enabled; otherwise defer to the generic model.
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
  InstructionCost VectorCost =
      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  InstructionCost ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
      BaseT::getScalarizationOverhead(VTy, false, true);

  // Sub-byte elements or accesses less aligned than the element size must be
  // scalarised.
  if (EltSize < 8 || Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type
  if (I != nullptr) {
    // Dependent of the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // only allow valid type combinations
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
      if (((EltSize == 16 && TypeSize == 32) ||
           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
          TypeSize * NumElems == 128)
        ExtSize = TypeSize;
    }
  }

  // Only full 128-bit accesses of at least 4 (extended) elements map onto an
  // MVE gather/scatter; everything else is scalarised.
  if (ExtSize * NumElems != 128 || NumElems < 4)
    return ScalarCost;

  // Any (aligned) i32 gather will not need to be scalarised.
  if (ExtSize == 32)
    return VectorCost;
  // For smaller types, we need to ensure that the gep's inputs are correctly
  // extended from a small enough value. Other sizes (including i64) are
  // scalarized for now.
  if (ExtSize != 8 && ExtSize != 16)
    return ScalarCost;

  // Look through a pointer bitcast to the underlying GEP, if any.
  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
    Ptr = BC->getOperand(0);
  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
    if (GEP->getNumOperands() != 2)
      return ScalarCost;
    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
    // Scale needs to be correct (which is only relevant for i16s).
    if (Scale != 1 && Scale * 8 != ExtSize)
      return ScalarCost;
    // And we need to zext (not sext) the indexes from a small enough type.
    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
        return VectorCost;
    }
    return ScalarCost;
  }
  return ScalarCost;
}
1644 | |||||||
1645 | InstructionCost | ||||||
1646 | ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | ||||||
1647 | Optional<FastMathFlags> FMF, | ||||||
1648 | TTI::TargetCostKind CostKind) { | ||||||
1649 | if (TTI::requiresOrderedReduction(FMF)) | ||||||
1650 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | ||||||
1651 | |||||||
1652 | EVT ValVT = TLI->getValueType(DL, ValTy); | ||||||
1653 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
1654 | if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) | ||||||
1655 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | ||||||
1656 | |||||||
1657 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
1658 | |||||||
1659 | static const CostTblEntry CostTblAdd[]{ | ||||||
1660 | {ISD::ADD, MVT::v16i8, 1}, | ||||||
1661 | {ISD::ADD, MVT::v8i16, 1}, | ||||||
1662 | {ISD::ADD, MVT::v4i32, 1}, | ||||||
1663 | }; | ||||||
1664 | if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) | ||||||
1665 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; | ||||||
1666 | |||||||
1667 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | ||||||
1668 | } | ||||||
1669 | |||||||
1670 | InstructionCost | ||||||
1671 | ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, | ||||||
1672 | Type *ResTy, VectorType *ValTy, | ||||||
1673 | TTI::TargetCostKind CostKind) { | ||||||
1674 | EVT ValVT = TLI->getValueType(DL, ValTy); | ||||||
1675 | EVT ResVT = TLI->getValueType(DL, ResTy); | ||||||
1676 | |||||||
1677 | if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { | ||||||
1678 | std::pair<InstructionCost, MVT> LT = | ||||||
1679 | TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
1680 | |||||||
1681 | // The legal cases are: | ||||||
1682 | // VADDV u/s 8/16/32 | ||||||
1683 | // VMLAV u/s 8/16/32 | ||||||
1684 | // VADDLV u/s 32 | ||||||
1685 | // VMLALV u/s 16/32 | ||||||
1686 | // Codegen currently cannot always handle larger than legal vectors very | ||||||
1687 | // well, especially for predicated reductions where the mask needs to be | ||||||
1688 | // split, so restrict to 128bit or smaller input types. | ||||||
1689 | unsigned RevVTSize = ResVT.getSizeInBits(); | ||||||
1690 | if (ValVT.getSizeInBits() <= 128 && | ||||||
1691 | ((LT.second == MVT::v16i8 && RevVTSize <= 32) || | ||||||
1692 | (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) || | ||||||
1693 | (LT.second == MVT::v4i32 && RevVTSize <= 64))) | ||||||
1694 | return ST->getMVEVectorCostFactor(CostKind) * LT.first; | ||||||
1695 | } | ||||||
1696 | |||||||
1697 | return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, | ||||||
1698 | CostKind); | ||||||
1699 | } | ||||||
1700 | |||||||
1701 | InstructionCost | ||||||
1702 | ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
1703 | TTI::TargetCostKind CostKind) { | ||||||
1704 | switch (ICA.getID()) { | ||||||
1705 | case Intrinsic::get_active_lane_mask: | ||||||
1706 | // Currently we make a somewhat optimistic assumption that | ||||||
1707 | // active_lane_mask's are always free. In reality it may be freely folded | ||||||
1708 | // into a tail predicated loop, expanded into a VCPT or expanded into a lot | ||||||
1709 | // of add/icmp code. We may need to improve this in the future, but being | ||||||
1710 | // able to detect if it is free or not involves looking at a lot of other | ||||||
1711 | // code. We currently assume that the vectorizer inserted these, and knew | ||||||
1712 | // what it was doing in adding one. | ||||||
1713 | if (ST->hasMVEIntegerOps()) | ||||||
1714 | return 0; | ||||||
1715 | break; | ||||||
1716 | case Intrinsic::sadd_sat: | ||||||
1717 | case Intrinsic::ssub_sat: | ||||||
1718 | case Intrinsic::uadd_sat: | ||||||
1719 | case Intrinsic::usub_sat: { | ||||||
1720 | if (!ST->hasMVEIntegerOps()) | ||||||
1721 | break; | ||||||
1722 | Type *VT = ICA.getReturnType(); | ||||||
1723 | |||||||
1724 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | ||||||
1725 | if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || | ||||||
1726 | LT.second == MVT::v16i8) { | ||||||
1727 | // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we | ||||||
1728 | // need to extend the type, as it uses shr(qadd(shl, shl)). | ||||||
1729 | unsigned Instrs = | ||||||
1730 | LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4; | ||||||
1731 | return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs; | ||||||
1732 | } | ||||||
1733 | break; | ||||||
1734 | } | ||||||
1735 | case Intrinsic::abs: | ||||||
1736 | case Intrinsic::smin: | ||||||
1737 | case Intrinsic::smax: | ||||||
1738 | case Intrinsic::umin: | ||||||
1739 | case Intrinsic::umax: { | ||||||
1740 | if (!ST->hasMVEIntegerOps()) | ||||||
1741 | break; | ||||||
1742 | Type *VT = ICA.getReturnType(); | ||||||
1743 | |||||||
1744 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | ||||||
1745 | if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || | ||||||
1746 | LT.second == MVT::v16i8) | ||||||
1747 | return LT.first * ST->getMVEVectorCostFactor(CostKind); | ||||||
1748 | break; | ||||||
1749 | } | ||||||
1750 | case Intrinsic::minnum: | ||||||
1751 | case Intrinsic::maxnum: { | ||||||
1752 | if (!ST->hasMVEFloatOps()) | ||||||
1753 | break; | ||||||
1754 | Type *VT = ICA.getReturnType(); | ||||||
1755 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | ||||||
1756 | if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) | ||||||
1757 | return LT.first * ST->getMVEVectorCostFactor(CostKind); | ||||||
1758 | break; | ||||||
1759 | } | ||||||
1760 | } | ||||||
1761 | |||||||
1762 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
1763 | } | ||||||
1764 | |||||||
1765 | bool ARMTTIImpl::isLoweredToCall(const Function *F) { | ||||||
1766 | if (!F->isIntrinsic()) | ||||||
1767 | BaseT::isLoweredToCall(F); | ||||||
1768 | |||||||
1769 | // Assume all Arm-specific intrinsics map to an instruction. | ||||||
1770 | if (F->getName().startswith("llvm.arm")) | ||||||
1771 | return false; | ||||||
1772 | |||||||
1773 | switch (F->getIntrinsicID()) { | ||||||
1774 | default: break; | ||||||
1775 | case Intrinsic::powi: | ||||||
1776 | case Intrinsic::sin: | ||||||
1777 | case Intrinsic::cos: | ||||||
1778 | case Intrinsic::pow: | ||||||
1779 | case Intrinsic::log: | ||||||
1780 | case Intrinsic::log10: | ||||||
1781 | case Intrinsic::log2: | ||||||
1782 | case Intrinsic::exp: | ||||||
1783 | case Intrinsic::exp2: | ||||||
1784 | return true; | ||||||
1785 | case Intrinsic::sqrt: | ||||||
1786 | case Intrinsic::fabs: | ||||||
1787 | case Intrinsic::copysign: | ||||||
1788 | case Intrinsic::floor: | ||||||
1789 | case Intrinsic::ceil: | ||||||
1790 | case Intrinsic::trunc: | ||||||
1791 | case Intrinsic::rint: | ||||||
1792 | case Intrinsic::nearbyint: | ||||||
1793 | case Intrinsic::round: | ||||||
1794 | case Intrinsic::canonicalize: | ||||||
1795 | case Intrinsic::lround: | ||||||
1796 | case Intrinsic::llround: | ||||||
1797 | case Intrinsic::lrint: | ||||||
1798 | case Intrinsic::llrint: | ||||||
1799 | if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) | ||||||
1800 | return true; | ||||||
1801 | if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) | ||||||
1802 | return true; | ||||||
1803 | // Some operations can be handled by vector instructions and assume | ||||||
1804 | // unsupported vectors will be expanded into supported scalar ones. | ||||||
1805 | // TODO Handle scalar operations properly. | ||||||
1806 | return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); | ||||||
1807 | case Intrinsic::masked_store: | ||||||
1808 | case Intrinsic::masked_load: | ||||||
1809 | case Intrinsic::masked_gather: | ||||||
1810 | case Intrinsic::masked_scatter: | ||||||
1811 | return !ST->hasMVEIntegerOps(); | ||||||
1812 | case Intrinsic::sadd_with_overflow: | ||||||
1813 | case Intrinsic::uadd_with_overflow: | ||||||
1814 | case Intrinsic::ssub_with_overflow: | ||||||
1815 | case Intrinsic::usub_with_overflow: | ||||||
1816 | case Intrinsic::sadd_sat: | ||||||
1817 | case Intrinsic::uadd_sat: | ||||||
1818 | case Intrinsic::ssub_sat: | ||||||
1819 | case Intrinsic::usub_sat: | ||||||
1820 | return false; | ||||||
1821 | } | ||||||
1822 | |||||||
1823 | return BaseT::isLoweredToCall(F); | ||||||
1824 | } | ||||||
1825 | |||||||
/// Conservatively decide whether instruction I might be lowered to a library
/// call during legalization. Checks, in order: the operation action, call
/// instructions, FP conversions, wide integer division, soft-float mode, and
/// FP precision the FPU cannot handle natively.
bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
  EVT VT = TLI->getValueType(DL, I.getType(), true);
  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
    return true;

  // Check if an intrinsic will be lowered to a call and assume that any
  // other CallInst will generate a bl.
  if (auto *Call = dyn_cast<CallInst>(&I)) {
    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
      switch(II->getIntrinsicID()) {
      case Intrinsic::memcpy:
      case Intrinsic::memset:
      case Intrinsic::memmove:
        // getNumMemOps returns -1 when the mem intrinsic cannot be expanded
        // inline (and would therefore become a call).
        return getNumMemOps(II) == -1;
      default:
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
    }
    // Non-intrinsic calls always produce a bl.
    return true;
  }

  // FPv5 provides conversions between integer, double-precision,
  // single-precision, and half-precision formats.
  switch (I.getOpcode()) {
  default:
    break;
  case Instruction::FPToSI:
  case Instruction::FPToUI:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::FPTrunc:
  case Instruction::FPExt:
    return !ST->hasFPARMv8Base();
  }

  // FIXME: Unfortunately the approach of checking the Operation Action does
  // not catch all cases of Legalization that use library calls. Our
  // Legalization step categorizes some transformations into library calls as
  // Custom, Expand or even Legal when doing type legalization. So for now
  // we have to special case for instance the SDIV of 64bit integers and the
  // use of floating point emulation.
  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
    switch (ISD) {
    default:
      break;
    case ISD::SDIV:
    case ISD::UDIV:
    case ISD::SREM:
    case ISD::UREM:
    case ISD::SDIVREM:
    case ISD::UDIVREM:
      // 64-bit (and wider) division/remainder is emulated via library calls.
      return true;
    }
  }

  // Assume all other non-float operations are supported.
  if (!VT.isFloatingPoint())
    return false;

  // We'll need a library call to handle most floats when using soft.
  if (TLI->useSoftFloat()) {
    switch (I.getOpcode()) {
    default:
      return true;
    // These don't perform FP arithmetic, only move/select FP values.
    case Instruction::Alloca:
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::Select:
    case Instruction::PHI:
      return false;
    }
  }

  // We'll need a libcall to perform double precision operations on a single
  // precision only FPU.
  if (I.getType()->isDoubleTy() && !ST->hasFP64())
    return true;

  // Likewise for half precision arithmetic.
  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
    return true;

  return false;
}
1912 | |||||||
/// Decide whether loop \p L can profitably become a v8.1-m low-overhead
/// hardware loop. On success, fills in \p HWLoopInfo (count type, decrement,
/// entry-test policy) for the HardwareLoops pass to consume.
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  // Low-overhead branches are only supported in the 'low-overhead branch'
  // extension of v8.1-m.
  if (!ST->hasLOB() || DisableLowOverheadLoops) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
    return false;
  }

  // The trip count must be computable up-front so it can be loaded into LR.
  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
    return false;
  }

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
    return false;
  }

  // Trip count = backedge-taken count + 1.
  const SCEV *TripCountSCEV =
    SE.getAddExpr(BackedgeTakenCount,
                  SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
    return false;
  }

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.

  // Detect hardware-loop intrinsics already present, i.e. the loop has
  // already been converted.
  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
      switch (Call->getIntrinsicID()) {
      default:
        break;
      case Intrinsic::start_loop_iterations:
      case Intrinsic::test_start_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
        return true;
      }
    }
    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call or if this loop is already a low-overhead loop or will become a tail
  // predicated loop.
  bool IsTailPredLoop = false;
  auto ScanLoop = [&](Loop *L) {
    for (auto *BB : L->getBlocks()) {
      for (auto &I : *BB) {
        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
            isa<InlineAsm>(I)) {
          LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
          return false;
        }
        // Remember whether this loop will be tail-predicated (VCTP /
        // active-lane-mask present), which suppresses the WLS entry test.
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          IsTailPredLoop |=
              II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
      }
    }
    return true;
  };

  // Visit inner loops.
  for (auto Inner : *L)
    if (!ScanLoop(Inner))
      return false;

  if (!ScanLoop(L))
    return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.IsNestingLegal = false;
  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}
2007 | |||||||
// Check whether a single instruction is compatible with MVE tail-predication.
// \p ICmpCount accumulates the number of icmp-like instructions seen so far;
// only one (the backedge compare of the single-block loop) is permitted.
// NOTE: this predicate mutates ICmpCount, so it must be called exactly once
// per instruction, in program order.
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow icmp's, and because we only look at single block loops,
  // we simply count the icmps, i.e. there should only be 1 for the backedge.
  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
    return false;
  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
  // not currently canonical, but soon will be. Code without them uses icmp, and
  // so is not tail predicated as per the condition above. In order to get the
  // same performance we treat min and max the same as an icmp for tailpred
  // purposes for the moment (we often rely on non-tailpred and higher VF's to
  // pick more optimial instructions like VQDMULH. They need to be recognized
  // directly by the vectorizer).
  if (auto *II = dyn_cast<IntrinsicInst>(&I))
    if ((II->getIntrinsicID() == Intrinsic::smin ||
         II->getIntrinsicID() == Intrinsic::smax ||
         II->getIntrinsicID() == Intrinsic::umin ||
         II->getIntrinsicID() == Intrinsic::umax) &&
        ++ICmpCount > 1)
      return false;

  // Floating-point compares are never allowed.
  if (isa<FCmpInst>(&I))
    return false;

  // We could allow extending/narrowing FP loads/stores, but codegen is
  // too inefficient so reject this for now.
  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
    return false;

  // Extends have to be extending-loads
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
      return false;

  // Truncs have to be narrowing-stores
  if (isa<TruncInst>(&I) )
    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
      return false;

  return true;
}
2048 | |||||||
// To set up a tail-predicated loop, we need to know the total number of
// elements processed by that loop. Thus, we need to determine the element
// size and:
// 1) it should be uniform for all operations in the vector loop, so we
//    e.g. don't want any widening/narrowing operations.
// 2) it should be smaller than i64s because we don't have vector operations
//    that work on i64s.
// 3) we don't want elements to be reversed or shuffled, to make sure the
//    tail-predication masks/predicates the right lanes.
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                 const DataLayout &DL,
                                 const LoopAccessInfo *LAI) {
  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");

  // If there are live-out values, it is probably a reduction. We can predicate
  // most reduction operations freely under MVE using a combination of
  // prefer-predicated-reduction-select and inloop reductions. We limit this to
  // floating point and integer reductions, but don't check for operators
  // specifically here. If the value ends up not being a reduction (and so the
  // vectorizer cannot tailfold the loop), we should fall back to standard
  // vectorization automatically.
  SmallVector< Instruction *, 8 > LiveOuts;
  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
  bool ReductionsDisabled =
      EnableTailPredication == TailPredication::EnabledNoReductions ||
      EnableTailPredication == TailPredication::ForceEnabledNoReductions;

  for (auto *I : LiveOuts) {
    if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
        !I->getType()->isHalfTy()) {
      LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
                           "live-out value\n");
      return false;
    }
    if (ReductionsDisabled) {
      LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
      return false;
    }
  }

  // Next, check that all instructions can be tail-predicated.
  PredicatedScalarEvolution PSE = LAI->getPSE();
  SmallVector<Instruction *, 16> LoadStores;
  int ICmpCount = 0;

  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      if (isa<PHINode>(&I))
        continue;
      if (!canTailPredicateInstruction(I, ICmpCount)) {
        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
        return false;
      }

      Type *T = I.getType();
      if (T->isPointerTy())
        T = T->getPointerElementType();

      // Element types wider than 32 bits have no MVE predicated ops.
      if (T->getScalarSizeInBits() > 32) {
        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
        return false;
      }
      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
        if (NextStride == 1) {
          // TODO: for now only allow consecutive strides of 1. We could support
          // other strides as long as it is uniform, but let's keep it simple
          // for now.
          continue;
        } else if (NextStride == -1 ||
                   (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
                   (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          // Reversed accesses and interleaved (vld2/vld4) accesses cannot be
          // predicated lane-by-lane.
          LLVM_DEBUG(dbgs()
                     << "Consecutive strides of 2 found, vld2/vstr2 can't "
                        "be tail-predicated\n.");
          return false;
          // TODO: don't tail predicate if there is a reversed load?
        } else if (EnableMaskedGatherScatters) {
          // Gather/scatters do allow loading from arbitrary strides, at
          // least if they are loop invariant.
          // TODO: Loop variant strides should in theory work, too, but
          // this requires further testing.
          const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
          if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
            const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
            if (PSE.getSE()->isLoopInvariant(Step, L))
              continue;
          }
        }
        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
                             "tail-predicate\n.");
        return false;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
  return true;
}
2151 | |||||||
/// Return true when the vectorizer should fold the loop tail with masking
/// (tail-predication) instead of emitting a scalar epilogue. This chains the
/// MVE availability, hardware-loop profitability and per-instruction
/// tail-predication checks.
bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                             ScalarEvolution &SE,
                                             AssumptionCache &AC,
                                             TargetLibraryInfo *TLI,
                                             DominatorTree *DT,
                                             const LoopAccessInfo *LAI) {
  if (!EnableTailPredication) {
    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
    return false;
  }

  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/stores instructions:
  if (!ST->hasMVEIntegerOps())
    return false;

  // For now, restrict this to single block loops.
  if (L->getNumBlocks() > 1) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
                         "loop.\n");
    return false;
  }

  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");

  HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(*LI)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "analyzable.\n");
    return false;
  }

  // This checks if we have the low-overhead branch architecture
  // extension, and if we will create a hardware-loop:
  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "profitable.\n");
    return false;
  }

  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "a candidate.\n");
    return false;
  }

  // Finally check every instruction of the (single-block) loop body.
  return canTailPredicateLoop(L, LI, SE, DL, LAI);
}
2201 | |||||||
2202 | bool ARMTTIImpl::emitGetActiveLaneMask() const { | ||||||
2203 | if (!ST->hasMVEIntegerOps() || !EnableTailPredication) | ||||||
2204 | return false; | ||||||
2205 | |||||||
2206 | // Intrinsic @llvm.get.active.lane.mask is supported. | ||||||
2207 | // It is used in the MVETailPredication pass, which requires the number of | ||||||
2208 | // elements processed by this vector loop to setup the tail-predicated | ||||||
2209 | // loop. | ||||||
2210 | return true; | ||||||
2211 | } | ||||||
/// ARM-specific runtime-unrolling preferences, tuned for M-class cores;
/// other subtargets defer to the generic BasicTTIImplBase heuristics.
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  // Enable Upper bound unrolling universally, not dependant upon the conditions
  // below.
  UP.UpperBound = true;

  // Only currently enable these preferences for M-Class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Don't unroll vectorised loop. MVE does not benefit from it as much as
      // scalar code.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          // Calls that disappear during lowering (e.g. some intrinsics)
          // don't block unrolling.
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value*, 4> Operands(I.operand_values());
      Cost +=
        getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  // On v6m cores, there are very few registers available. We can easily end up
  // spilling and reloading more registers in an unrolled loop. Look at the
  // number of LCSSA phis as a rough measure of how many registers will need to
  // be live out of the loop, reducing the default unroll count if more than 1
  // value is needed. In the long run, all of this should be being learnt by a
  // machine.
  unsigned UnrollCount = 4;
  if (ST->isThumb1Only()) {
    unsigned ExitingValues = 0;
    SmallVector<BasicBlock *, 4> ExitBlocks;
    L->getExitBlocks(ExitBlocks);
    for (auto *Exit : ExitBlocks) {
      // Count the number of LCSSA phis. Exclude values coming from GEP's as
      // only the last is expected to be needed for address operands.
      unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
        return PH.getNumOperands() != 1 ||
               !isa<GetElementPtrInst>(PH.getOperand(0));
      });
      // Track the maximum live-out count over all exit blocks.
      ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
    }
    if (ExitingValues)
      UnrollCount /= ExitingValues;
    if (UnrollCount <= 1)
      return;
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = UnrollCount;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}
2314 | |||||||
/// No ARM-specific peeling heuristics; defer entirely to the base
/// implementation.
void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
2319 | |||||||
2320 | bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, | ||||||
2321 | TTI::ReductionFlags Flags) const { | ||||||
2322 | if (!ST->hasMVEIntegerOps()) | ||||||
2323 | return false; | ||||||
2324 | |||||||
2325 | unsigned ScalarBits = Ty->getScalarSizeInBits(); | ||||||
2326 | switch (Opcode) { | ||||||
2327 | case Instruction::Add: | ||||||
2328 | return ScalarBits <= 64; | ||||||
2329 | default: | ||||||
2330 | return false; | ||||||
2331 | } | ||||||
2332 | } | ||||||
2333 | |||||||
2334 | bool ARMTTIImpl::preferPredicatedReductionSelect( | ||||||
2335 | unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { | ||||||
2336 | if (!ST->hasMVEIntegerOps()) | ||||||
2337 | return false; | ||||||
2338 | return true; | ||||||
2339 | } |
1 | //===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | // This file contains the ARM addressing mode implementation stuff. | |||
10 | // | |||
11 | //===----------------------------------------------------------------------===// | |||
12 | ||||
13 | #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H | |||
14 | #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H | |||
15 | ||||
16 | #include "llvm/ADT/APFloat.h" | |||
17 | #include "llvm/ADT/APInt.h" | |||
18 | #include "llvm/ADT/bit.h" | |||
19 | #include "llvm/Support/ErrorHandling.h" | |||
20 | #include "llvm/Support/MathExtras.h" | |||
21 | #include <cassert> | |||
22 | ||||
23 | namespace llvm { | |||
24 | ||||
25 | /// ARM_AM - ARM Addressing Mode Stuff | |||
26 | namespace ARM_AM { | |||
  /// Shift opcodes used by the shifter-operand encodings below. The
  /// numeric values are relied upon by getSORegOpc/getSORegShOp, which pack
  /// the opcode into the low 3 bits of an operand.
  enum ShiftOpc {
    no_shift = 0, ///< No shift applied.
    asr,          ///< Arithmetic shift right.
    lsl,          ///< Logical shift left.
    lsr,          ///< Logical shift right.
    ror,          ///< Rotate right.
    rrx,          ///< Rotate right with extend (through carry, by one bit).
    uxtw          ///< Unsigned extend word (used by some addressing forms).
  };
36 | ||||
37 | enum AddrOpc { | |||
38 | sub = 0, | |||
39 | add | |||
40 | }; | |||
41 | ||||
42 | inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; } | |||
43 | ||||
44 | inline const char *getShiftOpcStr(ShiftOpc Op) { | |||
45 | switch (Op) { | |||
46 | default: llvm_unreachable("Unknown shift opc!")::llvm::llvm_unreachable_internal("Unknown shift opc!", "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 46); | |||
47 | case ARM_AM::asr: return "asr"; | |||
48 | case ARM_AM::lsl: return "lsl"; | |||
49 | case ARM_AM::lsr: return "lsr"; | |||
50 | case ARM_AM::ror: return "ror"; | |||
51 | case ARM_AM::rrx: return "rrx"; | |||
52 | case ARM_AM::uxtw: return "uxtw"; | |||
53 | } | |||
54 | } | |||
55 | ||||
56 | inline unsigned getShiftOpcEncoding(ShiftOpc Op) { | |||
57 | switch (Op) { | |||
58 | default: llvm_unreachable("Unknown shift opc!")::llvm::llvm_unreachable_internal("Unknown shift opc!", "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 58); | |||
59 | case ARM_AM::asr: return 2; | |||
60 | case ARM_AM::lsl: return 0; | |||
61 | case ARM_AM::lsr: return 1; | |||
62 | case ARM_AM::ror: return 3; | |||
63 | } | |||
64 | } | |||
65 | ||||
  /// Sub-modes for load/store-multiple style addressing
  /// (increment/decrement, after/before).
  enum AMSubMode {
    bad_am_submode = 0, ///< Invalid/unset sentinel.
    ia,                 ///< Increment after.
    ib,                 ///< Increment before.
    da,                 ///< Decrement after.
    db                  ///< Decrement before.
  };
73 | ||||
74 | inline const char *getAMSubModeStr(AMSubMode Mode) { | |||
75 | switch (Mode) { | |||
76 | default: llvm_unreachable("Unknown addressing sub-mode!")::llvm::llvm_unreachable_internal("Unknown addressing sub-mode!" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 76); | |||
77 | case ARM_AM::ia: return "ia"; | |||
78 | case ARM_AM::ib: return "ib"; | |||
79 | case ARM_AM::da: return "da"; | |||
80 | case ARM_AM::db: return "db"; | |||
81 | } | |||
82 | } | |||
83 | ||||
84 | /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. | |||
85 | /// | |||
86 | inline unsigned rotr32(unsigned Val, unsigned Amt) { | |||
87 | assert(Amt < 32 && "Invalid rotate amount")(static_cast <bool> (Amt < 32 && "Invalid rotate amount" ) ? void (0) : __assert_fail ("Amt < 32 && \"Invalid rotate amount\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 87, __extension__ __PRETTY_FUNCTION__)); | |||
88 | return (Val >> Amt) | (Val << ((32-Amt)&31)); | |||
89 | } | |||
90 | ||||
/// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
///
inline unsigned rotl32(unsigned Val, unsigned Amt) {
  // Expanded assert(): Amt must be in [0,31] for the shifts to be defined.
  assert(Amt < 32 && "Invalid rotate amount")(static_cast <bool> (Amt < 32 && "Invalid rotate amount" ) ? void (0) : __assert_fail ("Amt < 32 && \"Invalid rotate amount\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 94, __extension__ __PRETTY_FUNCTION__));
  // The "& 31" keeps the right-shift count defined when Amt == 0.
  return (Val << Amt) | (Val >> ((32-Amt)&31));
}
97 | ||||
98 | //===--------------------------------------------------------------------===// | |||
99 | // Addressing Mode #1: shift_operand with registers | |||
100 | //===--------------------------------------------------------------------===// | |||
101 | // | |||
102 | // This 'addressing mode' is used for arithmetic instructions. It can | |||
103 | // represent things like: | |||
104 | // reg | |||
105 | // reg [asr|lsl|lsr|ror|rrx] reg | |||
106 | // reg [asr|lsl|lsr|ror|rrx] imm | |||
107 | // | |||
108 | // This is stored three operands [rega, regb, opc]. The first is the base | |||
109 | // reg, the second is the shift amount (or reg0 if not present or imm). The | |||
110 | // third operand encodes the shift opcode and the imm if a reg isn't present. | |||
111 | // | |||
112 | inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { | |||
113 | return ShOp | (Imm << 3); | |||
114 | } | |||
/// Extract the immediate / shift amount (everything above the low three
/// bits) from an addrmode1 opc operand.
inline unsigned getSORegOffset(unsigned Op) {
  return Op >> 3;
}
116 | inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); } | |||
117 | ||||
/// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
/// the 8-bit payload stored in the low byte.
inline unsigned getSOImmValImm(unsigned Imm) {
  return Imm & 0xFFu;
}
/// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
/// the rotate amount (twice the 4-bit rotate field stored at bit 8).
inline unsigned getSOImmValRot(unsigned Imm) {
  return 2 * (Imm >> 8);
}
124 | ||||
125 | /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, | |||
126 | /// computing the rotate amount to use. If this immediate value cannot be | |||
127 | /// handled with a single shifter-op, determine a good rotate amount that will | |||
128 | /// take a maximal chunk of bits out of the immediate. | |||
129 | inline unsigned getSOImmValRotate(unsigned Imm) { | |||
130 | // 8-bit (or less) immediates are trivially shifter_operands with a rotate | |||
131 | // of zero. | |||
132 | if ((Imm & ~255U) == 0) return 0; | |||
133 | ||||
134 | // Use CTZ to compute the rotate amount. | |||
135 | unsigned TZ = countTrailingZeros(Imm); | |||
136 | ||||
137 | // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, | |||
138 | // not 9. | |||
139 | unsigned RotAmt = TZ & ~1; | |||
140 | ||||
141 | // If we can handle this spread, return it. | |||
142 | if ((rotr32(Imm, RotAmt) & ~255U) == 0) | |||
143 | return (32-RotAmt)&31; // HW rotates right, not left. | |||
144 | ||||
145 | // For values like 0xF000000F, we should ignore the low 6 bits, then | |||
146 | // retry the hunt. | |||
147 | if (Imm & 63U) { | |||
148 | unsigned TZ2 = countTrailingZeros(Imm & ~63U); | |||
149 | unsigned RotAmt2 = TZ2 & ~1; | |||
150 | if ((rotr32(Imm, RotAmt2) & ~255U) == 0) | |||
151 | return (32-RotAmt2)&31; // HW rotates right, not left. | |||
152 | } | |||
153 | ||||
154 | // Otherwise, we have no way to cover this span of bits with a single | |||
155 | // shifter_op immediate. Return a chunk of bits that will be useful to | |||
156 | // handle. | |||
157 | return (32-RotAmt)&31; // HW rotates right, not left. | |||
158 | } | |||
159 | ||||
160 | /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit | |||
161 | /// into an shifter_operand immediate operand, return the 12-bit encoding for | |||
162 | /// it. If not, return -1. | |||
163 | inline int getSOImmVal(unsigned Arg) { | |||
164 | // 8-bit (or less) immediates are trivially shifter_operands with a rotate | |||
165 | // of zero. | |||
166 | if ((Arg & ~255U) == 0) return Arg; | |||
167 | ||||
168 | unsigned RotAmt = getSOImmValRotate(Arg); | |||
169 | ||||
170 | // If this cannot be handled with a single shifter_op, bail out. | |||
171 | if (rotr32(~255U, RotAmt) & Arg) | |||
172 | return -1; | |||
173 | ||||
174 | // Encode this correctly. | |||
175 | return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); | |||
176 | } | |||
177 | ||||
178 | /// isSOImmTwoPartVal - Return true if the specified value can be obtained by | |||
179 | /// or'ing together two SOImmVal's. | |||
180 | inline bool isSOImmTwoPartVal(unsigned V) { | |||
181 | // If this can be handled with a single shifter_op, bail out. | |||
182 | V = rotr32(~255U, getSOImmValRotate(V)) & V; | |||
183 | if (V == 0) | |||
184 | return false; | |||
185 | ||||
186 | // If this can be handled with two shifter_op's, accept. | |||
187 | V = rotr32(~255U, getSOImmValRotate(V)) & V; | |||
188 | return V == 0; | |||
189 | } | |||
190 | ||||
191 | /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, | |||
192 | /// return the first chunk of it. | |||
193 | inline unsigned getSOImmTwoPartFirst(unsigned V) { | |||
194 | return rotr32(255U, getSOImmValRotate(V)) & V; | |||
195 | } | |||
196 | ||||
/// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
/// return the second chunk of it.
inline unsigned getSOImmTwoPartSecond(unsigned V) {
  // Mask out the first hunk.
  V = rotr32(~255U, getSOImmValRotate(V)) & V;

  // Take what's left.
  // Expanded assert(): after removing the first chunk, the remainder must
  // itself fit in a single rotated 8-bit window (guaranteed when the caller
  // pre-qualified V with isSOImmTwoPartVal).
  assert(V == (rotr32(255U, getSOImmValRotate(V)) & V))(static_cast <bool> (V == (rotr32(255U, getSOImmValRotate (V)) & V)) ? void (0) : __assert_fail ("V == (rotr32(255U, getSOImmValRotate(V)) & V)" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 204, __extension__ __PRETTY_FUNCTION__));
  return V;
}
207 | ||||
208 | /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained | |||
209 | /// by two SOImmVal, that -V = First + Second. | |||
210 | /// "R+V" can be optimized to (sub (sub R, First), Second). | |||
211 | /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second). | |||
212 | inline bool isSOImmTwoPartValNeg(unsigned V) { | |||
213 | unsigned First; | |||
214 | if (!isSOImmTwoPartVal(-V)) | |||
215 | return false; | |||
216 | // Return false if ~(-First) is not a SoImmval. | |||
217 | First = getSOImmTwoPartFirst(-V); | |||
218 | First = ~(-First); | |||
219 | return !(rotr32(~255U, getSOImmValRotate(First)) & First); | |||
220 | } | |||
221 | ||||
222 | /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed | |||
223 | /// by a left shift. Returns the shift amount to use. | |||
224 | inline unsigned getThumbImmValShift(unsigned Imm) { | |||
225 | // 8-bit (or less) immediates are trivially immediate operand with a shift | |||
226 | // of zero. | |||
227 | if ((Imm & ~255U) == 0) return 0; | |||
228 | ||||
229 | // Use CTZ to compute the shift amount. | |||
230 | return countTrailingZeros(Imm); | |||
231 | } | |||
232 | ||||
233 | /// isThumbImmShiftedVal - Return true if the specified value can be obtained | |||
234 | /// by left shifting a 8-bit immediate. | |||
235 | inline bool isThumbImmShiftedVal(unsigned V) { | |||
236 | // If this can be handled with | |||
237 | V = (~255U << getThumbImmValShift(V)) & V; | |||
| ||||
238 | return V == 0; | |||
239 | } | |||
240 | ||||
241 | /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed | |||
242 | /// by a left shift. Returns the shift amount to use. | |||
243 | inline unsigned getThumbImm16ValShift(unsigned Imm) { | |||
244 | // 16-bit (or less) immediates are trivially immediate operand with a shift | |||
245 | // of zero. | |||
246 | if ((Imm & ~65535U) == 0) return 0; | |||
247 | ||||
248 | // Use CTZ to compute the shift amount. | |||
249 | return countTrailingZeros(Imm); | |||
250 | } | |||
251 | ||||
252 | /// isThumbImm16ShiftedVal - Return true if the specified value can be | |||
253 | /// obtained by left shifting a 16-bit immediate. | |||
254 | inline bool isThumbImm16ShiftedVal(unsigned V) { | |||
255 | // If this can be handled with | |||
256 | V = (~65535U << getThumbImm16ValShift(V)) & V; | |||
257 | return V == 0; | |||
258 | } | |||
259 | ||||
260 | /// getThumbImmNonShiftedVal - If V is a value that satisfies | |||
261 | /// isThumbImmShiftedVal, return the non-shiftd value. | |||
262 | inline unsigned getThumbImmNonShiftedVal(unsigned V) { | |||
263 | return V >> getThumbImmValShift(V); | |||
264 | } | |||
265 | ||||
266 | ||||
267 | /// getT2SOImmValSplat - Return the 12-bit encoded representation | |||
268 | /// if the specified value can be obtained by splatting the low 8 bits | |||
269 | /// into every other byte or every byte of a 32-bit value. i.e., | |||
270 | /// 00000000 00000000 00000000 abcdefgh control = 0 | |||
271 | /// 00000000 abcdefgh 00000000 abcdefgh control = 1 | |||
272 | /// abcdefgh 00000000 abcdefgh 00000000 control = 2 | |||
273 | /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 | |||
274 | /// Return -1 if none of the above apply. | |||
275 | /// See ARM Reference Manual A6.3.2. | |||
276 | inline int getT2SOImmValSplatVal(unsigned V) { | |||
277 | unsigned u, Vs, Imm; | |||
278 | // control = 0 | |||
279 | if ((V & 0xffffff00) == 0) | |||
280 | return V; | |||
281 | ||||
282 | // If the value is zeroes in the first byte, just shift those off | |||
283 | Vs = ((V & 0xff) == 0) ? V >> 8 : V; | |||
284 | // Any passing value only has 8 bits of payload, splatted across the word | |||
285 | Imm = Vs & 0xff; | |||
286 | // Likewise, any passing values have the payload splatted into the 3rd byte | |||
287 | u = Imm | (Imm << 16); | |||
288 | ||||
289 | // control = 1 or 2 | |||
290 | if (Vs == u) | |||
291 | return (((Vs == V) ? 1 : 2) << 8) | Imm; | |||
292 | ||||
293 | // control = 3 | |||
294 | if (Vs == (u | (u << 8))) | |||
295 | return (3 << 8) | Imm; | |||
296 | ||||
297 | return -1; | |||
298 | } | |||
299 | ||||
300 | /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the | |||
301 | /// specified value is a rotated 8-bit value. Return -1 if no rotation | |||
302 | /// encoding is possible. | |||
303 | /// See ARM Reference Manual A6.3.2. | |||
304 | inline int getT2SOImmValRotateVal(unsigned V) { | |||
305 | unsigned RotAmt = countLeadingZeros(V); | |||
306 | if (RotAmt >= 24) | |||
307 | return -1; | |||
308 | ||||
309 | // If 'Arg' can be handled with a single shifter_op return the value. | |||
310 | if ((rotr32(0xff000000U, RotAmt) & V) == V) | |||
311 | return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7); | |||
312 | ||||
313 | return -1; | |||
314 | } | |||
315 | ||||
316 | /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit | |||
317 | /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit | |||
318 | /// encoding for it. If not, return -1. | |||
319 | /// See ARM Reference Manual A6.3.2. | |||
320 | inline int getT2SOImmVal(unsigned Arg) { | |||
321 | // If 'Arg' is an 8-bit splat, then get the encoded value. | |||
322 | int Splat = getT2SOImmValSplatVal(Arg); | |||
323 | if (Splat != -1) | |||
324 | return Splat; | |||
325 | ||||
326 | // If 'Arg' can be handled with a single shifter_op return the value. | |||
327 | int Rot = getT2SOImmValRotateVal(Arg); | |||
328 | if (Rot != -1) | |||
329 | return Rot; | |||
330 | ||||
331 | return -1; | |||
332 | } | |||
333 | ||||
334 | inline unsigned getT2SOImmValRotate(unsigned V) { | |||
335 | if ((V & ~255U) == 0) return 0; | |||
336 | // Use CTZ to compute the rotate amount. | |||
337 | unsigned RotAmt = countTrailingZeros(V); | |||
338 | return (32 - RotAmt) & 31; | |||
339 | } | |||
340 | ||||
341 | inline bool isT2SOImmTwoPartVal(unsigned Imm) { | |||
342 | unsigned V = Imm; | |||
343 | // Passing values can be any combination of splat values and shifter | |||
344 | // values. If this can be handled with a single shifter or splat, bail | |||
345 | // out. Those should be handled directly, not with a two-part val. | |||
346 | if (getT2SOImmValSplatVal(V) != -1) | |||
347 | return false; | |||
348 | V = rotr32 (~255U, getT2SOImmValRotate(V)) & V; | |||
349 | if (V == 0) | |||
350 | return false; | |||
351 | ||||
352 | // If this can be handled as an immediate, accept. | |||
353 | if (getT2SOImmVal(V) != -1) return true; | |||
354 | ||||
355 | // Likewise, try masking out a splat value first. | |||
356 | V = Imm; | |||
357 | if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1) | |||
358 | V &= ~0xff00ff00U; | |||
359 | else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1) | |||
360 | V &= ~0x00ff00ffU; | |||
361 | // If what's left can be handled as an immediate, accept. | |||
362 | if (getT2SOImmVal(V) != -1) return true; | |||
363 | ||||
364 | // Otherwise, do not accept. | |||
365 | return false; | |||
366 | } | |||
367 | ||||
368 | inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) { | |||
369 | assert (isT2SOImmTwoPartVal(Imm) &&(static_cast <bool> (isT2SOImmTwoPartVal(Imm) && "Immedate cannot be encoded as two part immediate!") ? void ( 0) : __assert_fail ("isT2SOImmTwoPartVal(Imm) && \"Immedate cannot be encoded as two part immediate!\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 370, __extension__ __PRETTY_FUNCTION__)) | |||
370 | "Immedate cannot be encoded as two part immediate!")(static_cast <bool> (isT2SOImmTwoPartVal(Imm) && "Immedate cannot be encoded as two part immediate!") ? void ( 0) : __assert_fail ("isT2SOImmTwoPartVal(Imm) && \"Immedate cannot be encoded as two part immediate!\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 370, __extension__ __PRETTY_FUNCTION__)); | |||
371 | // Try a shifter operand as one part | |||
372 | unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm; | |||
373 | // If the rest is encodable as an immediate, then return it. | |||
374 | if (getT2SOImmVal(V) != -1) return V; | |||
375 | ||||
376 | // Try masking out a splat value first. | |||
377 | if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1) | |||
378 | return Imm & 0xff00ff00U; | |||
379 | ||||
380 | // The other splat is all that's left as an option. | |||
381 | assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1)(static_cast <bool> (getT2SOImmValSplatVal(Imm & 0x00ff00ffU ) != -1) ? void (0) : __assert_fail ("getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 381, __extension__ __PRETTY_FUNCTION__)); | |||
382 | return Imm & 0x00ff00ffU; | |||
383 | } | |||
384 | ||||
/// Return the second chunk of a value that satisfies isT2SOImmTwoPartVal
/// (everything the first chunk did not cover).
inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
  // Mask out the first hunk
  Imm ^= getT2SOImmTwoPartFirst(Imm);
  // Return what's left
  // Expanded assert(): the remainder must itself be a valid Thumb-2
  // modified immediate (guaranteed when isT2SOImmTwoPartVal held).
  assert (getT2SOImmVal(Imm) != -1 &&(static_cast <bool> (getT2SOImmVal(Imm) != -1 && "Unable to encode second part of T2 two part SO immediate") ? void (0) : __assert_fail ("getT2SOImmVal(Imm) != -1 && \"Unable to encode second part of T2 two part SO immediate\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 390, __extension__ __PRETTY_FUNCTION__))
          "Unable to encode second part of T2 two part SO immediate")(static_cast <bool> (getT2SOImmVal(Imm) != -1 && "Unable to encode second part of T2 two part SO immediate") ? void (0) : __assert_fail ("getT2SOImmVal(Imm) != -1 && \"Unable to encode second part of T2 two part SO immediate\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 390, __extension__ __PRETTY_FUNCTION__));
  return Imm;
}
393 | ||||
394 | ||||
395 | //===--------------------------------------------------------------------===// | |||
396 | // Addressing Mode #2 | |||
397 | //===--------------------------------------------------------------------===// | |||
398 | // | |||
399 | // This is used for most simple load/store instructions. | |||
400 | // | |||
401 | // addrmode2 := reg +/- reg shop imm | |||
402 | // addrmode2 := reg +/- imm12 | |||
403 | // | |||
404 | // The first operand is always a Reg. The second operand is a reg if in | |||
405 | // reg/reg form, otherwise it's reg#0. The third field encodes the operation | |||
406 | // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The | |||
407 | // fourth operand 16-17 encodes the index mode. | |||
408 | // | |||
409 | // If this addressing mode is a frame index (before prolog/epilog insertion | |||
410 | // and code rewriting), this operand will have the form: FI#, reg0, <offs> | |||
411 | // with no shift amount for the frame offset. | |||
412 | // | |||
/// Encode the addrmode2 opc operand: bits 0-11 immediate, bit 12 add/sub,
/// bits 13-15 shift opcode, bits 16-17 index mode.
inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
                          unsigned IdxMode = 0) {
  // Expanded assert(): the immediate field is only twelve bits wide.
  assert(Imm12 < (1 << 12) && "Imm too large!")(static_cast <bool> (Imm12 < (1 << 12) && "Imm too large!") ? void (0) : __assert_fail ("Imm12 < (1 << 12) && \"Imm too large!\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 415, __extension__ __PRETTY_FUNCTION__));
  bool isSub = Opc == sub;
  return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
}
/// Extract the 12-bit immediate offset from an addrmode2 opc operand.
inline unsigned getAM2Offset(unsigned AM2Opc) {
  return AM2Opc & 0xFFFu;
}
422 | inline AddrOpc getAM2Op(unsigned AM2Opc) { | |||
423 | return ((AM2Opc >> 12) & 1) ? sub : add; | |||
424 | } | |||
425 | inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { | |||
426 | return (ShiftOpc)((AM2Opc >> 13) & 7); | |||
427 | } | |||
/// Extract the index mode (bits 16 and up) from an addrmode2 opc operand.
inline unsigned getAM2IdxMode(unsigned AM2Opc) {
  return AM2Opc >> 16;
}
429 | ||||
430 | //===--------------------------------------------------------------------===// | |||
431 | // Addressing Mode #3 | |||
432 | //===--------------------------------------------------------------------===// | |||
433 | // | |||
434 | // This is used for sign-extending loads, and load/store-pair instructions. | |||
435 | // | |||
436 | // addrmode3 := reg +/- reg | |||
437 | // addrmode3 := reg +/- imm8 | |||
438 | // | |||
439 | // The first operand is always a Reg. The second operand is a reg if in | |||
440 | // reg/reg form, otherwise it's reg#0. The third field encodes the operation | |||
441 | // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the | |||
442 | // index mode. | |||
443 | ||||
444 | /// getAM3Opc - This function encodes the addrmode3 opc field. | |||
445 | inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, | |||
446 | unsigned IdxMode = 0) { | |||
447 | bool isSub = Opc == sub; | |||
448 | return ((int)isSub << 8) | Offset | (IdxMode << 9); | |||
449 | } | |||
/// Extract the 8-bit immediate offset from an addrmode3 opc operand.
inline unsigned char getAM3Offset(unsigned AM3Opc) {
  return static_cast<unsigned char>(AM3Opc & 0xFF);
}
451 | inline AddrOpc getAM3Op(unsigned AM3Opc) { | |||
452 | return ((AM3Opc >> 8) & 1) ? sub : add; | |||
453 | } | |||
/// Extract the index mode (bits 9 and up) from an addrmode3 opc operand.
inline unsigned getAM3IdxMode(unsigned AM3Opc) {
  return AM3Opc >> 9;
}
455 | ||||
456 | //===--------------------------------------------------------------------===// | |||
457 | // Addressing Mode #4 | |||
458 | //===--------------------------------------------------------------------===// | |||
459 | // | |||
460 | // This is used for load / store multiple instructions. | |||
461 | // | |||
462 | // addrmode4 := reg, <mode> | |||
463 | // | |||
464 | // The four modes are: | |||
465 | // IA - Increment after | |||
466 | // IB - Increment before | |||
467 | // DA - Decrement after | |||
468 | // DB - Decrement before | |||
469 | // For VFP instructions, only the IA and DB modes are valid. | |||
470 | ||||
471 | inline AMSubMode getAM4SubMode(unsigned Mode) { | |||
472 | return (AMSubMode)(Mode & 0x7); | |||
473 | } | |||
474 | ||||
475 | inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; } | |||
476 | ||||
477 | //===--------------------------------------------------------------------===// | |||
478 | // Addressing Mode #5 | |||
479 | //===--------------------------------------------------------------------===// | |||
480 | // | |||
481 | // This is used for coprocessor instructions, such as FP load/stores. | |||
482 | // | |||
483 | // addrmode5 := reg +/- imm8*4 | |||
484 | // | |||
485 | // The first operand is always a Reg. The second operand encodes the | |||
486 | // operation (add or subtract) in bit 8 and the immediate in bits 0-7. | |||
487 | ||||
488 | /// getAM5Opc - This function encodes the addrmode5 opc field. | |||
489 | inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { | |||
490 | bool isSub = Opc == sub; | |||
491 | return ((int)isSub << 8) | Offset; | |||
492 | } | |||
/// Extract the 8-bit immediate offset from an addrmode5 opc operand.
inline unsigned char getAM5Offset(unsigned AM5Opc) {
  return static_cast<unsigned char>(AM5Opc & 0xFF);
}
494 | inline AddrOpc getAM5Op(unsigned AM5Opc) { | |||
495 | return ((AM5Opc >> 8) & 1) ? sub : add; | |||
496 | } | |||
497 | ||||
498 | //===--------------------------------------------------------------------===// | |||
499 | // Addressing Mode #5 FP16 | |||
500 | //===--------------------------------------------------------------------===// | |||
501 | // | |||
502 | // This is used for coprocessor instructions, such as 16-bit FP load/stores. | |||
503 | // | |||
504 | // addrmode5fp16 := reg +/- imm8*2 | |||
505 | // | |||
506 | // The first operand is always a Reg. The second operand encodes the | |||
507 | // operation (add or subtract) in bit 8 and the immediate in bits 0-7. | |||
508 | ||||
509 | /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field. | |||
510 | inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) { | |||
511 | bool isSub = Opc == sub; | |||
512 | return ((int)isSub << 8) | Offset; | |||
513 | } | |||
/// Extract the 8-bit immediate offset from an addrmode5fp16 opc operand.
inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
  return static_cast<unsigned char>(AM5Opc & 0xFF);
}
517 | inline AddrOpc getAM5FP16Op(unsigned AM5Opc) { | |||
518 | return ((AM5Opc >> 8) & 1) ? sub : add; | |||
519 | } | |||
520 | ||||
521 | //===--------------------------------------------------------------------===// | |||
522 | // Addressing Mode #6 | |||
523 | //===--------------------------------------------------------------------===// | |||
524 | // | |||
525 | // This is used for NEON load / store instructions. | |||
526 | // | |||
527 | // addrmode6 := reg with optional alignment | |||
528 | // | |||
529 | // This is stored in two operands [regaddr, align]. The first is the | |||
530 | // address register. The second operand is the value of the alignment | |||
531 | // specifier in bytes or zero if no explicit alignment. | |||
532 | // Valid alignments depend on the specific instruction. | |||
533 | ||||
534 | //===--------------------------------------------------------------------===// | |||
535 | // NEON/MVE Modified Immediates | |||
536 | //===--------------------------------------------------------------------===// | |||
537 | // | |||
538 | // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate" | |||
539 | // vector operand, where a small immediate encoded in the instruction | |||
540 | // specifies a full NEON vector value. These modified immediates are | |||
541 | // represented here as encoded integers. The low 8 bits hold the immediate | |||
542 | // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold | |||
543 | // the "Cmode" field of the instruction. The interfaces below treat the | |||
544 | // Op and Cmode values as a single 5-bit value. | |||
545 | ||||
/// Pack a NEON/MVE modified immediate: the 5-bit Op+Cmode field occupies
/// bits 8-12 and the immediate byte occupies bits 0-7.
inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
  return Val | (OpCmode << 8);
}
/// Extract the 5-bit Op+Cmode field (bits 8-12) of a modified immediate.
inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
  return (ModImm & 0x1f00) >> 8;
}
/// Extract the immediate byte (bits 0-7) of a modified immediate.
inline unsigned getVMOVModImmVal(unsigned ModImm) {
  return ModImm & 0xffu;
}
553 | ||||
/// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
/// element value and the element size in bits. (If the element size is
/// smaller than the vector, it is splatted into all the elements.)
inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
  unsigned OpCmode = getVMOVModImmOpCmode(ModImm); // 5-bit Op+Cmode field.
  unsigned Imm8 = getVMOVModImmVal(ModImm);        // 8-bit payload.
  uint64_t Val = 0;

  if (OpCmode == 0xe) {
    // 8-bit vector elements
    Val = Imm8;
    EltBits = 8;
  } else if ((OpCmode & 0xc) == 0x8) {
    // 16-bit vector elements
    // Cmode bits <2:1> select which byte of the halfword holds the payload.
    unsigned ByteNum = (OpCmode & 0x6) >> 1;
    Val = Imm8 << (8 * ByteNum);
    EltBits = 16;
  } else if ((OpCmode & 0x8) == 0) {
    // 32-bit vector elements, zero with one byte set
    unsigned ByteNum = (OpCmode & 0x6) >> 1;
    Val = Imm8 << (8 * ByteNum);
    EltBits = 32;
  } else if ((OpCmode & 0xe) == 0xc) {
    // 32-bit vector elements, one byte with low bits set
    // ByteNum is 1 or 2; every byte below the payload is filled with ones.
    unsigned ByteNum = 1 + (OpCmode & 0x1);
    Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
    EltBits = 32;
  } else if (OpCmode == 0x1e) {
    // 64-bit vector elements
    // Each of the eight low bits of ModImm expands to a 0x00/0xff byte.
    for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if ((ModImm >> ByteNum) & 1)
        Val |= (uint64_t)0xff << (8 * ByteNum);
    }
    EltBits = 64;
  } else {
    // Expanded llvm_unreachable(): any other OpCmode is not a valid
    // modified-immediate encoding.
    llvm_unreachable("Unsupported VMOV immediate")::llvm::llvm_unreachable_internal("Unsupported VMOV immediate" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 589);
  }
  return Val;
}
593 | ||||
// Generic validation for single-byte immediate (0X00, 00X0, etc).
// Returns true iff exactly one of the low Size bytes of Value is nonzero.
inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
  // Expanded assert(): only 1-4 byte element widths are meaningful here.
  assert(Size >= 1 && Size <= 4 && "Invalid size")(static_cast <bool> (Size >= 1 && Size <= 4 && "Invalid size") ? void (0) : __assert_fail ("Size >= 1 && Size <= 4 && \"Invalid size\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 596, __extension__ __PRETTY_FUNCTION__));
  // Count the nonzero bytes among the low Size bytes.
  unsigned count = 0;
  for (unsigned i = 0; i < Size; ++i) {
    if (Value & 0xff) count++;
    Value >>= 8;
  }
  return count == 1;
}
604 | ||||
605 | /// Checks if Value is a correct immediate for instructions like VBIC/VORR. | |||
606 | inline bool isNEONi16splat(unsigned Value) { | |||
607 | if (Value > 0xffff) | |||
608 | return false; | |||
609 | // i16 value with set bits only in one byte X0 or 0X. | |||
610 | return Value == 0 || isNEONBytesplat(Value, 2); | |||
611 | } | |||
612 | ||||
// Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
inline unsigned encodeNEONi16splat(unsigned Value) {
  // Expanded assert(): callers must pre-qualify Value with isNEONi16splat.
  assert(isNEONi16splat(Value) && "Invalid NEON splat value")(static_cast <bool> (isNEONi16splat(Value) && "Invalid NEON splat value" ) ? void (0) : __assert_fail ("isNEONi16splat(Value) && \"Invalid NEON splat value\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 615, __extension__ __PRETTY_FUNCTION__));
  // Move the payload byte into bits 0-7 and record which halfword byte it
  // occupied in the cmode bits (0xa00 = high byte, 0x800 = low byte).
  if (Value >= 0x100)
    Value = (Value >> 8) | 0xa00;
  else
    Value |= 0x800;
  return Value;
}
622 | ||||
623 | /// Checks if Value is a correct immediate for instructions like VBIC/VORR. | |||
624 | inline bool isNEONi32splat(unsigned Value) { | |||
625 | // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X. | |||
626 | return Value == 0 || isNEONBytesplat(Value, 4); | |||
627 | } | |||
628 | ||||
/// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
inline unsigned encodeNEONi32splat(unsigned Value) {
  // Expanded assert(): callers must pre-qualify Value with isNEONi32splat.
  assert(isNEONi32splat(Value) && "Invalid NEON splat value")(static_cast <bool> (isNEONi32splat(Value) && "Invalid NEON splat value" ) ? void (0) : __assert_fail ("isNEONi32splat(Value) && \"Invalid NEON splat value\"" , "/build/llvm-toolchain-snapshot-14~++20211015062411+81e9c90686f7/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 631, __extension__ __PRETTY_FUNCTION__));
  // Move the single payload byte into bits 0-7 and record its original
  // byte position in the cmode bits (0x200/0x400/0x600 for bytes 1/2/3;
  // byte 0 needs no marker).
  if (Value >= 0x100 && Value <= 0xff00)
    Value = (Value >> 8) | 0x200;
  else if (Value > 0xffff && Value <= 0xff0000)
    Value = (Value >> 16) | 0x400;
  else if (Value > 0xffffff)
    Value = (Value >> 24) | 0x600;
  return Value;
}
640 | ||||
641 | //===--------------------------------------------------------------------===// | |||
642 | // Floating-point Immediates | |||
643 | // | |||
644 | inline float getFPImmFloat(unsigned Imm) { | |||
645 | // We expect an 8-bit binary encoding of a floating-point number here. | |||
646 | ||||
647 | uint8_t Sign = (Imm >> 7) & 0x1; | |||
648 | uint8_t Exp = (Imm >> 4) & 0x7; | |||
649 | uint8_t Mantissa = Imm & 0xf; | |||
650 | ||||
651 | // 8-bit FP IEEE Float Encoding | |||
652 | // abcd efgh aBbbbbbc defgh000 00000000 00000000 | |||
653 | // | |||
654 | // where B = NOT(b); | |||
655 | uint32_t I = 0; | |||
656 | I |= Sign << 31; | |||
657 | I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; | |||
658 | I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; | |||
659 | I |= (Exp & 0x3) << 23; | |||
660 | I |= Mantissa << 19; | |||
661 | return bit_cast<float>(I); | |||
662 | } | |||
663 | ||||
664 | /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit | |||
665 | /// floating-point value. If the value cannot be represented as an 8-bit | |||
666 | /// floating-point value, then return -1. | |||
667 | inline int getFP16Imm(const APInt &Imm) { | |||
668 | uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; | |||
669 | int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 | |||
670 | int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits | |||
671 | ||||
672 | // We can handle 4 bits of mantissa. | |||
673 | // mantissa = (16+UInt(e:f:g:h))/16. | |||
674 | if (Mantissa & 0x3f) | |||
675 | return -1; | |||
676 | Mantissa >>= 6; | |||
677 | ||||
678 | // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 | |||
679 | if (Exp < -3 || Exp > 4) | |||
680 | return -1; | |||
681 | Exp = ((Exp+3) & 0x7) ^ 4; | |||
682 | ||||
683 | return ((int)Sign << 7) | (Exp << 4) | Mantissa; | |||
684 | } | |||
685 | ||||
686 | inline int getFP16Imm(const APFloat &FPImm) { | |||
687 | return getFP16Imm(FPImm.bitcastToAPInt()); | |||
688 | } | |||
689 | ||||
690 | /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding | |||
691 | /// for it. Otherwise return -1 like getFP16Imm. | |||
692 | inline int getFP32FP16Imm(const APInt &Imm) { | |||
693 | if (Imm.getActiveBits() > 16) | |||
694 | return -1; | |||
695 | return ARM_AM::getFP16Imm(Imm.trunc(16)); | |||
696 | } | |||
697 | ||||
698 | inline int getFP32FP16Imm(const APFloat &FPImm) { | |||
699 | return getFP32FP16Imm(FPImm.bitcastToAPInt()); | |||
700 | } | |||
701 | ||||
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
inline int getFP32Imm(const APInt &Imm) {
  // Decompose the IEEE single layout: bit 31 = sign, bits 30:23 = biased
  // exponent (bias 127), bits 22:0 = mantissa.
  uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
  int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
  int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits

  // We can handle 4 bits of mantissa.
  // mantissa = (16+UInt(e:f:g:h))/16.
  // Any bit below the top 4 set means the value cannot be encoded exactly.
  if (Mantissa & 0x7ffff)
    return -1;
  Mantissa >>= 19;
  if ((Mantissa & 0xf) != Mantissa)
    return -1;

  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
  if (Exp < -3 || Exp > 4)
    return -1;
  // Re-bias into [0,7] and flip the top bit (the NOT(b) of the encoding).
  Exp = ((Exp+3) & 0x7) ^ 4;

  // Pack sign:exponent:mantissa into the 8-bit modified-immediate form.
  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
}
725 | ||||
/// Convenience overload: extract the raw bits of \p FPImm and defer to the
/// APInt-based getFP32Imm.
inline int getFP32Imm(const APFloat &FPImm) {
  return getFP32Imm(FPImm.bitcastToAPInt());
}
729 | ||||
/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
inline int getFP64Imm(const APInt &Imm) {
  // Decompose the IEEE double layout: bit 63 = sign, bits 62:52 = biased
  // exponent (bias 1023), bits 51:0 = mantissa.
  uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
  int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023;  // -1022 to 1023
  uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;

  // We can handle 4 bits of mantissa.
  // mantissa = (16+UInt(e:f:g:h))/16.
  // Any bit below the top 4 set means the value cannot be encoded exactly.
  if (Mantissa & 0xffffffffffffULL)
    return -1;
  Mantissa >>= 48;
  if ((Mantissa & 0xf) != Mantissa)
    return -1;

  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
  if (Exp < -3 || Exp > 4)
    return -1;
  // Re-bias into [0,7] and flip the top bit (the NOT(b) of the encoding).
  Exp = ((Exp+3) & 0x7) ^ 4;

  // Pack sign:exponent:mantissa into the 8-bit modified-immediate form.
  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
}
753 | ||||
/// Convenience overload: extract the raw bits of \p FPImm and defer to the
/// APInt-based getFP64Imm.
inline int getFP64Imm(const APFloat &FPImm) {
  return getFP64Imm(FPImm.bitcastToAPInt());
}
757 | ||||
758 | } // end namespace ARM_AM | |||
759 | } // end namespace llvm | |||
760 | ||||
761 | #endif | |||
762 |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <cassert> | ||||||
18 | #include <climits> | ||||||
19 | #include <cmath> | ||||||
20 | #include <cstdint> | ||||||
21 | #include <cstring> | ||||||
22 | #include <limits> | ||||||
23 | #include <type_traits> | ||||||
24 | |||||||
25 | #ifdef __ANDROID_NDK__ | ||||||
26 | #include <android/api-level.h> | ||||||
27 | #endif | ||||||
28 | |||||||
29 | #ifdef _MSC_VER | ||||||
30 | // Declare these intrinsics manually rather including intrin.h. It's very | ||||||
31 | // expensive, and MathExtras.h is popular. | ||||||
32 | // #include <intrin.h> | ||||||
33 | extern "C" { | ||||||
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
38 | } | ||||||
39 | #endif | ||||||
40 | |||||||
41 | namespace llvm { | ||||||
42 | |||||||
/// The behavior an operation has on an input of 0.
/// Passed to the count{Leading,Trailing}{Zeros,Ones} / find{First,Last}Set
/// family to select what a zero (or all-ones) input returns.
enum ZeroBehavior {
  /// The returned value is undefined.
  ZB_Undefined,
  /// The returned value is numeric_limits<T>::max()
  ZB_Max,
  /// The returned value is numeric_limits<T>::digits
  ZB_Width
};
52 | |||||||
/// Mathematical constants.
namespace numbers {
// TODO: Track C++20 std::numbers.
// TODO: Favor using the hexadecimal FP constants (requires C++17).
constexpr double e          = 2.7182818284590452354,   // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
                 egamma     = .57721566490153286061,   // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
                 ln2        = .69314718055994530942,   // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
                 ln10       = 2.3025850929940456840,   // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
                 log2e      = 1.4426950408889634074,   // (0x1.71547652b82feP+0)
                 log10e     = .43429448190325182765,   // (0x1.bcb7b1526e50eP-2)
                 pi         = 3.1415926535897932385,   // (0x1.921fb54442d18P+1) https://oeis.org/A000796
                 inv_pi     = .31830988618379067154,   // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
                 sqrtpi     = 1.7724538509055160273,   // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
                 inv_sqrtpi = .56418958354775628695,   // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
                 sqrt2      = 1.4142135623730950488,   // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
                 inv_sqrt2  = .70710678118654752440,   // (0x1.6a09e667f3bcdP-1)
                 sqrt3      = 1.7320508075688772935,   // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
                 inv_sqrt3  = .57735026918962576451,   // (0x1.279a74590331cP-1)
                 phi        = 1.6180339887498948482;   // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
constexpr float ef          = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
                egammaf     = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
                ln2f        = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
                ln10f       = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
                log2ef      = 1.44269504F, // (0x1.715476P+0)
                log10ef     = .434294482F, // (0x1.bcb7b2P-2)
                pif         = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
                inv_pif     = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
                sqrtpif     = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
                inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
                sqrt2f      = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
                inv_sqrt2f  = .707106781F, // (0x1.6a09e6P-1)
                sqrt3f      = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
                inv_sqrt3f  = .577350269F, // (0x1.279a74P-1)
                phif        = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
} // namespace numbers
88 | |||||||
89 | namespace detail { | ||||||
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
91 | static unsigned count(T Val, ZeroBehavior) { | ||||||
92 | if (!Val) | ||||||
93 | return std::numeric_limits<T>::digits; | ||||||
94 | if (Val & 0x1) | ||||||
95 | return 0; | ||||||
96 | |||||||
97 | // Bisection method. | ||||||
98 | unsigned ZeroBits = 0; | ||||||
99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
101 | while (Shift) { | ||||||
102 | if ((Val & Mask) == 0) { | ||||||
103 | Val >>= Shift; | ||||||
104 | ZeroBits |= Shift; | ||||||
105 | } | ||||||
106 | Shift >>= 1; | ||||||
107 | Mask >>= Shift; | ||||||
108 | } | ||||||
109 | return ZeroBits; | ||||||
110 | } | ||||||
111 | }; | ||||||
112 | |||||||
113 | #if defined(__GNUC__4) || defined(_MSC_VER) | ||||||
114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
116 | if (ZB
| ||||||
117 | return 32; | ||||||
118 | |||||||
119 | #if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4) | ||||||
120 | return __builtin_ctz(Val); | ||||||
121 | #elif defined(_MSC_VER) | ||||||
122 | unsigned long Index; | ||||||
123 | _BitScanForward(&Index, Val); | ||||||
124 | return Index; | ||||||
125 | #endif | ||||||
126 | } | ||||||
127 | }; | ||||||
128 | |||||||
129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
132 | if (ZB != ZB_Undefined && Val == 0) | ||||||
133 | return 64; | ||||||
134 | |||||||
135 | #if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4) | ||||||
136 | return __builtin_ctzll(Val); | ||||||
137 | #elif defined(_MSC_VER) | ||||||
138 | unsigned long Index; | ||||||
139 | _BitScanForward64(&Index, Val); | ||||||
140 | return Index; | ||||||
141 | #endif | ||||||
142 | } | ||||||
143 | }; | ||||||
144 | #endif | ||||||
145 | #endif | ||||||
146 | } // namespace detail | ||||||
147 | |||||||
/// Count number of 0's from the least significant bit to the most
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
/// valid arguments.
template <typename T>
unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 32/64-bit types pick the intrinsic-backed
  // detail::TrailingZerosCounter specializations when available.
  return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
}
162 | |||||||
163 | namespace detail { | ||||||
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
165 | static unsigned count(T Val, ZeroBehavior) { | ||||||
166 | if (!Val) | ||||||
167 | return std::numeric_limits<T>::digits; | ||||||
168 | |||||||
169 | // Bisection method. | ||||||
170 | unsigned ZeroBits = 0; | ||||||
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
172 | T Tmp = Val >> Shift; | ||||||
173 | if (Tmp) | ||||||
174 | Val = Tmp; | ||||||
175 | else | ||||||
176 | ZeroBits |= Shift; | ||||||
177 | } | ||||||
178 | return ZeroBits; | ||||||
179 | } | ||||||
180 | }; | ||||||
181 | |||||||
182 | #if defined(__GNUC__4) || defined(_MSC_VER) | ||||||
183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
185 | if (ZB != ZB_Undefined && Val == 0) | ||||||
186 | return 32; | ||||||
187 | |||||||
188 | #if __has_builtin(__builtin_clz)1 || defined(__GNUC__4) | ||||||
189 | return __builtin_clz(Val); | ||||||
190 | #elif defined(_MSC_VER) | ||||||
191 | unsigned long Index; | ||||||
192 | _BitScanReverse(&Index, Val); | ||||||
193 | return Index ^ 31; | ||||||
194 | #endif | ||||||
195 | } | ||||||
196 | }; | ||||||
197 | |||||||
198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
201 | if (ZB != ZB_Undefined && Val == 0) | ||||||
202 | return 64; | ||||||
203 | |||||||
204 | #if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4) | ||||||
205 | return __builtin_clzll(Val); | ||||||
206 | #elif defined(_MSC_VER) | ||||||
207 | unsigned long Index; | ||||||
208 | _BitScanReverse64(&Index, Val); | ||||||
209 | return Index ^ 63; | ||||||
210 | #endif | ||||||
211 | } | ||||||
212 | }; | ||||||
213 | #endif | ||||||
214 | #endif | ||||||
215 | } // namespace detail | ||||||
216 | |||||||
/// Count number of 0's from the most significant bit to the least
/// stopping at the first 1.
///
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
/// valid arguments.
template <typename T>
unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T) so 32/64-bit types pick the intrinsic-backed
  // detail::LeadingZerosCounter specializations when available.
  return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
}
231 | |||||||
232 | /// Get the index of the first set bit starting from the least | ||||||
233 | /// significant bit. | ||||||
234 | /// | ||||||
235 | /// Only unsigned integral types are allowed. | ||||||
236 | /// | ||||||
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
238 | /// valid arguments. | ||||||
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
240 | if (ZB == ZB_Max && Val == 0) | ||||||
241 | return std::numeric_limits<T>::max(); | ||||||
242 | |||||||
243 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
244 | } | ||||||
245 | |||||||
/// Create a bitmask with the N right-most bits set to 1, and all other
/// bits set to 0.  Only unsigned types are allowed.
template <typename T> T maskTrailingOnes(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");
  // Shifting a T by its full width is undefined; handle N == 0 explicitly.
  return N == 0 ? 0 : (T(-1) >> (Bits - N));
}

/// Create a bitmask with the N left-most bits set to 1, and all other
/// bits set to 0.  Only unsigned types are allowed.
template <typename T> T maskLeadingOnes(unsigned N) {
  return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

/// Create a bitmask with the N right-most bits set to 0, and all other
/// bits set to 1.  Only unsigned types are allowed.
template <typename T> T maskTrailingZeros(unsigned N) {
  return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

/// Create a bitmask with the N left-most bits set to 0, and all other
/// bits set to 1.  Only unsigned types are allowed.
template <typename T> T maskLeadingZeros(unsigned N) {
  return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}
272 | |||||||
273 | /// Get the index of the last set bit starting from the least | ||||||
274 | /// significant bit. | ||||||
275 | /// | ||||||
276 | /// Only unsigned integral types are allowed. | ||||||
277 | /// | ||||||
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
279 | /// valid arguments. | ||||||
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
281 | if (ZB == ZB_Max && Val == 0) | ||||||
282 | return std::numeric_limits<T>::max(); | ||||||
283 | |||||||
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
285 | // in the __builtin_clz intrinsic on x86. | ||||||
286 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
287 | (std::numeric_limits<T>::digits - 1); | ||||||
288 | } | ||||||
289 | |||||||
/// Macro compressed bit reversal table for 256 bits.
///
/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
static const unsigned char BitReverseTable256[256] = {
#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
  R6(0), R6(2), R6(1), R6(3)
#undef R2
#undef R4
#undef R6
};

/// Reverse the bits in \p Val.
template <typename T>
T reverseBits(T Val) {
  unsigned char Src[sizeof(Val)];
  unsigned char Dst[sizeof(Val)];
  std::memcpy(Src, &Val, sizeof(Val));
  // Reverse the bits inside each byte with the table while also reversing
  // the byte order, which reverses the whole word.
  for (std::size_t I = 0, E = sizeof(Val); I != E; ++I)
    Dst[E - I - 1] = BitReverseTable256[Src[I]];
  std::memcpy(&Val, Dst, sizeof(Val));
  return Val;
}
314 | |||||||
// Prefer the single-instruction bit-reverse builtins where the compiler
// provides them; each specialization shadows the table-based generic above.
#if __has_builtin(__builtin_bitreverse8)
template<>
inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
  return __builtin_bitreverse8(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse16)
template<>
inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
  return __builtin_bitreverse16(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse32)
template<>
inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
  return __builtin_bitreverse32(Val);
}
#endif

#if __has_builtin(__builtin_bitreverse64)
template<>
inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
  return __builtin_bitreverse64(Val);
}
#endif
342 | |||||||
// NOTE: The following support functions use the _32/_64 extensions instead of
// type overloading so that signed and unsigned integers can be used without
// ambiguity.

/// Return the high 32 bits of a 64 bit value.
constexpr inline uint32_t Hi_32(uint64_t Value) {
  return static_cast<uint32_t>(Value >> 32);
}

/// Return the low 32 bits of a 64 bit value.
constexpr inline uint32_t Lo_32(uint64_t Value) {
  return static_cast<uint32_t>(Value);
}

/// Make a 64-bit integer from a high / low pair of 32-bit integers.
constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
  // Widen before shifting so the high half is not lost.
  return (static_cast<uint64_t>(High) << 32) | static_cast<uint64_t>(Low);
}
361 | |||||||
/// Checks if an integer fits into the given bit width.
template <unsigned N> constexpr inline bool isInt(int64_t x) {
  // N >= 64 always fits; otherwise test against [-2^(N-1), 2^(N-1)).
  return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
}
// Template specializations to get better code for common cases.
template <> constexpr inline bool isInt<8>(int64_t x) {
  return static_cast<int8_t>(x) == x;
}
template <> constexpr inline bool isInt<16>(int64_t x) {
  return static_cast<int16_t>(x) == x;
}
template <> constexpr inline bool isInt<32>(int64_t x) {
  return static_cast<int32_t>(x) == x;
}

/// Checks if a signed integer is an N bit number shifted left by S.
template <unsigned N, unsigned S>
constexpr inline bool isShiftedInt(int64_t x) {
  static_assert(
      N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
  static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
  // Value must fit in N+S bits and have its low S bits clear.
  return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
}
385 | |||||||
/// Checks if an unsigned integer fits into the given bit width.
///
/// This is written as two functions rather than as simply
///
///   return N >= 64 || X < (UINT64_C(1) << N);
///
/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
/// left too many places.
template <unsigned N>
constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
  static_assert(N > 0, "isUInt<0> doesn't make sense");
  return X < (UINT64_C(1) << (N));
}
template <unsigned N>
constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
  return true;
}

// Template specializations to get better code for common cases.
template <> constexpr inline bool isUInt<8>(uint64_t x) {
  return static_cast<uint8_t>(x) == x;
}
template <> constexpr inline bool isUInt<16>(uint64_t x) {
  return static_cast<uint16_t>(x) == x;
}
template <> constexpr inline bool isUInt<32>(uint64_t x) {
  return static_cast<uint32_t>(x) == x;
}

/// Checks if a unsigned integer is an N bit number shifted left by S.
template <unsigned N, unsigned S>
constexpr inline bool isShiftedUInt(uint64_t x) {
  static_assert(
      N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
  static_assert(N + S <= 64,
                "isShiftedUInt<N, S> with N + S > 64 is too wide.");
  // Per the two static_asserts above, S must be strictly less than 64.  So
  // 1 << S is not undefined behavior.
  return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
}
426 | |||||||
/// Gets the maximum value for a N-bit unsigned integer.
inline uint64_t maxUIntN(uint64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // uint64_t(1) << 64 is undefined behavior, so we can't do
  //   (uint64_t(1) << N) - 1
  // without checking first that N != 64. But this works and doesn't have a
  // branch.
  return UINT64_MAX >> (64 - N);
}

/// Gets the minimum value for a N-bit signed integer.
inline int64_t minIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // Compute -(2^(N-1)) in unsigned arithmetic to avoid signed overflow.
  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
}

/// Gets the maximum value for a N-bit signed integer.
inline int64_t maxIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // This relies on two's complement wraparound when N == 64, so we convert to
  // int64_t only at the very end to avoid UB.
  return (UINT64_C(1) << (N - 1)) - 1;
}

/// Checks if an unsigned integer fits into the given (dynamic) bit width.
inline bool isUIntN(unsigned N, uint64_t x) {
  return N >= 64 || x <= maxUIntN(N);
}

/// Checks if an signed integer fits into the given (dynamic) bit width.
inline bool isIntN(unsigned N, int64_t x) {
  return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
}
463 | |||||||
/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (32 bit version).
/// Ex. isMask_32(0x0000FFFFU) == true.
constexpr inline bool isMask_32(uint32_t Value) {
  // Adding 1 to a low-mask carries out of the mask; any leftover bits mean
  // the ones were not contiguous from bit 0.
  return Value != 0 && ((Value + 1) & Value) == 0;
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (64 bit version).
constexpr inline bool isMask_64(uint64_t Value) {
  return Value != 0 && ((Value + 1) & Value) == 0;
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
constexpr inline bool isShiftedMask_32(uint32_t Value) {
  // Subtracting 1 fills in the trailing zeros; OR-ing back turns a shifted
  // mask into a plain low mask.
  return Value != 0 && isMask_32((Value - 1) | Value);
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (64 bit version.)
constexpr inline bool isShiftedMask_64(uint64_t Value) {
  return Value != 0 && isMask_64((Value - 1) | Value);
}

/// Return true if the argument is a power of two > 0.
/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
constexpr inline bool isPowerOf2_32(uint32_t Value) {
  // A power of two has exactly one bit set, so clearing the lowest set bit
  // must leave zero.
  return Value != 0 && (Value & (Value - 1)) == 0;
}

/// Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr inline bool isPowerOf2_64(uint64_t Value) {
  return Value != 0 && (Value & (Value - 1)) == 0;
}
499 | |||||||
/// Count the number of ones from the most significant bit to the first
/// zero bit.
///
/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Leading ones of Value are exactly the leading zeros of its complement.
  return countLeadingZeros<T>(~Value, ZB);
}
515 | |||||||
/// Count the number of ones from the least significant bit to the first
/// zero bit.
///
/// Ex. countTrailingOnes(0x00FF00FF) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Trailing ones of Value are exactly the trailing zeros of its complement.
  return countTrailingZeros<T>(~Value, ZB);
}
531 | |||||||
namespace detail {
/// Popcount for types up to 32 bits; the builtin is used under GCC-compatible
/// compilers, otherwise the classic SWAR bit-counting sequence.
template <typename T, std::size_t SizeOfT> struct PopulationCounter {
  static unsigned count(T Value) {
    // Generic version, forward to 32 bits.
    static_assert(SizeOfT <= 4, "Not implemented!");
#if defined(__GNUC__)
    return __builtin_popcount(Value);
#else
    uint32_t v = Value;
    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
#endif
  }
};

/// 64-bit popcount specialization.
template <typename T> struct PopulationCounter<T, 8> {
  static unsigned count(T Value) {
#if defined(__GNUC__)
    return __builtin_popcountll(Value);
#else
    uint64_t v = Value;
    v = v - ((v >> 1) & 0x5555555555555555ULL);
    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
#endif
  }
};
} // namespace detail
562 | |||||||
/// Count the number of set bits in a value.
/// Ex. countPopulation(0xF000F000) = 8
/// Returns 0 if the word is zero.
template <typename T>
inline unsigned countPopulation(T Value) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  // Dispatch on sizeof(T): 64-bit types use the dedicated specialization.
  return detail::PopulationCounter<T, sizeof(T)>::count(Value);
}
573 | |||||||
/// Compile time Log2.
/// Valid only for positive powers of two.
template <size_t kValue> constexpr inline size_t CTLog2() {
  // Reject 0 and non-powers-of-two at compile time.
  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
                "Value is not a valid power of 2");
  // log2(k) == 1 + log2(k/2); the <1> specialization below terminates.
  return 1 + CTLog2<kValue / 2>();
}

/// Base case: log2(1) == 0.
template <> constexpr inline size_t CTLog2<1>() { return 0; }
583 | |||||||
/// Return the log base 2 of the specified value.
inline double Log2(double Value) {
#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
  // Old Android NDKs lack log2(); synthesize it from the natural log.
  return __builtin_log(Value) / __builtin_log(2.0);
#else
  return log2(Value);
#endif
}
592 | |||||||
/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (32 bit edition.)
/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
inline unsigned Log2_32(uint32_t Value) {
  // For Value == 0, countLeadingZeros returns 32 and the subtraction wraps
  // to the documented -1 (as unsigned).
  return 31 - countLeadingZeros(Value);
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64(uint64_t Value) {
  return 63 - countLeadingZeros(Value);
}

/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
/// (32 bit edition).
/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
inline unsigned Log2_32_Ceil(uint32_t Value) {
  // Value - 1 rounds exact powers of two down one position, turning the
  // floor computation into a ceiling.
  return 32 - countLeadingZeros(Value - 1);
}

/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64_Ceil(uint64_t Value) {
  return 64 - countLeadingZeros(Value - 1);
}
618 | |||||||
/// Return the greatest common divisor of the values using Euclid's algorithm.
template <typename T>
inline T greatestCommonDivisor(T A, T B) {
  // Classic Euclid: replace (A, B) with (B, A mod B) until B reaches zero.
  while (B != 0) {
    T Remainder = A % B;
    A = B;
    B = Remainder;
  }
  return A;
}
629 | |||||||
630 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { | ||||||
631 | return greatestCommonDivisor<uint64_t>(A, B); | ||||||
632 | } | ||||||
633 | |||||||
/// This function takes a 64-bit integer and returns the bit equivalent double.
inline double BitsToDouble(uint64_t Bits) {
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  // memcpy is the portable, strict-aliasing-safe way to reinterpret bits.
  double Result;
  memcpy(&Result, &Bits, sizeof(Bits));
  return Result;
}
641 | |||||||
/// This function takes a 32-bit integer and returns the bit equivalent float.
inline float BitsToFloat(uint32_t Bits) {
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  // memcpy is the portable, strict-aliasing-safe way to reinterpret bits.
  float Result;
  memcpy(&Result, &Bits, sizeof(Bits));
  return Result;
}
649 | |||||||
/// This function takes a double and returns the bit equivalent 64-bit integer.
/// Note that copying doubles around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint64_t DoubleToBits(double Double) {
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  // memcpy is the portable, strict-aliasing-safe way to reinterpret bits.
  uint64_t Result;
  memcpy(&Result, &Double, sizeof(Double));
  return Result;
}
659 | |||||||
/// This function takes a float and returns the bit equivalent 32-bit integer.
/// Note that copying floats around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint32_t FloatToBits(float Float) {
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  // memcpy is the portable, strict-aliasing-safe way to reinterpret bits.
  uint32_t Result;
  memcpy(&Result, &Float, sizeof(Float));
  return Result;
}
669 | |||||||
/// A and B are either alignments or offsets. Return the minimum alignment that
/// may be assumed after adding the two together.
constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
  // The answer is the lowest set bit of A | B: the largest power of 2 that
  // divides both values. "~Bits + 1" spells -Bits without a unary minus on an
  // unsigned operand, which would trigger MSVC warning C4146.
  const uint64_t Bits = A | B;
  return Bits & (~Bits + 1);
}
680 | |||||||
/// Returns the next power of two (in 64-bits) that is strictly greater than A.
/// Returns zero on overflow.
inline uint64_t NextPowerOf2(uint64_t A) {
  // Smear the highest set bit into every lower position (shifts of 1, 2, 4,
  // 8, 16, 32), then add one to carry into the next power of two. If A's top
  // bit was set, the increment wraps to zero, which signals overflow.
  for (unsigned Shift = 1; Shift < 64; Shift <<= 1)
    A |= (A >> Shift);
  return A + 1;
}
692 | |||||||
693 | /// Returns the power of two which is less than or equal to the given value. | ||||||
694 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||||
695 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||||
696 | if (!A) return 0; | ||||||
697 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||||
698 | } | ||||||
699 | |||||||
700 | /// Returns the power of two which is greater than or equal to the given value. | ||||||
701 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||||
702 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||||
703 | if (!A) | ||||||
704 | return 0; | ||||||
705 | return NextPowerOf2(A - 1); | ||||||
706 | } | ||||||
707 | |||||||
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// If non-zero \p Skew is specified, the return value will be a minimal
/// integer that is greater than or equal to \p Value and equal to
/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
///   alignTo(5, 8) = 8
///   alignTo(17, 8) = 24
///   alignTo(~0LL, 8) = 0
///   alignTo(321, 255) = 510
///
///   alignTo(5, 8, 7) = 7
///   alignTo(17, 8, 1) = 17
///   alignTo(~0LL, 8, 3) = 3
///   alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  // Normalize the skew into [0, Align).
  Skew %= Align;
  // Round (Value - Skew) up to a multiple of Align, then re-apply the skew.
  const uint64_t AlignedChunks = (Value + Align - 1 - Skew) / Align;
  return AlignedChunks * Align + Skew;
}
733 | |||||||
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  // Bump past the boundary, then strip the remainder; identical to
  // (Value + Align - 1) / Align * Align.
  const uint64_t Bumped = Value + Align - 1;
  return Bumped - Bumped % Align;
}
740 | |||||||
/// Returns the integer ceil(Numerator / Denominator).
inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  // Round-up division written out directly; arithmetically identical to
  // alignTo(Numerator, Denominator) / Denominator.
  return (Numerator + Denominator - 1) / Denominator;
}
745 | |||||||
/// Returns the integer nearest(Numerator / Denominator).
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  // Adding half the denominator before truncating rounds to nearest
  // (halfway cases round up).
  const uint64_t Half = Denominator / 2;
  return (Numerator + Half) / Denominator;
}
750 | |||||||
/// Returns the largest uint64_t less than or equal to \p Value and is
/// \p Skew mod \p Align. \p Align must be non-zero
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  // Normalize the skew into [0, Align).
  Skew %= Align;
  // Strip the skew, truncate down to a multiple of Align, then restore it.
  const uint64_t AlignedChunks = (Value - Skew) / Align;
  return AlignedChunks * Align + Skew;
}
758 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  // Shift the B-bit field up to the sign position, then arithmetic-shift it
  // back so the field's sign bit fills the upper bits.
  constexpr unsigned Shift = 32 - B;
  return int32_t(X << Shift) >> Shift;
}
766 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  // Shift the B-bit field up to the sign position, then arithmetic-shift it
  // back so the field's sign bit fills the upper bits.
  const unsigned Shift = 32 - B;
  return int32_t(X << Shift) >> Shift;
}
774 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  // Shift the B-bit field up to the sign position, then arithmetic-shift it
  // back so the field's sign bit fills the upper bits.
  constexpr unsigned Shift = 64 - B;
  return int64_t(x << Shift) >> Shift;
}
782 | |||||||
/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  // Shift the B-bit field up to the sign position, then arithmetic-shift it
  // back so the field's sign bit fills the upper bits.
  const unsigned Shift = 64 - B;
  return int64_t(X << Shift) >> Shift;
}
790 | |||||||
/// Subtract two unsigned integers, X and Y, of type T and return the absolute
/// value of the result.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  // Always subtract the smaller from the larger so the unsigned result
  // cannot wrap.
  if (X < Y)
    return Y - X;
  return X - Y;
}
797 | |||||||
/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Discard;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Discard;
  // Hacker's Delight, p. 29: an unsigned sum overflowed iff it wrapped below
  // either operand.
  const T Sum = X + Y;
  Overflowed = (Sum < X || Sum < Y);
  return Overflowed ? std::numeric_limits<T>::max() : Sum;
}
814 | |||||||
/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Discard;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Discard;

  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
  // because it fails for uint16_t (where multiplication can have undefined
  // behavior due to promotion to int), and requires a division in addition
  // to the multiplication.

  Overflowed = false;

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  const int Log2Z = Log2_64(X) + Log2_64(Y);
  const T Max = std::numeric_limits<T>::max();
  const int Log2Max = Log2_64(Max);

  // Clearly in range: the product has fewer bits than Max.
  if (Log2Z < Log2Max)
    return X * Y;
  // Clearly out of range: the product needs strictly more bits than Max.
  if (Log2Z > Log2Max) {
    Overflowed = true;
    return Max;
  }

  // Borderline: we're going to use the top bit, and maybe overflow one bit
  // past it. Multiply all but the bottom bit of X, then add that bit's
  // contribution back at the end.
  T Partial = (X >> 1) * Y;
  if (Partial & ~(Max >> 1)) {
    // Doubling Partial would already overflow.
    Overflowed = true;
    return Max;
  }
  Partial <<= 1;
  if (X & 1)
    return SaturatingAdd(Partial, Y, ResultOverflowed);

  return Partial;
}
859 | |||||||
/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Discard;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Discard;

  // If the multiply already saturated there is nothing left to add.
  const T Product = SaturatingMultiply(X, Y, &Overflowed);
  if (Overflowed)
    return Product;
  return SaturatingAdd(A, Product, &Overflowed);
}
876 | |||||||
877 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. | ||||||
878 | extern const float huge_valf; | ||||||
879 | |||||||
880 | |||||||
/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
///
/// \param X,Y    signed addends.
/// \param Result receives the wrapped (mod 2^n) sum.
/// \returns true iff the mathematical sum does not fit in T.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, bool> AddOverflow(T X, T Y,
                                                             T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the addition in the unsigned domain, where wraparound is well
  // defined (signed overflow would be UB).
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  // Mixed signs can never overflow.
  return false;
#endif
}
906 | |||||||
/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
///
/// \param X,Y    signed operands (computes X - Y).
/// \param Result receives the wrapped (mod 2^n) difference.
/// \returns true iff the mathematical difference does not fit in T.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, bool> SubOverflow(T X, T Y,
                                                             T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the subtraction in the unsigned domain, where wraparound is well
  // defined (signed overflow would be UB).
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a negative results in a negative number.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Subtracting a negative number from a positive results in a positive number.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  // Same-sign operands can never overflow.
  return false;
#endif
}
932 | |||||||
/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
///
/// \param X,Y    signed factors.
/// \param Result receives the wrapped (mod 2^n) product.
/// \returns true iff the mathematical product does not fit in T.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, bool> MulOverflow(T X, T Y,
                                                             T &Result) {
  // Perform the unsigned multiplication on absolute values: wraparound is
  // well defined there, whereas signed overflow would be UB.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}
959 | |||||||
960 | } // End llvm namespace | ||||||
961 | |||||||
962 | #endif |