Bug Summary

File: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 2074, column 21
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
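
The diagnostic concerns a left shift whose count the analyzer inferred can reach ~0U (4294967295). Because the flagged line (2074) falls outside the excerpt below, here is a minimal, hypothetical C++ sketch of the pattern this checker reports and the kind of bounds check that avoids it; the function and variable names are illustrative only, not taken from the LLVM source.

// Illustrative sketch only -- not the code at line 2074. On this target 'int'
// is 32 bits wide, so shifting it by a count >= 32 (e.g. ~0U == 4294967295)
// is undefined behaviour; the analyzer warns when it can prove the count
// reaches such a value and no guard rules it out.
#include <cstdio>

static int shiftOrZero(int Value, unsigned Count) {
  if (Count >= 32)          // a count >= the width of 'int' would be undefined
    return 0;               // the warning points at shifts missing this guard
  return Value << Count;    // well-defined: Count is known to be < 32 here
}

int main() {
  unsigned Sentinel = ~0U;  // 4294967295, the count reported by the analyzer
  std::printf("%d\n", shiftOrZero(1, Sentinel));
  return 0;
}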

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/lib/Target/AArch64 -I include -I /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig 
-D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-16-232930-107970-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "MCTargetDesc/AArch64AddressingModes.h"
12#include "llvm/Analysis/IVDescriptors.h"
13#include "llvm/Analysis/LoopInfo.h"
14#include "llvm/Analysis/TargetTransformInfo.h"
15#include "llvm/CodeGen/BasicTTIImpl.h"
16#include "llvm/CodeGen/CostTable.h"
17#include "llvm/CodeGen/TargetLowering.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/IntrinsicsAArch64.h"
21#include "llvm/IR/PatternMatch.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <algorithm>
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "aarch64tti"
29
30static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31 cl::init(true), cl::Hidden);
32
33static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
34 cl::Hidden);
35
36static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
37 cl::init(10), cl::Hidden);
38
39bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
40 const Function *Callee) const {
41 const TargetMachine &TM = getTLI()->getTargetMachine();
42
43 const FeatureBitset &CallerBits =
44 TM.getSubtargetImpl(*Caller)->getFeatureBits();
45 const FeatureBitset &CalleeBits =
46 TM.getSubtargetImpl(*Callee)->getFeatureBits();
47
48 // Inline a callee if its target-features are a subset of the callers
49 // target-features.
50 return (CallerBits & CalleeBits) == CalleeBits;
51}
52
53/// Calculate the cost of materializing a 64-bit value. This helper
54/// method might only calculate a fraction of a larger immediate. Therefore it
55/// is valid to return a cost of ZERO.
56InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
57 // Check if the immediate can be encoded within an instruction.
58 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
59 return 0;
60
61 if (Val < 0)
62 Val = ~Val;
63
64 // Calculate how many moves we will need to materialize this constant.
65 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
66 AArch64_IMM::expandMOVImm(Val, 64, Insn);
67 return Insn.size();
68}
69
70/// Calculate the cost of materializing the given constant.
71InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
72 TTI::TargetCostKind CostKind) {
73 assert(Ty->isIntegerTy());
74
75 unsigned BitSize = Ty->getPrimitiveSizeInBits();
76 if (BitSize == 0)
77 return ~0U;
78
79 // Sign-extend all constants to a multiple of 64-bit.
80 APInt ImmVal = Imm;
81 if (BitSize & 0x3f)
82 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
83
84 // Split the constant into 64-bit chunks and calculate the cost for each
85 // chunk.
86 InstructionCost Cost = 0;
87 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
88 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
89 int64_t Val = Tmp.getSExtValue();
90 Cost += getIntImmCost(Val);
91 }
92 // We need at least one instruction to materialize the constant.
93 return std::max<InstructionCost>(1, Cost);
94}
95
96InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
97 const APInt &Imm, Type *Ty,
98 TTI::TargetCostKind CostKind,
99 Instruction *Inst) {
100 assert(Ty->isIntegerTy());
101
102 unsigned BitSize = Ty->getPrimitiveSizeInBits();
103 // There is no cost model for constants with a bit size of 0. Return TCC_Free
104 // here, so that constant hoisting will ignore this constant.
105 if (BitSize == 0)
106 return TTI::TCC_Free;
107
108 unsigned ImmIdx = ~0U;
109 switch (Opcode) {
110 default:
111 return TTI::TCC_Free;
112 case Instruction::GetElementPtr:
113 // Always hoist the base address of a GetElementPtr.
114 if (Idx == 0)
115 return 2 * TTI::TCC_Basic;
116 return TTI::TCC_Free;
117 case Instruction::Store:
118 ImmIdx = 0;
119 break;
120 case Instruction::Add:
121 case Instruction::Sub:
122 case Instruction::Mul:
123 case Instruction::UDiv:
124 case Instruction::SDiv:
125 case Instruction::URem:
126 case Instruction::SRem:
127 case Instruction::And:
128 case Instruction::Or:
129 case Instruction::Xor:
130 case Instruction::ICmp:
131 ImmIdx = 1;
132 break;
133 // Always return TCC_Free for the shift value of a shift instruction.
134 case Instruction::Shl:
135 case Instruction::LShr:
136 case Instruction::AShr:
137 if (Idx == 1)
138 return TTI::TCC_Free;
139 break;
140 case Instruction::Trunc:
141 case Instruction::ZExt:
142 case Instruction::SExt:
143 case Instruction::IntToPtr:
144 case Instruction::PtrToInt:
145 case Instruction::BitCast:
146 case Instruction::PHI:
147 case Instruction::Call:
148 case Instruction::Select:
149 case Instruction::Ret:
150 case Instruction::Load:
151 break;
152 }
153
154 if (Idx == ImmIdx) {
155 int NumConstants = (BitSize + 63) / 64;
156 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
157 return (Cost <= NumConstants * TTI::TCC_Basic)
158 ? static_cast<int>(TTI::TCC_Free)
159 : Cost;
160 }
161 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
162}
163
164InstructionCost
165AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
166 const APInt &Imm, Type *Ty,
167 TTI::TargetCostKind CostKind) {
168 assert(Ty->isIntegerTy());
169
170 unsigned BitSize = Ty->getPrimitiveSizeInBits();
171 // There is no cost model for constants with a bit size of 0. Return TCC_Free
172 // here, so that constant hoisting will ignore this constant.
173 if (BitSize == 0)
174 return TTI::TCC_Free;
175
176 // Most (all?) AArch64 intrinsics do not support folding immediates into the
177 // selected instruction, so we compute the materialization cost for the
178 // immediate directly.
179 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
180 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
181
182 switch (IID) {
183 default:
184 return TTI::TCC_Free;
185 case Intrinsic::sadd_with_overflow:
186 case Intrinsic::uadd_with_overflow:
187 case Intrinsic::ssub_with_overflow:
188 case Intrinsic::usub_with_overflow:
189 case Intrinsic::smul_with_overflow:
190 case Intrinsic::umul_with_overflow:
191 if (Idx == 1) {
192 int NumConstants = (BitSize + 63) / 64;
193 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
194 return (Cost <= NumConstants * TTI::TCC_Basic)
195 ? static_cast<int>(TTI::TCC_Free)
196 : Cost;
197 }
198 break;
199 case Intrinsic::experimental_stackmap:
200 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
201 return TTI::TCC_Free;
202 break;
203 case Intrinsic::experimental_patchpoint_void:
204 case Intrinsic::experimental_patchpoint_i64:
205 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
206 return TTI::TCC_Free;
207 break;
208 case Intrinsic::experimental_gc_statepoint:
209 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
210 return TTI::TCC_Free;
211 break;
212 }
213 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
214}
215
216TargetTransformInfo::PopcntSupportKind
217AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
218 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
219 if (TyWidth == 32 || TyWidth == 64)
220 return TTI::PSK_FastHardware;
221 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
222 return TTI::PSK_Software;
223}
224
225InstructionCost
226AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
227 TTI::TargetCostKind CostKind) {
228 auto *RetTy = ICA.getReturnType();
229 switch (ICA.getID()) {
230 case Intrinsic::umin:
231 case Intrinsic::umax:
232 case Intrinsic::smin:
233 case Intrinsic::smax: {
234 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
235 MVT::v8i16, MVT::v2i32, MVT::v4i32};
236 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
237 // v2i64 types get converted to cmp+bif hence the cost of 2
238 if (LT.second == MVT::v2i64)
239 return LT.first * 2;
240 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
241 return LT.first;
242 break;
243 }
244 case Intrinsic::sadd_sat:
245 case Intrinsic::ssub_sat:
246 case Intrinsic::uadd_sat:
247 case Intrinsic::usub_sat: {
248 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
249 MVT::v8i16, MVT::v2i32, MVT::v4i32,
250 MVT::v2i64};
251 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
252 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
253 // need to extend the type, as it uses shr(qadd(shl, shl)).
254 unsigned Instrs =
255 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
256 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
257 return LT.first * Instrs;
258 break;
259 }
260 case Intrinsic::abs: {
261 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
262 MVT::v8i16, MVT::v2i32, MVT::v4i32,
263 MVT::v2i64};
264 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
265 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
266 return LT.first;
267 break;
268 }
269 case Intrinsic::experimental_stepvector: {
270 InstructionCost Cost = 1; // Cost of the `index' instruction
271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
272 // Legalisation of illegal vectors involves an `index' instruction plus
273 // (LT.first - 1) vector adds.
274 if (LT.first > 1) {
275 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
276 InstructionCost AddCost =
277 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
278 Cost += AddCost * (LT.first - 1);
279 }
280 return Cost;
281 }
282 case Intrinsic::bitreverse: {
283 static const CostTblEntry BitreverseTbl[] = {
284 {Intrinsic::bitreverse, MVT::i32, 1},
285 {Intrinsic::bitreverse, MVT::i64, 1},
286 {Intrinsic::bitreverse, MVT::v8i8, 1},
287 {Intrinsic::bitreverse, MVT::v16i8, 1},
288 {Intrinsic::bitreverse, MVT::v4i16, 2},
289 {Intrinsic::bitreverse, MVT::v8i16, 2},
290 {Intrinsic::bitreverse, MVT::v2i32, 2},
291 {Intrinsic::bitreverse, MVT::v4i32, 2},
292 {Intrinsic::bitreverse, MVT::v1i64, 2},
293 {Intrinsic::bitreverse, MVT::v2i64, 2},
294 };
295 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
296 const auto *Entry =
297 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
298 if (Entry) {
299 // Cost Model is using the legal type(i32) that i8 and i16 will be
300 // converted to +1 so that we match the actual lowering cost
301 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
302 TLI->getValueType(DL, RetTy, true) == MVT::i16)
303 return LegalisationCost.first * Entry->Cost + 1;
304
305 return LegalisationCost.first * Entry->Cost;
306 }
307 break;
308 }
309 case Intrinsic::ctpop: {
310 static const CostTblEntry CtpopCostTbl[] = {
311 {ISD::CTPOP, MVT::v2i64, 4},
312 {ISD::CTPOP, MVT::v4i32, 3},
313 {ISD::CTPOP, MVT::v8i16, 2},
314 {ISD::CTPOP, MVT::v16i8, 1},
315 {ISD::CTPOP, MVT::i64, 4},
316 {ISD::CTPOP, MVT::v2i32, 3},
317 {ISD::CTPOP, MVT::v4i16, 2},
318 {ISD::CTPOP, MVT::v8i8, 1},
319 {ISD::CTPOP, MVT::i32, 5},
320 };
321 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
322 MVT MTy = LT.second;
323 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
324 // Extra cost of +1 when illegal vector types are legalized by promoting
325 // the integer type.
326 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
327 RetTy->getScalarSizeInBits()
328 ? 1
329 : 0;
330 return LT.first * Entry->Cost + ExtraCost;
331 }
332 break;
333 }
334 case Intrinsic::sadd_with_overflow:
335 case Intrinsic::uadd_with_overflow:
336 case Intrinsic::ssub_with_overflow:
337 case Intrinsic::usub_with_overflow:
338 case Intrinsic::smul_with_overflow:
339 case Intrinsic::umul_with_overflow: {
340 static const CostTblEntry WithOverflowCostTbl[] = {
341 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
342 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
343 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
344 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
345 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
346 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
347 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
348 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
349 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
350 {Intrinsic::usub_with_overflow, MVT::i8, 3},
351 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
352 {Intrinsic::usub_with_overflow, MVT::i16, 3},
353 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
354 {Intrinsic::usub_with_overflow, MVT::i32, 1},
355 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
356 {Intrinsic::usub_with_overflow, MVT::i64, 1},
357 {Intrinsic::smul_with_overflow, MVT::i8, 5},
358 {Intrinsic::umul_with_overflow, MVT::i8, 4},
359 {Intrinsic::smul_with_overflow, MVT::i16, 5},
360 {Intrinsic::umul_with_overflow, MVT::i16, 4},
361 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
362 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
363 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
364 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
365 };
366 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
367 if (MTy.isSimple())
368 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
369 MTy.getSimpleVT()))
370 return Entry->Cost;
371 break;
372 }
373 default:
374 break;
375 }
376 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
377}
378
379/// The function will remove redundant reinterprets casting in the presence
380/// of the control flow
381static Optional<Instruction *> processPhiNode(InstCombiner &IC,
382 IntrinsicInst &II) {
383 SmallVector<Instruction *, 32> Worklist;
384 auto RequiredType = II.getType();
385
386 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
387 assert(PN && "Expected Phi Node!");
388
389 // Don't create a new Phi unless we can remove the old one.
390 if (!PN->hasOneUse())
391 return None;
392
393 for (Value *IncValPhi : PN->incoming_values()) {
394 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
395 if (!Reinterpret ||
396 Reinterpret->getIntrinsicID() !=
397 Intrinsic::aarch64_sve_convert_to_svbool ||
398 RequiredType != Reinterpret->getArgOperand(0)->getType())
399 return None;
400 }
401
402 // Create the new Phi
403 LLVMContext &Ctx = PN->getContext();
404 IRBuilder<> Builder(Ctx);
405 Builder.SetInsertPoint(PN);
406 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
407 Worklist.push_back(PN);
408
409 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
410 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
411 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
412 Worklist.push_back(Reinterpret);
413 }
414
415 // Cleanup Phi Node and reinterprets
416 return IC.replaceInstUsesWith(II, NPN);
417}
418
419static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
420 IntrinsicInst &II) {
421 // If the reinterpret instruction operand is a PHI Node
422 if (isa<PHINode>(II.getArgOperand(0)))
423 return processPhiNode(IC, II);
424
425 SmallVector<Instruction *, 32> CandidatesForRemoval;
426 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
427
428 const auto *IVTy = cast<VectorType>(II.getType());
429
430 // Walk the chain of conversions.
431 while (Cursor) {
432 // If the type of the cursor has fewer lanes than the final result, zeroing
433 // must take place, which breaks the equivalence chain.
434 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
435 if (CursorVTy->getElementCount().getKnownMinValue() <
436 IVTy->getElementCount().getKnownMinValue())
437 break;
438
439 // If the cursor has the same type as I, it is a viable replacement.
440 if (Cursor->getType() == IVTy)
441 EarliestReplacement = Cursor;
442
443 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
444
445 // If this is not an SVE conversion intrinsic, this is the end of the chain.
446 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
447 Intrinsic::aarch64_sve_convert_to_svbool ||
448 IntrinsicCursor->getIntrinsicID() ==
449 Intrinsic::aarch64_sve_convert_from_svbool))
450 break;
451
452 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
453 Cursor = IntrinsicCursor->getOperand(0);
454 }
455
456 // If no viable replacement in the conversion chain was found, there is
457 // nothing to do.
458 if (!EarliestReplacement)
459 return None;
460
461 return IC.replaceInstUsesWith(II, EarliestReplacement);
462}
463
464static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
465 IntrinsicInst &II) {
466 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
467 if (!Pg)
468 return None;
469
470 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
471 return None;
472
473 const auto PTruePattern =
474 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
475 if (PTruePattern != AArch64SVEPredPattern::vl1)
476 return None;
477
478 // The intrinsic is inserting into lane zero so use an insert instead.
479 auto *IdxTy = Type::getInt64Ty(II.getContext());
480 auto *Insert = InsertElementInst::Create(
481 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
482 Insert->insertBefore(&II);
483 Insert->takeName(&II);
484
485 return IC.replaceInstUsesWith(II, Insert);
486}
487
488static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
489 IntrinsicInst &II) {
490 // Replace DupX with a regular IR splat.
491 IRBuilder<> Builder(II.getContext());
492 Builder.SetInsertPoint(&II);
493 auto *RetTy = cast<ScalableVectorType>(II.getType());
494 Value *Splat =
495 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
496 Splat->takeName(&II);
497 return IC.replaceInstUsesWith(II, Splat);
498}
499
500static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
501 IntrinsicInst &II) {
502 LLVMContext &Ctx = II.getContext();
503 IRBuilder<> Builder(Ctx);
504 Builder.SetInsertPoint(&II);
505
506 // Check that the predicate is all active
507 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
508 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
509 return None;
510
511 const auto PTruePattern =
512 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
513 if (PTruePattern != AArch64SVEPredPattern::all)
514 return None;
515
516 // Check that we have a compare of zero..
517 auto *SplatValue =
518 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
519 if (!SplatValue || !SplatValue->isZero())
520 return None;
521
522 // ..against a dupq
523 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
524 if (!DupQLane ||
525 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
526 return None;
527
528 // Where the dupq is a lane 0 replicate of a vector insert
529 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
530 return None;
531
532 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
533 if (!VecIns ||
534 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
535 return None;
536
537 // Where the vector insert is a fixed constant vector insert into undef at
538 // index zero
539 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
540 return None;
541
542 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
543 return None;
544
545 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
546 if (!ConstVec)
547 return None;
548
549 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
550 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
551 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
552 return None;
553
554 unsigned NumElts = VecTy->getNumElements();
555 unsigned PredicateBits = 0;
556
557 // Expand intrinsic operands to a 16-bit byte level predicate
558 for (unsigned I = 0; I < NumElts; ++I) {
559 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
560 if (!Arg)
561 return None;
562 if (!Arg->isZero())
563 PredicateBits |= 1 << (I * (16 / NumElts));
564 }
565
566 // If all bits are zero bail early with an empty predicate
567 if (PredicateBits == 0) {
568 auto *PFalse = Constant::getNullValue(II.getType());
569 PFalse->takeName(&II);
570 return IC.replaceInstUsesWith(II, PFalse);
571 }
572
573 // Calculate largest predicate type used (where byte predicate is largest)
574 unsigned Mask = 8;
575 for (unsigned I = 0; I < 16; ++I)
576 if ((PredicateBits & (1 << I)) != 0)
577 Mask |= (I % 8);
578
579 unsigned PredSize = Mask & -Mask;
580 auto *PredType = ScalableVectorType::get(
581 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
582
583 // Ensure all relevant bits are set
584 for (unsigned I = 0; I < 16; I += PredSize)
585 if ((PredicateBits & (1 << I)) == 0)
586 return None;
587
588 auto *PTruePat =
589 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
590 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
591 {PredType}, {PTruePat});
592 auto *ConvertToSVBool = Builder.CreateIntrinsic(
593 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
594 auto *ConvertFromSVBool =
595 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
596 {II.getType()}, {ConvertToSVBool});
597
598 ConvertFromSVBool->takeName(&II);
599 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
600}
601
602static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
603 IntrinsicInst &II) {
604 IRBuilder<> Builder(II.getContext());
605 Builder.SetInsertPoint(&II);
606 Value *Pg = II.getArgOperand(0);
607 Value *Vec = II.getArgOperand(1);
608 auto IntrinsicID = II.getIntrinsicID();
609 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
610
611 // lastX(splat(X)) --> X
612 if (auto *SplatVal = getSplatValue(Vec))
613 return IC.replaceInstUsesWith(II, SplatVal);
614
615 // If x and/or y is a splat value then:
616 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
617 Value *LHS, *RHS;
618 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
619 if (isSplatValue(LHS) || isSplatValue(RHS)) {
620 auto *OldBinOp = cast<BinaryOperator>(Vec);
621 auto OpC = OldBinOp->getOpcode();
622 auto *NewLHS =
623 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
624 auto *NewRHS =
625 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
626 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
627 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
628 return IC.replaceInstUsesWith(II, NewBinOp);
629 }
630 }
631
632 auto *C = dyn_cast<Constant>(Pg);
633 if (IsAfter && C && C->isNullValue()) {
634 // The intrinsic is extracting lane 0 so use an extract instead.
635 auto *IdxTy = Type::getInt64Ty(II.getContext());
636 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
637 Extract->insertBefore(&II);
638 Extract->takeName(&II);
639 return IC.replaceInstUsesWith(II, Extract);
640 }
641
642 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
643 if (!IntrPG)
644 return None;
645
646 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
647 return None;
648
649 const auto PTruePattern =
650 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
651
652 // Can the intrinsic's predicate be converted to a known constant index?
653 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
654 if (!MinNumElts)
655 return None;
656
657 unsigned Idx = MinNumElts - 1;
658 // Increment the index if extracting the element after the last active
659 // predicate element.
660 if (IsAfter)
661 ++Idx;
662
663 // Ignore extracts whose index is larger than the known minimum vector
664 // length. NOTE: This is an artificial constraint where we prefer to
665 // maintain what the user asked for until an alternative is proven faster.
666 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
667 if (Idx >= PgVTy->getMinNumElements())
668 return None;
669
670 // The intrinsic is extracting a fixed lane so use an extract instead.
671 auto *IdxTy = Type::getInt64Ty(II.getContext());
672 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
673 Extract->insertBefore(&II);
674 Extract->takeName(&II);
675 return IC.replaceInstUsesWith(II, Extract);
676}
677
678static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
679 IntrinsicInst &II) {
680 LLVMContext &Ctx = II.getContext();
681 IRBuilder<> Builder(Ctx);
682 Builder.SetInsertPoint(&II);
683 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
684 // can work with RDFFR_PP for ptest elimination.
685 auto *AllPat =
686 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
687 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
688 {II.getType()}, {AllPat});
689 auto *RDFFR =
690 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
691 RDFFR->takeName(&II);
692 return IC.replaceInstUsesWith(II, RDFFR);
693}
694
695static Optional<Instruction *>
696instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
697 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
698
699 if (Pattern == AArch64SVEPredPattern::all) {
700 LLVMContext &Ctx = II.getContext();
701 IRBuilder<> Builder(Ctx);
702 Builder.SetInsertPoint(&II);
703
704 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
705 auto *VScale = Builder.CreateVScale(StepVal);
706 VScale->takeName(&II);
707 return IC.replaceInstUsesWith(II, VScale);
708 }
709
710 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
711
712 return MinNumElts && NumElts >= MinNumElts
713 ? Optional<Instruction *>(IC.replaceInstUsesWith(
714 II, ConstantInt::get(II.getType(), MinNumElts)))
715 : None;
716}
717
718static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
719 IntrinsicInst &II) {
720 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
721 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
722
723 if (Op1 && Op2 &&
724 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
725 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
726 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
727
728 IRBuilder<> Builder(II.getContext());
729 Builder.SetInsertPoint(&II);
730
731 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
732 Type *Tys[] = {Op1->getArgOperand(0)->getType()};
733
734 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
735
736 PTest->takeName(&II);
737 return IC.replaceInstUsesWith(II, PTest);
738 }
739
740 return None;
741}
742
743static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
744 IntrinsicInst &II) {
745 // fold (fadd p a (fmul p b c)) -> (fma p a b c)
746 Value *P = II.getOperand(0);
747 Value *A = II.getOperand(1);
748 auto FMul = II.getOperand(2);
749 Value *B, *C;
750 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
751 m_Specific(P), m_Value(B), m_Value(C))))
752 return None;
753
754 if (!FMul->hasOneUse())
755 return None;
756
757 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
758 // Stop the combine when the flags on the inputs differ in case dropping flags
759 // would lead to us missing out on more beneficial optimizations.
760 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
761 return None;
762 if (!FAddFlags.allowContract())
763 return None;
764
765 IRBuilder<> Builder(II.getContext());
766 Builder.SetInsertPoint(&II);
767 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
768 {II.getType()}, {P, A, B, C}, &II);
769 FMLA->setFastMathFlags(FAddFlags);
770 return IC.replaceInstUsesWith(II, FMLA);
771}
772
773static bool isAllActivePredicate(Value *Pred) {
774 // Look through convert.from.svbool(convert.to.svbool(...) chain.
775 Value *UncastedPred;
776 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
777 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
778 m_Value(UncastedPred)))))
779 // If the predicate has the same or less lanes than the uncasted
780 // predicate then we know the casting has no effect.
781 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
782 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
783 Pred = UncastedPred;
784
785 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
786 m_ConstantInt<AArch64SVEPredPattern::all>()));
787}
788
789static Optional<Instruction *>
790instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
791 IRBuilder<> Builder(II.getContext());
792 Builder.SetInsertPoint(&II);
793
794 Value *Pred = II.getOperand(0);
795 Value *PtrOp = II.getOperand(1);
796 Type *VecTy = II.getType();
797 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
798
799 if (isAllActivePredicate(Pred)) {
800 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
801 return IC.replaceInstUsesWith(II, Load);
802 }
803
804 CallInst *MaskedLoad =
805 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
806 Pred, ConstantAggregateZero::get(VecTy));
807 return IC.replaceInstUsesWith(II, MaskedLoad);
808}
809
810static Optional<Instruction *>
811instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
812 IRBuilder<> Builder(II.getContext());
813 Builder.SetInsertPoint(&II);
814
815 Value *VecOp = II.getOperand(0);
816 Value *Pred = II.getOperand(1);
817 Value *PtrOp = II.getOperand(2);
818 Value *VecPtr =
819 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
820
821 if (isAllActivePredicate(Pred)) {
822 Builder.CreateStore(VecOp, VecPtr);
823 return IC.eraseInstFromFunction(II);
824 }
825
826 Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL),
827 Pred);
828 return IC.eraseInstFromFunction(II);
829}
830
831static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
832 switch (Intrinsic) {
833 case Intrinsic::aarch64_sve_fmul:
834 return Instruction::BinaryOps::FMul;
835 case Intrinsic::aarch64_sve_fadd:
836 return Instruction::BinaryOps::FAdd;
837 case Intrinsic::aarch64_sve_fsub:
838 return Instruction::BinaryOps::FSub;
839 default:
840 return Instruction::BinaryOpsEnd;
841 }
842}
843
844static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
845 IntrinsicInst &II) {
846 auto *OpPredicate = II.getOperand(0);
847 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
848 if (BinOpCode == Instruction::BinaryOpsEnd ||
849 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
850 m_ConstantInt<AArch64SVEPredPattern::all>())))
851 return None;
852 IRBuilder<> Builder(II.getContext());
853 Builder.SetInsertPoint(&II);
854 Builder.setFastMathFlags(II.getFastMathFlags());
855 auto BinOp =
856 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
857 return IC.replaceInstUsesWith(II, BinOp);
858}
859
860static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
861 IntrinsicInst &II) {
862 if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
863 return FMLA;
864 return instCombineSVEVectorBinOp(IC, II);
865}
866
867static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
868 IntrinsicInst &II) {
869 auto *OpPredicate = II.getOperand(0);
870 auto *OpMultiplicand = II.getOperand(1);
871 auto *OpMultiplier = II.getOperand(2);
872
873 IRBuilder<> Builder(II.getContext());
874 Builder.SetInsertPoint(&II);
875
876 // Return true if a given instruction is a unit splat value, false otherwise.
877 auto IsUnitSplat = [](auto *I) {
878 auto *SplatValue = getSplatValue(I);
879 if (!SplatValue)
880 return false;
881 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
882 };
883
884 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
885 // with a unit splat value, false otherwise.
886 auto IsUnitDup = [](auto *I) {
887 auto *IntrI = dyn_cast<IntrinsicInst>(I);
888 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
889 return false;
890
891 auto *SplatValue = IntrI->getOperand(2);
892 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
893 };
894
895 if (IsUnitSplat(OpMultiplier)) {
896 // [f]mul pg %n, (dupx 1) => %n
897 OpMultiplicand->takeName(&II);
898 return IC.replaceInstUsesWith(II, OpMultiplicand);
899 } else if (IsUnitDup(OpMultiplier)) {
900 // [f]mul pg %n, (dup pg 1) => %n
901 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
902 auto *DupPg = DupInst->getOperand(1);
903 // TODO: this is naive. The optimization is still valid if DupPg
904 // 'encompasses' OpPredicate, not only if they're the same predicate.
905 if (OpPredicate == DupPg) {
906 OpMultiplicand->takeName(&II);
907 return IC.replaceInstUsesWith(II, OpMultiplicand);
908 }
909 }
910
911 return instCombineSVEVectorBinOp(IC, II);
912}
913
914static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
915 IntrinsicInst &II) {
916 IRBuilder<> Builder(II.getContext());
917 Builder.SetInsertPoint(&II);
918 Value *UnpackArg = II.getArgOperand(0);
919 auto *RetTy = cast<ScalableVectorType>(II.getType());
920 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
921 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
922
923 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
924 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
925 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
926 ScalarArg =
927 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
928 Value *NewVal =
929 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
930 NewVal->takeName(&II);
931 return IC.replaceInstUsesWith(II, NewVal);
932 }
933
934 return None;
935}
936static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
937 IntrinsicInst &II) {
938 auto *OpVal = II.getOperand(0);
939 auto *OpIndices = II.getOperand(1);
940 VectorType *VTy = cast<VectorType>(II.getType());
941
942 // Check whether OpIndices is a constant splat value < minimal element count
943 // of result.
944 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
945 if (!SplatValue ||
946 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
947 return None;
948
949 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
950 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
951 IRBuilder<> Builder(II.getContext());
952 Builder.SetInsertPoint(&II);
953 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
954 auto *VectorSplat =
955 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
956
957 VectorSplat->takeName(&II);
958 return IC.replaceInstUsesWith(II, VectorSplat);
959}
960
961static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
962 IntrinsicInst &II) {
963 // Try to remove sequences of tuple get/set.
964 Value *SetTuple, *SetIndex, *SetValue;
965 auto *GetTuple = II.getArgOperand(0);
966 auto *GetIndex = II.getArgOperand(1);
967 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
968 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
969 // Make sure that the types of the current intrinsic and SetValue match
970 // in order to safely remove the sequence.
971 if (!match(GetTuple,
972 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
973 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
974 SetValue->getType() != II.getType())
975 return None;
976 // Case where we get the same index right after setting it.
977 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
978 if (GetIndex == SetIndex)
979 return IC.replaceInstUsesWith(II, SetValue);
980 // If we are getting a different index than what was set in the tuple_set
981 // intrinsic. We can just set the input tuple to the one up in the chain.
982 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
983 // --> tuple_get(SetTuple, GetIndex)
984 return IC.replaceOperand(II, 0, SetTuple);
985}
986
987static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
988 IntrinsicInst &II) {
989 // zip1(uzp1(A, B), uzp2(A, B)) --> A
990 // zip2(uzp1(A, B), uzp2(A, B)) --> B
991 Value *A, *B;
992 if (match(II.getArgOperand(0),
993 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
994 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
995 m_Specific(A), m_Specific(B))))
996 return IC.replaceInstUsesWith(
997 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
998
999 return None;
1000}
1001
1002static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
1003 IntrinsicInst &II) {
1004 Value *Mask = II.getOperand(0);
1005 Value *BasePtr = II.getOperand(1);
1006 Value *Index = II.getOperand(2);
1007 Type *Ty = II.getType();
1008 Type *BasePtrTy = BasePtr->getType();
1009 Value *PassThru = ConstantAggregateZero::get(Ty);
1010
1011 // Contiguous gather => masked load.
1012 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1013 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1014 Value *IndexBase;
1015 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1016 m_Value(IndexBase), m_SpecificInt(1)))) {
1017 IRBuilder<> Builder(II.getContext());
1018 Builder.SetInsertPoint(&II);
1019
1020 Align Alignment =
1021 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1022
1023 Type *VecPtrTy = PointerType::getUnqual(Ty);
1024 Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
1025 IndexBase);
1026 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1027 CallInst *MaskedLoad =
1028 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1029 MaskedLoad->takeName(&II);
1030 return IC.replaceInstUsesWith(II, MaskedLoad);
1031 }
1032
1033 return None;
1034}
1035
1036static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
1037 IntrinsicInst &II) {
1038 Value *Val = II.getOperand(0);
1039 Value *Mask = II.getOperand(1);
1040 Value *BasePtr = II.getOperand(2);
1041 Value *Index = II.getOperand(3);
1042 Type *Ty = Val->getType();
1043 Type *BasePtrTy = BasePtr->getType();
1044
1045 // Contiguous scatter => masked store.
1046 // (sve.ld1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1047 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1048 Value *IndexBase;
1049 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1050 m_Value(IndexBase), m_SpecificInt(1)))) {
1051 IRBuilder<> Builder(II.getContext());
1052 Builder.SetInsertPoint(&II);
1053
1054 Align Alignment =
1055 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1056
1057 Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
1058 IndexBase);
1059 Type *VecPtrTy = PointerType::getUnqual(Ty);
1060 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1061
1062 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1063
1064 return IC.eraseInstFromFunction(II);
1065 }
1066
1067 return None;
1068}
1069
1070static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1071 IntrinsicInst &II) {
1072 IRBuilder<> Builder(II.getContext());
1073 Builder.SetInsertPoint(&II);
1074 Type *Int32Ty = Builder.getInt32Ty();
1075 Value *Pred = II.getOperand(0);
1076 Value *Vec = II.getOperand(1);
1077 Value *DivVec = II.getOperand(2);
1078
1079 Value *SplatValue = getSplatValue(DivVec);
1080 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1081 if (!SplatConstantInt)
1082 return None;
1083 APInt Divisor = SplatConstantInt->getValue();
1084
1085 if (Divisor.isPowerOf2()) {
1086 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1087 auto ASRD = Builder.CreateIntrinsic(
1088 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1089 return IC.replaceInstUsesWith(II, ASRD);
1090 }
1091 if (Divisor.isNegatedPowerOf2()) {
1092 Divisor.negate();
1093 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1094 auto ASRD = Builder.CreateIntrinsic(
1095 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1096 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1097 {ASRD->getType()}, {ASRD, Pred, ASRD});
1098 return IC.replaceInstUsesWith(II, NEG);
1099 }
1100
1101 return None;
1102}
1103
1104Optional<Instruction *>
1105AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1106 IntrinsicInst &II) const {
1107 Intrinsic::ID IID = II.getIntrinsicID();
1108 switch (IID) {
1109 default:
1110 break;
1111 case Intrinsic::aarch64_sve_convert_from_svbool:
1112 return instCombineConvertFromSVBool(IC, II);
1113 case Intrinsic::aarch64_sve_dup:
1114 return instCombineSVEDup(IC, II);
1115 case Intrinsic::aarch64_sve_dup_x:
1116 return instCombineSVEDupX(IC, II);
1117 case Intrinsic::aarch64_sve_cmpne:
1118 case Intrinsic::aarch64_sve_cmpne_wide:
1119 return instCombineSVECmpNE(IC, II);
1120 case Intrinsic::aarch64_sve_rdffr:
1121 return instCombineRDFFR(IC, II);
1122 case Intrinsic::aarch64_sve_lasta:
1123 case Intrinsic::aarch64_sve_lastb:
1124 return instCombineSVELast(IC, II);
1125 case Intrinsic::aarch64_sve_cntd:
1126 return instCombineSVECntElts(IC, II, 2);
1127 case Intrinsic::aarch64_sve_cntw:
1128 return instCombineSVECntElts(IC, II, 4);
1129 case Intrinsic::aarch64_sve_cnth:
1130 return instCombineSVECntElts(IC, II, 8);
1131 case Intrinsic::aarch64_sve_cntb:
1132 return instCombineSVECntElts(IC, II, 16);
1133 case Intrinsic::aarch64_sve_ptest_any:
1134 case Intrinsic::aarch64_sve_ptest_first:
1135 case Intrinsic::aarch64_sve_ptest_last:
1136 return instCombineSVEPTest(IC, II);
1137 case Intrinsic::aarch64_sve_mul:
1138 case Intrinsic::aarch64_sve_fmul:
1139 return instCombineSVEVectorMul(IC, II);
1140 case Intrinsic::aarch64_sve_fadd:
1141 return instCombineSVEVectorFAdd(IC, II);
1142 case Intrinsic::aarch64_sve_fsub:
1143 return instCombineSVEVectorBinOp(IC, II);
1144 case Intrinsic::aarch64_sve_tbl:
1145 return instCombineSVETBL(IC, II);
1146 case Intrinsic::aarch64_sve_uunpkhi:
1147 case Intrinsic::aarch64_sve_uunpklo:
1148 case Intrinsic::aarch64_sve_sunpkhi:
1149 case Intrinsic::aarch64_sve_sunpklo:
1150 return instCombineSVEUnpack(IC, II);
1151 case Intrinsic::aarch64_sve_tuple_get:
1152 return instCombineSVETupleGet(IC, II);
1153 case Intrinsic::aarch64_sve_zip1:
1154 case Intrinsic::aarch64_sve_zip2:
1155 return instCombineSVEZip(IC, II);
1156 case Intrinsic::aarch64_sve_ld1_gather_index:
1157 return instCombineLD1GatherIndex(IC, II);
1158 case Intrinsic::aarch64_sve_st1_scatter_index:
1159 return instCombineST1ScatterIndex(IC, II);
1160 case Intrinsic::aarch64_sve_ld1:
1161 return instCombineSVELD1(IC, II, DL);
1162 case Intrinsic::aarch64_sve_st1:
1163 return instCombineSVEST1(IC, II, DL);
1164 case Intrinsic::aarch64_sve_sdiv:
1165 return instCombineSVESDIV(IC, II);
1166 }
1167
1168 return None;
1169}
1170
1171Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1172 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1173 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1174 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1175 SimplifyAndSetOp) const {
1176 switch (II.getIntrinsicID()) {
1177 default:
1178 break;
1179 case Intrinsic::aarch64_neon_fcvtxn:
1180 case Intrinsic::aarch64_neon_rshrn:
1181 case Intrinsic::aarch64_neon_sqrshrn:
1182 case Intrinsic::aarch64_neon_sqrshrun:
1183 case Intrinsic::aarch64_neon_sqshrn:
1184 case Intrinsic::aarch64_neon_sqshrun:
1185 case Intrinsic::aarch64_neon_sqxtn:
1186 case Intrinsic::aarch64_neon_sqxtun:
1187 case Intrinsic::aarch64_neon_uqrshrn:
1188 case Intrinsic::aarch64_neon_uqshrn:
1189 case Intrinsic::aarch64_neon_uqxtn:
1190 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1191 break;
1192 }
1193
1194 return None;
1195}
1196
1197bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1198 ArrayRef<const Value *> Args) {
1199
1200 // A helper that returns a vector type from the given type. The number of
1201 // elements in type Ty determine the vector width.
1202 auto toVectorTy = [&](Type *ArgTy) {
1203 return VectorType::get(ArgTy->getScalarType(),
1204 cast<VectorType>(DstTy)->getElementCount());
1205 };
1206
1207 // Exit early if DstTy is not a vector type whose elements are at least
1208 // 16-bits wide.
1209 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
1210 return false;
1211
1212 // Determine if the operation has a widening variant. We consider both the
1213 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1214 // instructions.
1215 //
1216 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
1217 // verify that their extending operands are eliminated during code
1218 // generation.
1219 switch (Opcode) {
1220 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1221 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1222 break;
1223 default:
1224 return false;
1225 }
1226
1227 // To be a widening instruction (either the "wide" or "long" versions), the
1228 // second operand must be a sign- or zero extend having a single user. We
1229 // only consider extends having a single user because they may otherwise not
1230 // be eliminated.
1231 if (Args.size() != 2 ||
1232 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
1233 !Args[1]->hasOneUse())
1234 return false;
1235 auto *Extend = cast<CastInst>(Args[1]);
1236
1237 // Legalize the destination type and ensure it can be used in a widening
1238 // operation.
1239 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
1240 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1241 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1242 return false;
1243
1244 // Legalize the source type and ensure it can be used in a widening
1245 // operation.
1246 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1247 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
1248 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1249 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1250 return false;
1251
1252 // Get the total number of vector elements in the legalized types.
1253 InstructionCost NumDstEls =
1254 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1255 InstructionCost NumSrcEls =
1256 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1257
1258 // Return true if the legalized types have the same number of vector elements
1259 // and the destination element type size is twice that of the source type.
1260 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1261}
1262
1263InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1264 Type *Src,
1265 TTI::CastContextHint CCH,
1266 TTI::TargetCostKind CostKind,
1267 const Instruction *I) {
1268 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1269 assert(ISD && "Invalid opcode");
1270
1271 // If the cast is observable, and it is used by a widening instruction (e.g.,
1272 // uaddl, saddw, etc.), it may be free.
1273 if (I && I->hasOneUse()) {
1274 auto *SingleUser = cast<Instruction>(*I->user_begin());
1275 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1276 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1277 // If the cast is the second operand, it is free. We will generate either
1278 // a "wide" or "long" version of the widening instruction.
1279 if (I == SingleUser->getOperand(1))
1280 return 0;
1281 // If the cast is not the second operand, it will be free if it looks the
1282 // same as the second operand. In this case, we will generate a "long"
1283 // version of the widening instruction.
1284 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1285 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1286 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1287 return 0;
1288 }
1289 }
1290
1291 // TODO: Allow non-throughput costs that aren't binary.
1292 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1293 if (CostKind != TTI::TCK_RecipThroughput)
1294 return Cost == 0 ? 0 : 1;
1295 return Cost;
1296 };
1297
1298 EVT SrcTy = TLI->getValueType(DL, Src);
1299 EVT DstTy = TLI->getValueType(DL, Dst);
1300
1301 if (!SrcTy.isSimple() || !DstTy.isSimple())
1302 return AdjustCost(
1303 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1304
1305 static const TypeConversionCostTblEntry
1306 ConversionTbl[] = {
1307 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1308 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
1309 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1310 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
1311
1312 // Truncations on nxvmiN
1313 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1314 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1315 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1316 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1317 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1318 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1319 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1320 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1321 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1322 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1323 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1324 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1325 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1326 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1327 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1328 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1329
1330 // The number of shll instructions for the extension.
1331 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1332 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1333 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1334 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1335 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1336 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1337 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1338 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1339 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1340 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1341 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1342 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1343 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1344 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1345 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1346 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1347
1348 // LowerVectorINT_TO_FP:
1349 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1350 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1351 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1352 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1353 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1354 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1355
1356 // Complex: to v2f32
1357 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1358 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1359 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1360 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1361 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1362 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1363
1364 // Complex: to v4f32
1365 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1366 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1367 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1368 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1369
1370 // Complex: to v8f32
1371 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1372 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1373 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1374 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1375
1376 // Complex: to v16f32
1377 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1378 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1379
1380 // Complex: to v2f64
1381 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1382 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1383 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1384 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1385 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1386 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1387
1388
1389 // LowerVectorFP_TO_INT
1390 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1391 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1392 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1393 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1394 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1395 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1396
1397 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1398 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1399 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1400 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1401 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1402 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1403 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1404
1405 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1406 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1407 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1408 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1409 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1410
1411 // Complex, from nxv2f32.
1412 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1413 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1414 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1415 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1416 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1417 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1418 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1419 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1420
1421 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1422 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1423 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1424 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1425 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1426 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1427 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1428
1429 // Complex, from nxv2f64.
1430 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1431 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1432 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1433 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1434 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1435 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1436 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1437 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1438
1439 // Complex, from nxv4f32.
1440 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1441 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1442 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1443 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1444 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1445 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1446 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1447 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1448
1449 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1450 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1451 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1452 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1453 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1454
1455 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1456 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1457 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1458 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1459 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1460 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1461 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1462
1463 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1464 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1465 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1466 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1467 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1468
1469 // Complex, from nxv8f16.
1470 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1471 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1472 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1473 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1474 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1475 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1476 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1477 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1478
1479 // Complex, from nxv4f16.
1480 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1481 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1482 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1483 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1484 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1485 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1486 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1487 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1488
1489 // Complex, from nxv2f16.
1490 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1491 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1492 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1493 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1494 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1495 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1496 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1497 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1498
1499 // Truncate from nxvmf32 to nxvmf16.
1500 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
1501 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
1502 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
1503
1504 // Truncate from nxvmf64 to nxvmf16.
1505 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
1506 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
1507 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
1508
1509 // Truncate from nxvmf64 to nxvmf32.
1510 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
1511 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
1512 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
1513
1514 // Extend from nxvmf16 to nxvmf32.
1515 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
1516 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
1517 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
1518
1519 // Extend from nxvmf16 to nxvmf64.
1520 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
1521 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
1522 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
1523
1524 // Extend from nxvmf32 to nxvmf64.
1525 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
1526 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
1527 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
1528
1529 };
1530
1531 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1532 DstTy.getSimpleVT(),
1533 SrcTy.getSimpleVT()))
1534 return AdjustCost(Entry->Cost);
1535
1536 return AdjustCost(
1537 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1538}
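A worked reading of the lookup above (illustrative, assuming the IR vector types map directly to the listed simple MVTs with no legalization split): a zext from <8 x i8> to <8 x i32> becomes ISD::ZERO_EXTEND with DstTy == MVT::v8i32 and SrcTy == MVT::v8i8, matching the entry { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, so the function returns AdjustCost(3): 3 under TCK_RecipThroughput and 1 under the other (binary) cost kinds, in line with the shll count noted in the table.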
1539
1540InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1541 Type *Dst,
1542 VectorType *VecTy,
1543 unsigned Index) {
1544
1545 // Make sure we were given a valid extend opcode.
1546 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1547        "Invalid opcode");
1548
1549 // We are extending an element we extract from a vector, so the source type
1550 // of the extend is the element type of the vector.
1551 auto *Src = VecTy->getElementType();
1552
1553 // Sign- and zero-extends are for integer types only.
1554 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1555
1556 // Get the cost for the extract. We compute the cost (if any) for the extend
1557 // below.
1558 InstructionCost Cost =
1559 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1560
1561 // Legalize the types.
1562 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1563 auto DstVT = TLI->getValueType(DL, Dst);
1564 auto SrcVT = TLI->getValueType(DL, Src);
1565 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1566
1567 // If the resulting type is still a vector and the destination type is legal,
1568 // we may get the extension for free. If not, get the default cost for the
1569 // extend.
1570 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1571 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1572 CostKind);
1573
1574 // The destination type should be larger than the element type. If not, get
1575 // the default cost for the extend.
1576 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1577 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1578 CostKind);
1579
1580 switch (Opcode) {
1581 default:
1582 llvm_unreachable("Opcode should be either SExt or ZExt");
1583
1584 // For sign-extends, we only need a smov, which performs the extension
1585 // automatically.
1586 case Instruction::SExt:
1587 return Cost;
1588
1589 // For zero-extends, the extend is performed automatically by a umov unless
1590 // the destination type is i64 and the element type is i8 or i16.
1591 case Instruction::ZExt:
1592 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1593 return Cost;
1594 }
1595
1596 // If we are unable to perform the extend for free, get the default cost.
1597 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1598 CostKind);
1599}
1600
1601InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1602 TTI::TargetCostKind CostKind,
1603 const Instruction *I) {
1604 if (CostKind != TTI::TCK_RecipThroughput)
1605 return Opcode == Instruction::PHI ? 0 : 1;
1606 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1607 // Branches are assumed to be predicted.
1608 return 0;
1609}
1610
1611InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1612 unsigned Index) {
1613 assert(Val->isVectorTy() && "This must be a vector type");
1614
1615 if (Index != -1U) {
1616 // Legalize the type.
1617 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1618
1619 // This type is legalized to a scalar type.
1620 if (!LT.second.isVector())
1621 return 0;
1622
1623 // The type may be split. For fixed-width vectors we can normalize the
1624 // index to the new type.
1625 if (LT.second.isFixedLengthVector()) {
1626 unsigned Width = LT.second.getVectorNumElements();
1627 Index = Index % Width;
1628 }
1629
1630 // The element at index zero is already inside the vector.
1631 if (Index == 0)
1632 return 0;
1633 }
1634
1635 // All other insert/extracts cost this much.
1636 return ST->getVectorInsertExtractBaseCost();
1637}
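For illustration (assuming a <8 x i32> value, which legalizes to two v4i32 registers): extracting element 5 normalizes the index to 5 % 4 == 1, which is nonzero, so the subtarget's getVectorInsertExtractBaseCost() is returned; extracting element 4 normalizes to 0 and is reported as free, while a query with the unknown index -1U skips the normalization and likewise falls through to the base cost.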
1638
1639InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1640 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1641 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1642 TTI::OperandValueProperties Opd1PropInfo,
1643 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1644 const Instruction *CxtI) {
1645 // TODO: Handle more cost kinds.
1646 if (CostKind != TTI::TCK_RecipThroughput)
1647 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1648 Opd2Info, Opd1PropInfo,
1649 Opd2PropInfo, Args, CxtI);
1650
1651 // Legalize the type.
1652 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1653
1654 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1655 // add in the widening overhead specified by the sub-target. Since the
1656 // extends feeding widening instructions are performed automatically, they
1657 // aren't present in the generated code and have a zero cost. By adding a
1658 // widening overhead here, we attach the total cost of the combined operation
1659 // to the widening instruction.
1660 InstructionCost Cost = 0;
1661 if (isWideningInstruction(Ty, Opcode, Args))
1662 Cost += ST->getWideningBaseCost();
1663
1664 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1665
1666 switch (ISD) {
1667 default:
1668 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1669 Opd2Info,
1670 Opd1PropInfo, Opd2PropInfo);
1671 case ISD::SDIV:
1672 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1673 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1674 // On AArch64, scalar signed division by a power-of-two constant is
1675 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1676 // The OperandValue properties may not be the same as those of the
1677 // previous operation; conservatively assume OP_None.
1678 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1679 Opd1Info, Opd2Info,
1680 TargetTransformInfo::OP_None,
1681 TargetTransformInfo::OP_None);
1682 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1683 Opd1Info, Opd2Info,
1684 TargetTransformInfo::OP_None,
1685 TargetTransformInfo::OP_None);
1686 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1687 Opd1Info, Opd2Info,
1688 TargetTransformInfo::OP_None,
1689 TargetTransformInfo::OP_None);
1690 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1691 Opd1Info, Opd2Info,
1692 TargetTransformInfo::OP_None,
1693 TargetTransformInfo::OP_None);
1694 return Cost;
1695 }
1696 LLVM_FALLTHROUGH;
1697 case ISD::UDIV:
1698 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1699 auto VT = TLI->getValueType(DL, Ty);
1700 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1701 // Vector signed division by a constant is expanded to the
1702 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1703 // to MULHS + SUB + SRL + ADD + SRL.
1704 InstructionCost MulCost = getArithmeticInstrCost(
1705 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1706 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1707 InstructionCost AddCost = getArithmeticInstrCost(
1708 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1709 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1710 InstructionCost ShrCost = getArithmeticInstrCost(
1711 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1712 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1713 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1714 }
1715 }
1716
1717 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1718 Opd2Info,
1719 Opd1PropInfo, Opd2PropInfo);
1720 if (Ty->isVectorTy()) {
1721 // On AArch64, vector divisions are not supported natively and are
1722 // expanded into scalar divisions of each pair of elements.
1723 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1724 Opd1Info, Opd2Info, Opd1PropInfo,
1725 Opd2PropInfo);
1726 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1727 Opd1Info, Opd2Info, Opd1PropInfo,
1728 Opd2PropInfo);
1729 // TODO: if one of the arguments is scalar, then it's not necessary to
1730 // double the cost of handling the vector elements.
1731 Cost += Cost;
1732 }
1733 return Cost;
1734
1735 case ISD::MUL:
1736 if (LT.second != MVT::v2i64)
1737 return (Cost + 1) * LT.first;
1738 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1739 // as elements are extracted from the vectors and the muls scalarized.
1740 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1741 // cost for an i64 vector directly here, which is:
1742 // - four i64 extracts,
1743 // - two i64 inserts, and
1744 // - two muls.
1745 // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
1746 // LT.first = 2 the cost is 16.
1747 return LT.first * 8;
1748 case ISD::ADD:
1749 case ISD::XOR:
1750 case ISD::OR:
1751 case ISD::AND:
1752 // These nodes are marked as 'custom' for combining purposes only.
1753 // We know that they are legal. See LowerAdd in ISelLowering.
1754 return (Cost + 1) * LT.first;
1755
1756 case ISD::FADD:
1757 case ISD::FSUB:
1758 case ISD::FMUL:
1759 case ISD::FDIV:
1760 case ISD::FNEG:
1761 // These nodes are marked as 'custom' just to lower them to SVE.
1762 // We know said lowering will incur no additional cost.
1763 if (!Ty->getScalarType()->isFP128Ty())
1764 return (Cost + 2) * LT.first;
1765
1766 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1767 Opd2Info,
1768 Opd1PropInfo, Opd2PropInfo);
1769 }
1770}
1771
1772InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1773 ScalarEvolution *SE,
1774 const SCEV *Ptr) {
1775 // Address computations in vectorized code with non-consecutive addresses will
1776 // likely result in more instructions compared to scalar code where the
1777 // computation can more often be merged into the index mode. The resulting
1778 // extra micro-ops can significantly decrease throughput.
1779 unsigned NumVectorInstToHideOverhead = 10;
1780 int MaxMergeDistance = 64;
1781
1782 if (Ty->isVectorTy() && SE &&
1783 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1784 return NumVectorInstToHideOverhead;
1785
1786 // In many cases the address computation is not merged into the instruction
1787 // addressing mode.
1788 return 1;
1789}
1790
1791InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1792 Type *CondTy,
1793 CmpInst::Predicate VecPred,
1794 TTI::TargetCostKind CostKind,
1795 const Instruction *I) {
1796 // TODO: Handle other cost kinds.
1797 if (CostKind != TTI::TCK_RecipThroughput)
1798 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1799 I);
1800
1801 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1802 // We don't lower some vector selects that are wider than the register
1803 // width very well.
1804 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1805 // We would need this many instructions to hide the scalarization happening.
1806 const int AmortizationCost = 20;
1807
1808 // If VecPred is not set, check if we can get a predicate from the context
1809 // instruction, if its type matches the requested ValTy.
1810 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1811 CmpInst::Predicate CurrentPred;
1812 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1813 m_Value())))
1814 VecPred = CurrentPred;
1815 }
1816 // Check if we have a compare/select chain that can be lowered using CMxx &
1817 // BFI pair.
1818 if (CmpInst::isIntPredicate(VecPred)) {
1819 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
1820 MVT::v8i16, MVT::v2i32, MVT::v4i32,
1821 MVT::v2i64};
1822 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1823 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
1824 return LT.first;
1825 }
1826
1827 static const TypeConversionCostTblEntry
1828 VectorSelectTbl[] = {
1829 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1830 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1831 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1832 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1833 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1834 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1835 };
1836
1837 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1838 EVT SelValTy = TLI->getValueType(DL, ValTy);
1839 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1840 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1841 SelCondTy.getSimpleVT(),
1842 SelValTy.getSimpleVT()))
1843 return Entry->Cost;
1844 }
1845 }
1846 // The base case handles scalable vectors fine for now, since it treats the
1847 // cost as 1 * legalization cost.
1848 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1849}
1850
1851AArch64TTIImpl::TTI::MemCmpExpansionOptions
1852AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1853 TTI::MemCmpExpansionOptions Options;
1854 if (ST->requiresStrictAlign()) {
1855 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1856 // a bunch of instructions when strict align is enabled.
1857 return Options;
1858 }
1859 Options.AllowOverlappingLoads = true;
1860 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1861 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1862 // TODO: Though vector loads usually perform well on AArch64, in some targets
1863 // they may wake up the FP unit, which raises the power consumption. Perhaps
1864 // they could be used with no holds barred (-O3).
1865 Options.LoadSizes = {8, 4, 2, 1};
1866 return Options;
1867}
1868
1869InstructionCost
1870AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1871 Align Alignment, unsigned AddressSpace,
1872 TTI::TargetCostKind CostKind) {
1873 if (useNeonVector(Src))
1874 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1875 CostKind);
1876 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1877 if (!LT.first.isValid())
1878 return InstructionCost::getInvalid();
1879
1880 // The code-generator is currently not able to handle scalable vectors
1881 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1882 // it. This change will be removed when code-generation for these types is
1883 // sufficiently reliable.
1884 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1885 return InstructionCost::getInvalid();
1886
1887 return LT.first * 2;
1888}
1889
1890static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
1891 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
1892}
1893
1894InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1895 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1896 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1897 if (useNeonVector(DataTy))
1898 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1899 Alignment, CostKind, I);
1900 auto *VT = cast<VectorType>(DataTy);
1901 auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1902 if (!LT.first.isValid())
1903 return InstructionCost::getInvalid();
1904
1905 // The code-generator is currently not able to handle scalable vectors
1906 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1907 // it. This change will be removed when code-generation for these types is
1908 // sufficiently reliable.
1909 if (cast<VectorType>(DataTy)->getElementCount() ==
1910 ElementCount::getScalable(1))
1911 return InstructionCost::getInvalid();
1912
1913 ElementCount LegalVF = LT.second.getVectorElementCount();
1914 InstructionCost MemOpCost =
1915 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1916 // Add on an overhead cost for using gathers/scatters.
1917 // TODO: At the moment this is applied uniformly across all CPUs, but at some
1918 // point we may want a per-CPU overhead.
1919 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
1920 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
1921}
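Putting the pieces together (a sketch, with the overheads left symbolic since SVEGatherOverhead and SVEScatterOverhead are defined outside this excerpt): for a gather of <vscale x 4 x i32>, LegalVF is a scalable count of 4, MemOpCost is the cost of one scalar i32 load multiplied by the gather overhead, and the returned value is LT.first * MemOpCost * getMaxNumElements(LegalVF), i.e. the per-lane load cost scaled by the assumed overhead and by the maximum number of lanes the target can hold at that VF.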
1922
1923bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1924 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1925}
1926
1927InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
1928 MaybeAlign Alignment,
1929 unsigned AddressSpace,
1930 TTI::TargetCostKind CostKind,
1931 const Instruction *I) {
1932 EVT VT = TLI->getValueType(DL, Ty, true);
1933 // Type legalization can't handle structs
1934 if (VT == MVT::Other)
1935 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
1936 CostKind);
1937
1938 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
1939 if (!LT.first.isValid())
1940 return InstructionCost::getInvalid();
1941
1942 // The code-generator is currently not able to handle scalable vectors
1943 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1944 // it. This change will be removed when code-generation for these types is
1945 // sufficiently reliable.
1946 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
1947 if (VTy->getElementCount() == ElementCount::getScalable(1))
1948 return InstructionCost::getInvalid();
1949
1950 // TODO: consider latency as well for TCK_SizeAndLatency.
1951 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1952 return LT.first;
1953
1954 if (CostKind != TTI::TCK_RecipThroughput)
1955 return 1;
1956
1957 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
1958 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
1959 // Unaligned stores are extremely inefficient. We don't split all
1960 // unaligned 128-bit stores because of the negative impact that has been
1961 // shown in practice on inlined block copy code.
1962 // We make such stores expensive so that we will only vectorize if there
1963 // are 6 other instructions getting vectorized.
1964 const int AmortizationCost = 6;
1965
1966 return LT.first * 2 * AmortizationCost;
1967 }
1968
1969 // Check truncating stores and extending loads.
1970 if (useNeonVector(Ty) &&
1971 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1972 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
1973 if (VT == MVT::v4i8)
1974 return 2;
1975 // Otherwise we need to scalarize.
1976 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
1977 }
1978
1979 return LT.first;
1980}
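A worked instance of the misaligned-store penalty above (assuming a subtarget where isMisaligned128StoreSlow() holds): a store of <4 x i32> with only 8-byte alignment legalizes to a single 128-bit store (LT.first == 1), so the returned cost is 1 * 2 * 6 == 12, versus 1 for the same store when it is 16-byte aligned.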
1981
1982InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1983 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1984 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1985 bool UseMaskForCond, bool UseMaskForGaps) {
1986 assert(Factor >= 2 && "Invalid interleave factor");
1987 auto *VecVTy = cast<FixedVectorType>(VecTy);
1988
1989 if (!UseMaskForCond && !UseMaskForGaps &&
1990 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1991 unsigned NumElts = VecVTy->getNumElements();
1992 auto *SubVecTy =
1993 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1994
1995 // ldN/stN only support legal vector types of 64 or 128 bits in size.
1996 // Accesses having vector types that are a multiple of 128 bits can be
1997 // matched to more than one ldN/stN instruction.
1998 bool UseScalable;
1999 if (NumElts % Factor == 0 &&
2000 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2001 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2002 }
2003
2004 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2005 Alignment, AddressSpace, CostKind,
2006 UseMaskForCond, UseMaskForGaps);
2007}
2008
2009InstructionCost
2010AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2011 InstructionCost Cost = 0;
2012 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2013 for (auto *I : Tys) {
2014 if (!I->isVectorTy())
2015 continue;
2016 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2017 128)
2018 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2019 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2020 }
2021 return Cost;
2022}
2023
2024unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2025 return ST->getMaxInterleaveFactor();
2026}
2027
2028// For Falkor, we want to avoid having too many strided loads in a loop since
2029// that can exhaust the HW prefetcher resources. We adjust the unroller
2030// MaxCount preference below to attempt to ensure unrolling doesn't create too
2031// many strided loads.
2032static void
2033getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2034 TargetTransformInfo::UnrollingPreferences &UP) {
2035 enum { MaxStridedLoads = 7 };
2036 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2037 int StridedLoads = 0;
2038 // FIXME? We could make this more precise by looking at the CFG and
2039 // e.g. not counting loads in each side of an if-then-else diamond.
2040 for (const auto BB : L->blocks()) {
2041 for (auto &I : *BB) {
2042 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2043 if (!LMemI)
2044 continue;
2045
2046 Value *PtrValue = LMemI->getPointerOperand();
2047 if (L->isLoopInvariant(PtrValue))
2048 continue;
2049
2050 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2051 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2052 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2053 continue;
2054
2055 // FIXME? We could take pairing of unrolled load copies into account
2056 // by looking at the AddRec, but we would probably have to limit this
2057 // to loops with no stores or other memory optimization barriers.
2058 ++StridedLoads;
2059 // We've seen enough strided loads that seeing more won't make a
2060 // difference.
2061 if (StridedLoads > MaxStridedLoads / 2)
2062 return StridedLoads;
2063 }
2064 }
2065 return StridedLoads;
2066 };
2067
2068 int StridedLoads = countStridedLoads(L, SE);
2069 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
7
Assuming 'DebugFlag' is false
8
Loop condition is false. Exiting loop
2070                   << " strided loads\n");
2071 // Pick the largest power of 2 unroll count that won't result in too many
2072 // strided loads.
2073 if (StridedLoads) {
9
Assuming 'StridedLoads' is not equal to 0
10
Taking true branch
2074 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
11
Calling 'Log2_32'
13
Returning from 'Log2_32'
14
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
2075 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2076                   << UP.MaxCount << '\n');
2077 }
2078}
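A minimal, self-contained sketch of the state the analyzer assumes at line 2074 (not part of the original source): it supposes that Log2_32 is implemented as 31 - countLeadingZeros(Value), so an input of 0 wraps around to UINT_MAX, and it picks an illustrative StridedLoads value larger than MaxStridedLoads; the name log2_32_sketch and the value 9 are hypothetical.

#include <climits>
#include <cstdio>

// Stand-in for llvm::Log2_32; for Value == 0 the subtraction wraps to UINT_MAX.
static unsigned log2_32_sketch(unsigned Value) {
  return 31 - (Value ? static_cast<unsigned>(__builtin_clz(Value)) : 32u);
}

int main() {
  const unsigned MaxStridedLoads = 7;
  unsigned StridedLoads = 9;  // hypothetical: more strided loads than the cap
  unsigned Shift = log2_32_sketch(MaxStridedLoads / StridedLoads); // 7 / 9 == 0
  std::printf("shift amount = %u\n", Shift);                       // prints 4294967295
  // unsigned MaxCount = 1 << Shift;  // undefined: shift >= bit-width of int
  return 0;
}

Note that countStridedLoads above returns as soon as StridedLoads exceeds MaxStridedLoads / 2, so on the paths it actually models the divisor is at most 4 and MaxStridedLoads / StridedLoads never reaches 0; the sketch only reproduces the unconstrained state the analyzer reasons about, which suggests the warning may be a false positive.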
2079
2080void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2081 TTI::UnrollingPreferences &UP,
2082 OptimizationRemarkEmitter *ORE) {
2083 // Enable partial unrolling and runtime unrolling.
2084 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2085
2086 UP.UpperBound = true;
2087
2088 // An inner loop is more likely to be hot, and its runtime check can be
2089 // hoisted out by the LICM pass, so the overhead is lower; try a larger
2090 // threshold to unroll more loops.
2091 if (L->getLoopDepth() > 1)
1
Assuming the condition is false
2
Taking false branch
2092 UP.PartialThreshold *= 2;
2093
2094 // Disable partial & runtime unrolling on -Os.
2095 UP.PartialOptSizeThreshold = 0;
2096
2097 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3
Assuming the condition is true
5
Taking true branch
2098 EnableFalkorHWPFUnrollFix)
4
Assuming the condition is true
2099 getFalkorUnrollingPreferences(L, SE, UP);
6
Calling 'getFalkorUnrollingPreferences'
2100
2101 // Scan the loop: don't unroll loops with calls as this could prevent
2102 // inlining. Don't unroll vector loops either, as they don't benefit much from
2103 // unrolling.
2104 for (auto *BB : L->getBlocks()) {
2105 for (auto &I : *BB) {
2106 // Don't unroll vectorised loop.
2107 if (I.getType()->isVectorTy())
2108 return;
2109
2110 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2111 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2112 if (!isLoweredToCall(F))
2113 continue;
2114 }
2115 return;
2116 }
2117 }
2118 }
2119
2120 // Enable runtime unrolling for in-order models
2121 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2122 // checking for that case, we can ensure that the default behaviour is
2123 // unchanged
2124 if (ST->getProcFamily() != AArch64Subtarget::Others &&
2125 !ST->getSchedModel().isOutOfOrder()) {
2126 UP.Runtime = true;
2127 UP.Partial = true;
2128 UP.UnrollRemainder = true;
2129 UP.DefaultUnrollRuntimeCount = 4;
2130
2131 UP.UnrollAndJam = true;
2132 UP.UnrollAndJamInnerLoopThreshold = 60;
2133 }
2134}
2135
2136void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2137 TTI::PeelingPreferences &PP) {
2138 BaseT::getPeelingPreferences(L, SE, PP);
2139}
2140
2141Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2142 Type *ExpectedType) {
2143 switch (Inst->getIntrinsicID()) {
2144 default:
2145 return nullptr;
2146 case Intrinsic::aarch64_neon_st2:
2147 case Intrinsic::aarch64_neon_st3:
2148 case Intrinsic::aarch64_neon_st4: {
2149 // Create a struct type
2150 StructType *ST = dyn_cast<StructType>(ExpectedType);
2151 if (!ST)
2152 return nullptr;
2153 unsigned NumElts = Inst->arg_size() - 1;
2154 if (ST->getNumElements() != NumElts)
2155 return nullptr;
2156 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2157 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2158 return nullptr;
2159 }
2160 Value *Res = UndefValue::get(ExpectedType);
2161 IRBuilder<> Builder(Inst);
2162 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2163 Value *L = Inst->getArgOperand(i);
2164 Res = Builder.CreateInsertValue(Res, L, i);
2165 }
2166 return Res;
2167 }
2168 case Intrinsic::aarch64_neon_ld2:
2169 case Intrinsic::aarch64_neon_ld3:
2170 case Intrinsic::aarch64_neon_ld4:
2171 if (Inst->getType() == ExpectedType)
2172 return Inst;
2173 return nullptr;
2174 }
2175}
2176
2177bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2178 MemIntrinsicInfo &Info) {
2179 switch (Inst->getIntrinsicID()) {
2180 default:
2181 break;
2182 case Intrinsic::aarch64_neon_ld2:
2183 case Intrinsic::aarch64_neon_ld3:
2184 case Intrinsic::aarch64_neon_ld4:
2185 Info.ReadMem = true;
2186 Info.WriteMem = false;
2187 Info.PtrVal = Inst->getArgOperand(0);
2188 break;
2189 case Intrinsic::aarch64_neon_st2:
2190 case Intrinsic::aarch64_neon_st3:
2191 case Intrinsic::aarch64_neon_st4:
2192 Info.ReadMem = false;
2193 Info.WriteMem = true;
2194 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2195 break;
2196 }
2197
2198 switch (Inst->getIntrinsicID()) {
2199 default:
2200 return false;
2201 case Intrinsic::aarch64_neon_ld2:
2202 case Intrinsic::aarch64_neon_st2:
2203 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2204 break;
2205 case Intrinsic::aarch64_neon_ld3:
2206 case Intrinsic::aarch64_neon_st3:
2207 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2208 break;
2209 case Intrinsic::aarch64_neon_ld4:
2210 case Intrinsic::aarch64_neon_st4:
2211 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2212 break;
2213 }
2214 return true;
2215}
2216
2217/// See if \p I should be considered for address type promotion. We check if \p
2218 /// I is a sext with the right type that is used in memory accesses. If it is
2219 /// used in a "complex" getelementptr, we allow it to be promoted without
2220 /// finding other sext instructions that sign extended the same initial value.
2221 /// A getelementptr is considered "complex" if it has more than 2 operands.
2222bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2223 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2224 bool Considerable = false;
2225 AllowPromotionWithoutCommonHeader = false;
2226 if (!isa<SExtInst>(&I))
2227 return false;
2228 Type *ConsideredSExtType =
2229 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2230 if (I.getType() != ConsideredSExtType)
2231 return false;
2232 // See if the sext is the one with the right type and used in at least one
2233 // GetElementPtrInst.
2234 for (const User *U : I.users()) {
2235 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2236 Considerable = true;
2237 // A getelementptr is considered as "complex" if it has more than 2
2238 // operands. We will promote a SExt used in such a complex GEP, as we
2239 // expect some computation to be merged if it is done on 64 bits.
2240 if (GEPInst->getNumOperands() > 2) {
2241 AllowPromotionWithoutCommonHeader = true;
2242 break;
2243 }
2244 }
2245 }
2246 return Considerable;
2247}
2248
2249bool AArch64TTIImpl::isLegalToVectorizeReduction(
2250 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2251 if (!VF.isScalable())
2252 return true;
2253
2254 Type *Ty = RdxDesc.getRecurrenceType();
2255 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2256 return false;
2257
2258 switch (RdxDesc.getRecurrenceKind()) {
2259 case RecurKind::Add:
2260 case RecurKind::FAdd:
2261 case RecurKind::And:
2262 case RecurKind::Or:
2263 case RecurKind::Xor:
2264 case RecurKind::SMin:
2265 case RecurKind::SMax:
2266 case RecurKind::UMin:
2267 case RecurKind::UMax:
2268 case RecurKind::FMin:
2269 case RecurKind::FMax:
2270 case RecurKind::SelectICmp:
2271 case RecurKind::SelectFCmp:
2272 case RecurKind::FMulAdd:
2273 return true;
2274 default:
2275 return false;
2276 }
2277}
2278
2279InstructionCost
2280AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2281 bool IsUnsigned,
2282 TTI::TargetCostKind CostKind) {
2283 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
2284
2285 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2286 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2287
2288 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2289        "Both vectors need to be equally scalable");
2290
2291 InstructionCost LegalizationCost = 0;
2292 if (LT.first > 1) {
2293 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2294 unsigned MinMaxOpcode =
2295 Ty->isFPOrFPVectorTy()
2296 ? Intrinsic::maxnum
2297 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2298 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2299 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2300 }
2301
2302 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2303}
2304
2305InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2306 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2307 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2308 InstructionCost LegalizationCost = 0;
2309 if (LT.first > 1) {
2310 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2311 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2312 LegalizationCost *= LT.first - 1;
2313 }
2314
2315 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2316 assert(ISD && "Invalid opcode");
2317 // Add the final reduction cost for the legal horizontal reduction
2318 switch (ISD) {
2319 case ISD::ADD:
2320 case ISD::AND:
2321 case ISD::OR:
2322 case ISD::XOR:
2323 case ISD::FADD:
2324 return LegalizationCost + 2;
2325 default:
2326 return InstructionCost::getInvalid();
2327 }
2328}
2329
2330InstructionCost
2331AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2332 Optional<FastMathFlags> FMF,
2333 TTI::TargetCostKind CostKind) {
2334 if (TTI::requiresOrderedReduction(FMF)) {
2335 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2336 InstructionCost BaseCost =
2337 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2338 // Add on extra cost to reflect the extra overhead on some CPUs. We still
2339 // end up vectorizing for more computationally intensive loops.
2340 return BaseCost + FixedVTy->getNumElements();
2341 }
2342
2343 if (Opcode != Instruction::FAdd)
2344 return InstructionCost::getInvalid();
2345
2346 auto *VTy = cast<ScalableVectorType>(ValTy);
2347 InstructionCost Cost =
2348 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2349 Cost *= getMaxNumElements(VTy->getElementCount());
2350 return Cost;
2351 }
2352
2353 if (isa<ScalableVectorType>(ValTy))
2354 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2355
2356 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2357 MVT MTy = LT.second;
2358 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2359 assert(ISD && "Invalid opcode");
2360
2361 // Horizontal adds can use the 'addv' instruction. We model the cost of these
2362 // instructions as twice a normal vector add, plus 1 for each legalization
2363 // step (LT.first). This is the only arithmetic vector reduction operation for
2364 // which we have an instruction.
2365 // OR, XOR and AND costs should match the codegen from:
2366 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2367 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2368 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
2369 static const CostTblEntry CostTblNoPairwise[]{
2370 {ISD::ADD, MVT::v8i8, 2},
2371 {ISD::ADD, MVT::v16i8, 2},
2372 {ISD::ADD, MVT::v4i16, 2},
2373 {ISD::ADD, MVT::v8i16, 2},
2374 {ISD::ADD, MVT::v4i32, 2},
2375 {ISD::OR, MVT::v8i8, 15},
2376 {ISD::OR, MVT::v16i8, 17},
2377 {ISD::OR, MVT::v4i16, 7},
2378 {ISD::OR, MVT::v8i16, 9},
2379 {ISD::OR, MVT::v2i32, 3},
2380 {ISD::OR, MVT::v4i32, 5},
2381 {ISD::OR, MVT::v2i64, 3},
2382 {ISD::XOR, MVT::v8i8, 15},
2383 {ISD::XOR, MVT::v16i8, 17},
2384 {ISD::XOR, MVT::v4i16, 7},
2385 {ISD::XOR, MVT::v8i16, 9},
2386 {ISD::XOR, MVT::v2i32, 3},
2387 {ISD::XOR, MVT::v4i32, 5},
2388 {ISD::XOR, MVT::v2i64, 3},
2389 {ISD::AND, MVT::v8i8, 15},
2390 {ISD::AND, MVT::v16i8, 17},
2391 {ISD::AND, MVT::v4i16, 7},
2392 {ISD::AND, MVT::v8i16, 9},
2393 {ISD::AND, MVT::v2i32, 3},
2394 {ISD::AND, MVT::v4i32, 5},
2395 {ISD::AND, MVT::v2i64, 3},
2396 };
2397 switch (ISD) {
2398 default:
2399 break;
2400 case ISD::ADD:
2401 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2402 return (LT.first - 1) + Entry->Cost;
2403 break;
2404 case ISD::XOR:
2405 case ISD::AND:
2406 case ISD::OR:
2407 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2408 if (!Entry)
2409 break;
2410 auto *ValVTy = cast<FixedVectorType>(ValTy);
2411 if (!ValVTy->getElementType()->isIntegerTy(1) &&
2412 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2413 isPowerOf2_32(ValVTy->getNumElements())) {
2414 InstructionCost ExtraCost = 0;
2415 if (LT.first != 1) {
2416 // Type needs to be split, so there is an extra cost of LT.first - 1
2417 // arithmetic ops.
2418 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2419 MTy.getVectorNumElements());
2420 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2421 ExtraCost *= LT.first - 1;
2422 }
2423 return Entry->Cost + ExtraCost;
2424 }
2425 break;
2426 }
2427 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2428}
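As a concrete reading of the table above (assuming a <8 x i32> add reduction, which legalizes into two v4i32 halves so that LT.first == 2): the ISD::ADD / MVT::v4i32 entry costs 2, and the function returns (LT.first - 1) + 2 == 3, i.e. one extra vector add to combine the two halves plus the modeled addv cost.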
2429
2430InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2431 static const CostTblEntry ShuffleTbl[] = {
2432 { TTI::SK_Splice, MVT::nxv16i8, 1 },
2433 { TTI::SK_Splice, MVT::nxv8i16, 1 },
2434 { TTI::SK_Splice, MVT::nxv4i32, 1 },
2435 { TTI::SK_Splice, MVT::nxv2i64, 1 },
2436 { TTI::SK_Splice, MVT::nxv2f16, 1 },
2437 { TTI::SK_Splice, MVT::nxv4f16, 1 },
2438 { TTI::SK_Splice, MVT::nxv8f16, 1 },
2439 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2440 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2441 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2442 { TTI::SK_Splice, MVT::nxv2f32, 1 },
2443 { TTI::SK_Splice, MVT::nxv4f32, 1 },
2444 { TTI::SK_Splice, MVT::nxv2f64, 1 },
2445 };
2446
2447 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2448 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2449 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2450 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2451 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2452 : LT.second;
2453 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2454 InstructionCost LegalizationCost = 0;
2455 if (Index < 0) {
2456 LegalizationCost =
2457 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2458 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2459 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2460 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2461 }
2462
2463 // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2464 // The cost is computed on the promoted type.
2465 if (LT.second.getScalarType() == MVT::i1) {
2466 LegalizationCost +=
2467 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2468 TTI::CastContextHint::None, CostKind) +
2469 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2470 TTI::CastContextHint::None, CostKind);
2471 }
2472 const auto *Entry =
2473 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2474 assert(Entry && "Illegal Type for Splice");
2475 LegalizationCost += Entry->Cost;
2476 return LegalizationCost * LT.first;
2477}
2478
2479InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2480 VectorType *Tp,
2481 ArrayRef<int> Mask, int Index,
2482 VectorType *SubTp) {
2483 Kind = improveShuffleKindFromMask(Kind, Mask);
2484 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2485 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2486 Kind == TTI::SK_Reverse) {
2487 static const CostTblEntry ShuffleTbl[] = {
2488 // Broadcast shuffle kinds can be performed with 'dup'.
2489 { TTI::SK_Broadcast, MVT::v8i8, 1 },
2490 { TTI::SK_Broadcast, MVT::v16i8, 1 },
2491 { TTI::SK_Broadcast, MVT::v4i16, 1 },
2492 { TTI::SK_Broadcast, MVT::v8i16, 1 },
2493 { TTI::SK_Broadcast, MVT::v2i32, 1 },
2494 { TTI::SK_Broadcast, MVT::v4i32, 1 },
2495 { TTI::SK_Broadcast, MVT::v2i64, 1 },
2496 { TTI::SK_Broadcast, MVT::v2f32, 1 },
2497 { TTI::SK_Broadcast, MVT::v4f32, 1 },
2498 { TTI::SK_Broadcast, MVT::v2f64, 1 },
2499 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2500 // 'zip1/zip2' instructions.
2501 { TTI::SK_Transpose, MVT::v8i8, 1 },
2502 { TTI::SK_Transpose, MVT::v16i8, 1 },
2503 { TTI::SK_Transpose, MVT::v4i16, 1 },
2504 { TTI::SK_Transpose, MVT::v8i16, 1 },
2505 { TTI::SK_Transpose, MVT::v2i32, 1 },
2506 { TTI::SK_Transpose, MVT::v4i32, 1 },
2507 { TTI::SK_Transpose, MVT::v2i64, 1 },
2508 { TTI::SK_Transpose, MVT::v2f32, 1 },
2509 { TTI::SK_Transpose, MVT::v4f32, 1 },
2510 { TTI::SK_Transpose, MVT::v2f64, 1 },
2511 // Select shuffle kinds.
2512 // TODO: handle vXi8/vXi16.
2513 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2514 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2515 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2516 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2517 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2518 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2519 // PermuteSingleSrc shuffle kinds.
2520 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2521 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2522 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2523 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2524 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2525 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2526 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2527 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2528 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2529 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2530 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2531 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2532 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2533 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2534 // Reverse can be lowered with `rev`.
2535 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2536 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2537 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2538 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2539 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2540 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2541 // Broadcast shuffle kinds for scalable vectors
2542 { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2543 { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2544 { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2545 { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2546 { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2547 { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2548 { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2549 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2550 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2551 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2552 { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2553 { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2554 { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2555 { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2556 { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2557 { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2558 { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2559 // Handle the cases for vector.reverse with scalable vectors
2560 { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2561 { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2562 { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2563 { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2564 { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2565 { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2566 { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2567 { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2568 { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2569 { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2570 { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2571 { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2572 { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2573 { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2574 { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2575 { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2576 { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2577 };
2578 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2579 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2580 return LT.first * Entry->Cost;
2581 }
2582 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2583 return getSpliceCost(Tp, Index);
2584 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2585}

/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
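
The N == 0 special case above is what keeps the expression defined: without it the function would compute T(-1) >> Bits, a shift by the full width of the type, which is undefined behavior. A minimal compile-time spot check of the formula (assumed values, not part of MathExtras.h):

#include <cstdint>

// maskTrailingOnes<uint8_t>(3) and maskTrailingOnes<uint32_t>(8), written out by hand.
static_assert((uint8_t(-1) >> (8 - 3)) == 0x07, "low 3 bits set");
static_assert((uint32_t(-1) >> (32 - 8)) == 0xFFu, "low 8 bits set");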
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
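
The XOR in findLastSet works because, for a nonzero W-bit value, countLeadingZeros returns a number in [0, W-1], and within that range x ^ (W-1) equals (W-1) - x while still letting the compiler fold the operation into the clz/bsr instruction, as the comment notes. A small sketch for W == 32 (assumed values, not from the header):

// For a 32-bit type, digits - 1 == 31, so clz ^ 31 == 31 - clz whenever clz is in [0, 31].
static_assert((0 ^ 31) == 31 - 0, "");
static_assert((26 ^ 31) == 31 - 26, "e.g. findLastSet(0x20u): clz == 26, bit index == 5");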
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number).");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
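
Splitting isUInt into the two overloads above means the N >= 64 case never instantiates the UINT64_C(1) << N expression at all, which is what the comment says would otherwise draw the spurious MSVC warning. An assumed usage sketch built on the header itself:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// isUInt is constexpr, so these checks can run at compile time.
static_assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256), "8-bit range check");
static_assert(llvm::isUInt<64>(UINT64_MAX), "N >= 64 overload always returns true");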
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
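
The shift-right form above sidesteps the N == 64 corner case the comment describes: UINT64_MAX >> 0 is perfectly defined, whereas (uint64_t(1) << 64) - 1 would not be. Two assumed spot checks of the identity:

#include <cstdint>

static_assert((UINT64_MAX >> (64 - 8)) == 0xFFull, "maxUIntN(8)");
static_assert((UINT64_MAX >> (64 - 64)) == UINT64_MAX, "maxUIntN(64) without any shift by 64");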
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
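
The non-builtin branches above are the classic SWAR reduction: bits are summed in pairs, then nibbles, and a final multiply gathers every byte's count into the top byte. A small assumed usage sketch of the public entry point (countPopulation is not constexpr, so these are runtime asserts):

#include "llvm/Support/MathExtras.h"
#include <cassert>

void popcountExample() {
  assert(llvm::countPopulation(0xF000F000u) == 8);   // matches the doc comment example
  assert(llvm::countPopulation(0xFFFFFFFFu) == 32);
  assert(llvm::countPopulation(0u) == 0);
}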
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
12. Returning the value 4294967295
598}
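
This is the step the analyzer path flags above: with the default ZB_Width behavior, countLeadingZeros(0u) returns 32, so Log2_32(0) evaluates 31 - 32, which wraps to 4294967295 in the unsigned return type (the value noted in step 12). If the caller flagged in the bug summary then uses that result as a shift amount, it shifts by a value greater than or equal to the width of int, which is the reported undefined behavior. A minimal standalone sketch of the failure mode (hypothetical caller, not code from this report):

#include <cstdint>
#include <iostream>

// Simplified stand-ins for the helpers above.
static unsigned clz32(uint32_t V) { return V ? __builtin_clz(V) : 32; } // ZB_Width behavior
static unsigned log2_32(uint32_t V) { return 31 - clz32(V); }           // wraps for V == 0

int main() {
  unsigned ShiftAmt = log2_32(0);       // 31 - 32 == 4294967295
  std::cout << ShiftAmt << '\n';        // prints 4294967295
  // int Mask = 1 << ShiftAmt;          // shift by >= 32: exactly the UB being reported
}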
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
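
The expression (A | B) & (1 + ~(A | B)) isolates the lowest set bit of A | B, i.e. the largest power of two dividing both inputs; 1 + ~x is simply -x spelled so MSVC does not warn about negating an unsigned value. Two assumed spot checks:

static_assert(((8u | 12u) & (1u + ~(8u | 12u))) == 4u, "MinAlign(8, 12) == 4");
static_assert(((16u | 32u) & (1u + ~(16u | 32u))) == 16u, "MinAlign(16, 32) == 16");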
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
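
The OR cascade above smears the highest set bit downward so every lower bit becomes 1; adding 1 then carries past them all, giving the next power of two strictly greater than the input, and wrapping to 0 when the input is already all ones, matching the overflow note. A minimal constexpr restatement for illustration (assumed, mirrors the body above):

#include <cstdint>

constexpr uint64_t nextPow2Sketch(uint64_t A) {
  A |= A >> 1;  A |= A >> 2;  A |= A >> 4;
  A |= A >> 8;  A |= A >> 16; A |= A >> 32;
  return A + 1; // all bits below the top set bit are now 1
}
static_assert(nextPow2Sketch(5) == 8, "");
static_assert(nextPow2Sketch(8) == 16, "strictly greater than a power-of-two input");
static_assert(nextPow2Sketch(~0ull) == 0, "zero on overflow");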
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value and is
752/// \p Skew mod \p Align. \p Align must be non-zero
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
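
The shift pair relies on the left shift placing bit B-1 into the sign position and the arithmetic right shift smearing it back down, which LLVM assumes of the host compiler (two's complement with arithmetic shifts of signed values). A worked example with assumed values, sign-extending the low 4 bits of 0xC:

#include <cstdint>

// 0xC << 28 == 0xC0000000; reinterpreted as int32_t it is negative, and the
// arithmetic shift right by 28 yields -4 (0xFFFFFFFC).
static_assert((int32_t(uint32_t(0xC) << (32 - 4)) >> (32 - 4)) == -4,
              "SignExtend32<4>(0xC) == -4 on a two's-complement host");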
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
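
The Log2 pre-check above classifies most inputs without needing a wider multiply: if floor(log2 X) + floor(log2 Y) is below floor(log2 Max) the product cannot overflow, if it is above it must, and only the boundary case falls through to the halved multiply plus SaturatingAdd. An assumed usage sketch against the header itself:

#include "llvm/Support/MathExtras.h"
#include <cassert>

void saturatingMultiplyExample() {
  bool Overflowed = false;
  assert(llvm::SaturatingMultiply<uint8_t>(16, 15, &Overflowed) == 240 && !Overflowed);
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed); // clamped
}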
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned addition.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
959
960} // End llvm namespace
961
962#endif