Bug Summary

File: /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 2188, column 21
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
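The flagged line 2188 is not part of the excerpt below, so the following is only a hedged, standalone sketch of the defect class the checker reports: a left shift whose right operand can become ~0U (4294967295), which is greater than or equal to the 32-bit width of int and therefore undefined behaviour. All names in the sketch are invented for illustration and do not correspond to the analyzed LLVM code.

// Illustrative only -- not the code at line 2188.
unsigned selectShiftAmount(bool found) {
  // A common source of the diagnosed value: a "not found" sentinel of ~0U
  // (4294967295) that later flows into a shift amount unchecked.
  return found ? 3u : ~0u;
}

int scaleByShift(int value, bool found) {
  unsigned Shift = selectShiftAmount(found);
  // If 'found' is false, 'Shift' is 4294967295, which is >= the width of
  // 'int', so this left shift is undefined behaviour ([expr.shift]). This is
  // the situation the analyzer diagnoses.
  return value << Shift;
}

int scaleByShiftChecked(int value, bool found) {
  unsigned Shift = selectShiftAmount(found);
  // One way to satisfy the checker: prove the shift amount is strictly less
  // than the bit width before the shift is evaluated.
  if (Shift >= 32)
    return value;
  return value << Shift;
}

The usual remedy is to guard or assert that the shift amount is in range on every path the analyzer can follow to the shift expression.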

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AArch64 -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig 
-D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "MCTargetDesc/AArch64AddressingModes.h"
12#include "llvm/Analysis/IVDescriptors.h"
13#include "llvm/Analysis/LoopInfo.h"
14#include "llvm/Analysis/TargetTransformInfo.h"
15#include "llvm/CodeGen/BasicTTIImpl.h"
16#include "llvm/CodeGen/CostTable.h"
17#include "llvm/CodeGen/TargetLowering.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/IntrinsicsAArch64.h"
21#include "llvm/IR/PatternMatch.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <algorithm>
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "aarch64tti"
29
30static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31 cl::init(true), cl::Hidden);
32
33static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
34 cl::Hidden);
35
36static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
37 cl::init(10), cl::Hidden);
38
39bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
40 const Function *Callee) const {
41 const TargetMachine &TM = getTLI()->getTargetMachine();
42
43 const FeatureBitset &CallerBits =
44 TM.getSubtargetImpl(*Caller)->getFeatureBits();
45 const FeatureBitset &CalleeBits =
46 TM.getSubtargetImpl(*Callee)->getFeatureBits();
47
48 // Inline a callee if its target-features are a subset of the caller's
49 // target-features.
50 return (CallerBits & CalleeBits) == CalleeBits;
51}
52
53/// Calculate the cost of materializing a 64-bit value. This helper
54/// method might only calculate a fraction of a larger immediate. Therefore it
55/// is valid to return a cost of ZERO.
56InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
57 // Check if the immediate can be encoded within an instruction.
58 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
59 return 0;
60
61 if (Val < 0)
62 Val = ~Val;
63
64 // Calculate how many moves we will need to materialize this constant.
65 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
66 AArch64_IMM::expandMOVImm(Val, 64, Insn);
67 return Insn.size();
68}
69
70/// Calculate the cost of materializing the given constant.
71InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
72 TTI::TargetCostKind CostKind) {
73 assert(Ty->isIntegerTy());
74
75 unsigned BitSize = Ty->getPrimitiveSizeInBits();
76 if (BitSize == 0)
77 return ~0U;
78
79 // Sign-extend all constants to a multiple of 64-bit.
80 APInt ImmVal = Imm;
81 if (BitSize & 0x3f)
82 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
83
84 // Split the constant into 64-bit chunks and calculate the cost for each
85 // chunk.
86 InstructionCost Cost = 0;
87 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
88 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
89 int64_t Val = Tmp.getSExtValue();
90 Cost += getIntImmCost(Val);
91 }
92 // We need at least one instruction to materialize the constant.
93 return std::max<InstructionCost>(1, Cost);
94}
95
96InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
97 const APInt &Imm, Type *Ty,
98 TTI::TargetCostKind CostKind,
99 Instruction *Inst) {
100 assert(Ty->isIntegerTy());
101
102 unsigned BitSize = Ty->getPrimitiveSizeInBits();
103 // There is no cost model for constants with a bit size of 0. Return TCC_Free
104 // here, so that constant hoisting will ignore this constant.
105 if (BitSize == 0)
106 return TTI::TCC_Free;
107
108 unsigned ImmIdx = ~0U;
109 switch (Opcode) {
110 default:
111 return TTI::TCC_Free;
112 case Instruction::GetElementPtr:
113 // Always hoist the base address of a GetElementPtr.
114 if (Idx == 0)
115 return 2 * TTI::TCC_Basic;
116 return TTI::TCC_Free;
117 case Instruction::Store:
118 ImmIdx = 0;
119 break;
120 case Instruction::Add:
121 case Instruction::Sub:
122 case Instruction::Mul:
123 case Instruction::UDiv:
124 case Instruction::SDiv:
125 case Instruction::URem:
126 case Instruction::SRem:
127 case Instruction::And:
128 case Instruction::Or:
129 case Instruction::Xor:
130 case Instruction::ICmp:
131 ImmIdx = 1;
132 break;
133 // Always return TCC_Free for the shift value of a shift instruction.
134 case Instruction::Shl:
135 case Instruction::LShr:
136 case Instruction::AShr:
137 if (Idx == 1)
138 return TTI::TCC_Free;
139 break;
140 case Instruction::Trunc:
141 case Instruction::ZExt:
142 case Instruction::SExt:
143 case Instruction::IntToPtr:
144 case Instruction::PtrToInt:
145 case Instruction::BitCast:
146 case Instruction::PHI:
147 case Instruction::Call:
148 case Instruction::Select:
149 case Instruction::Ret:
150 case Instruction::Load:
151 break;
152 }
153
154 if (Idx == ImmIdx) {
155 int NumConstants = (BitSize + 63) / 64;
156 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
157 return (Cost <= NumConstants * TTI::TCC_Basic)
158 ? static_cast<int>(TTI::TCC_Free)
159 : Cost;
160 }
161 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
162}
163
164InstructionCost
165AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
166 const APInt &Imm, Type *Ty,
167 TTI::TargetCostKind CostKind) {
168 assert(Ty->isIntegerTy());
169
170 unsigned BitSize = Ty->getPrimitiveSizeInBits();
171 // There is no cost model for constants with a bit size of 0. Return TCC_Free
172 // here, so that constant hoisting will ignore this constant.
173 if (BitSize == 0)
174 return TTI::TCC_Free;
175
176 // Most (all?) AArch64 intrinsics do not support folding immediates into the
177 // selected instruction, so we compute the materialization cost for the
178 // immediate directly.
179 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
180 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
181
182 switch (IID) {
183 default:
184 return TTI::TCC_Free;
185 case Intrinsic::sadd_with_overflow:
186 case Intrinsic::uadd_with_overflow:
187 case Intrinsic::ssub_with_overflow:
188 case Intrinsic::usub_with_overflow:
189 case Intrinsic::smul_with_overflow:
190 case Intrinsic::umul_with_overflow:
191 if (Idx == 1) {
192 int NumConstants = (BitSize + 63) / 64;
193 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
194 return (Cost <= NumConstants * TTI::TCC_Basic)
195 ? static_cast<int>(TTI::TCC_Free)
196 : Cost;
197 }
198 break;
199 case Intrinsic::experimental_stackmap:
200 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
201 return TTI::TCC_Free;
202 break;
203 case Intrinsic::experimental_patchpoint_void:
204 case Intrinsic::experimental_patchpoint_i64:
205 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
206 return TTI::TCC_Free;
207 break;
208 case Intrinsic::experimental_gc_statepoint:
209 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
210 return TTI::TCC_Free;
211 break;
212 }
213 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
214}
215
216TargetTransformInfo::PopcntSupportKind
217AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
218 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
219 if (TyWidth == 32 || TyWidth == 64)
220 return TTI::PSK_FastHardware;
221 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
222 return TTI::PSK_Software;
223}
224
225InstructionCost
226AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
227 TTI::TargetCostKind CostKind) {
228 auto *RetTy = ICA.getReturnType();
229 switch (ICA.getID()) {
230 case Intrinsic::umin:
231 case Intrinsic::umax:
232 case Intrinsic::smin:
233 case Intrinsic::smax: {
234 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
235 MVT::v8i16, MVT::v2i32, MVT::v4i32};
236 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
237 // v2i64 types get converted to cmp+bif hence the cost of 2
238 if (LT.second == MVT::v2i64)
239 return LT.first * 2;
240 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
241 return LT.first;
242 break;
243 }
244 case Intrinsic::sadd_sat:
245 case Intrinsic::ssub_sat:
246 case Intrinsic::uadd_sat:
247 case Intrinsic::usub_sat: {
248 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
249 MVT::v8i16, MVT::v2i32, MVT::v4i32,
250 MVT::v2i64};
251 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
252 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
253 // need to extend the type, as it uses shr(qadd(shl, shl)).
254 unsigned Instrs =
255 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
256 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
257 return LT.first * Instrs;
258 break;
259 }
260 case Intrinsic::abs: {
261 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
262 MVT::v8i16, MVT::v2i32, MVT::v4i32,
263 MVT::v2i64};
264 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
265 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
266 return LT.first;
267 break;
268 }
269 case Intrinsic::experimental_stepvector: {
270 InstructionCost Cost = 1; // Cost of the `index' instruction
271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
272 // Legalisation of illegal vectors involves an `index' instruction plus
273 // (LT.first - 1) vector adds.
274 if (LT.first > 1) {
275 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
276 InstructionCost AddCost =
277 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
278 Cost += AddCost * (LT.first - 1);
279 }
280 return Cost;
281 }
282 case Intrinsic::bitreverse: {
283 static const CostTblEntry BitreverseTbl[] = {
284 {Intrinsic::bitreverse, MVT::i32, 1},
285 {Intrinsic::bitreverse, MVT::i64, 1},
286 {Intrinsic::bitreverse, MVT::v8i8, 1},
287 {Intrinsic::bitreverse, MVT::v16i8, 1},
288 {Intrinsic::bitreverse, MVT::v4i16, 2},
289 {Intrinsic::bitreverse, MVT::v8i16, 2},
290 {Intrinsic::bitreverse, MVT::v2i32, 2},
291 {Intrinsic::bitreverse, MVT::v4i32, 2},
292 {Intrinsic::bitreverse, MVT::v1i64, 2},
293 {Intrinsic::bitreverse, MVT::v2i64, 2},
294 };
295 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
296 const auto *Entry =
297 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
298 if (Entry) {
299 // The cost model uses the legal type (i32) that i8 and i16 will be
300 // converted to, plus 1 so that we match the actual lowering cost.
301 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
302 TLI->getValueType(DL, RetTy, true) == MVT::i16)
303 return LegalisationCost.first * Entry->Cost + 1;
304
305 return LegalisationCost.first * Entry->Cost;
306 }
307 break;
308 }
309 case Intrinsic::ctpop: {
310 static const CostTblEntry CtpopCostTbl[] = {
311 {ISD::CTPOP, MVT::v2i64, 4},
312 {ISD::CTPOP, MVT::v4i32, 3},
313 {ISD::CTPOP, MVT::v8i16, 2},
314 {ISD::CTPOP, MVT::v16i8, 1},
315 {ISD::CTPOP, MVT::i64, 4},
316 {ISD::CTPOP, MVT::v2i32, 3},
317 {ISD::CTPOP, MVT::v4i16, 2},
318 {ISD::CTPOP, MVT::v8i8, 1},
319 {ISD::CTPOP, MVT::i32, 5},
320 };
321 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
322 MVT MTy = LT.second;
323 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
324 // Extra cost of +1 when illegal vector types are legalized by promoting
325 // the integer type.
326 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
327 RetTy->getScalarSizeInBits()
328 ? 1
329 : 0;
330 return LT.first * Entry->Cost + ExtraCost;
331 }
332 break;
333 }
334 case Intrinsic::sadd_with_overflow:
335 case Intrinsic::uadd_with_overflow:
336 case Intrinsic::ssub_with_overflow:
337 case Intrinsic::usub_with_overflow:
338 case Intrinsic::smul_with_overflow:
339 case Intrinsic::umul_with_overflow: {
340 static const CostTblEntry WithOverflowCostTbl[] = {
341 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
342 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
343 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
344 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
345 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
346 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
347 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
348 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
349 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
350 {Intrinsic::usub_with_overflow, MVT::i8, 3},
351 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
352 {Intrinsic::usub_with_overflow, MVT::i16, 3},
353 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
354 {Intrinsic::usub_with_overflow, MVT::i32, 1},
355 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
356 {Intrinsic::usub_with_overflow, MVT::i64, 1},
357 {Intrinsic::smul_with_overflow, MVT::i8, 5},
358 {Intrinsic::umul_with_overflow, MVT::i8, 4},
359 {Intrinsic::smul_with_overflow, MVT::i16, 5},
360 {Intrinsic::umul_with_overflow, MVT::i16, 4},
361 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
362 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
363 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
364 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
365 };
366 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
367 if (MTy.isSimple())
368 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
369 MTy.getSimpleVT()))
370 return Entry->Cost;
371 break;
372 }
373 default:
374 break;
375 }
376 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
377}
378
379/// The function removes redundant reinterpret casts in the presence
380/// of control flow.
381static Optional<Instruction *> processPhiNode(InstCombiner &IC,
382 IntrinsicInst &II) {
383 SmallVector<Instruction *, 32> Worklist;
384 auto RequiredType = II.getType();
385
386 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
387 assert(PN && "Expected Phi Node!");
388
389 // Don't create a new Phi unless we can remove the old one.
390 if (!PN->hasOneUse())
391 return None;
392
393 for (Value *IncValPhi : PN->incoming_values()) {
394 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
395 if (!Reinterpret ||
396 Reinterpret->getIntrinsicID() !=
397 Intrinsic::aarch64_sve_convert_to_svbool ||
398 RequiredType != Reinterpret->getArgOperand(0)->getType())
399 return None;
400 }
401
402 // Create the new Phi
403 LLVMContext &Ctx = PN->getContext();
404 IRBuilder<> Builder(Ctx);
405 Builder.SetInsertPoint(PN);
406 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
407 Worklist.push_back(PN);
408
409 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
410 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
411 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
412 Worklist.push_back(Reinterpret);
413 }
414
415 // Cleanup Phi Node and reinterprets
416 return IC.replaceInstUsesWith(II, NPN);
417}
418
419// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
420// => (binop (pred) (from_svbool _) (from_svbool _))
421//
422// The above transformation eliminates a `to_svbool` in the predicate
423// operand of bitwise operation `binop` by narrowing the vector width of
424// the operation. For example, it would convert a `<vscale x 16 x i1>
425// and` into a `<vscale x 4 x i1> and`. This is profitable because
426// to_svbool must zero the new lanes during widening, whereas
427// from_svbool is free.
428static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
429 IntrinsicInst &II) {
430 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
431 if (!BinOp)
432 return None;
433
434 auto IntrinsicID = BinOp->getIntrinsicID();
435 switch (IntrinsicID) {
436 case Intrinsic::aarch64_sve_and_z:
437 case Intrinsic::aarch64_sve_bic_z:
438 case Intrinsic::aarch64_sve_eor_z:
439 case Intrinsic::aarch64_sve_nand_z:
440 case Intrinsic::aarch64_sve_nor_z:
441 case Intrinsic::aarch64_sve_orn_z:
442 case Intrinsic::aarch64_sve_orr_z:
443 break;
444 default:
445 return None;
446 }
447
448 auto BinOpPred = BinOp->getOperand(0);
449 auto BinOpOp1 = BinOp->getOperand(1);
450 auto BinOpOp2 = BinOp->getOperand(2);
451
452 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
453 if (!PredIntr ||
454 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
455 return None;
456
457 auto PredOp = PredIntr->getOperand(0);
458 auto PredOpTy = cast<VectorType>(PredOp->getType());
459 if (PredOpTy != II.getType())
460 return None;
461
462 IRBuilder<> Builder(II.getContext());
463 Builder.SetInsertPoint(&II);
464
465 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
466 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
467 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
468 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
469 if (BinOpOp1 == BinOpOp2)
470 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
471 else
472 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
473 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
474
475 auto NarrowedBinOp =
476 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
477 return IC.replaceInstUsesWith(II, NarrowedBinOp);
478}
479
480static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
481 IntrinsicInst &II) {
482 // If the reinterpret instruction operand is a PHI Node
483 if (isa<PHINode>(II.getArgOperand(0)))
484 return processPhiNode(IC, II);
485
486 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
487 return BinOpCombine;
488
489 SmallVector<Instruction *, 32> CandidatesForRemoval;
490 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
491
492 const auto *IVTy = cast<VectorType>(II.getType());
493
494 // Walk the chain of conversions.
495 while (Cursor) {
496 // If the type of the cursor has fewer lanes than the final result, zeroing
497 // must take place, which breaks the equivalence chain.
498 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
499 if (CursorVTy->getElementCount().getKnownMinValue() <
500 IVTy->getElementCount().getKnownMinValue())
501 break;
502
503 // If the cursor has the same type as I, it is a viable replacement.
504 if (Cursor->getType() == IVTy)
505 EarliestReplacement = Cursor;
506
507 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
508
509 // If this is not an SVE conversion intrinsic, this is the end of the chain.
510 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
511 Intrinsic::aarch64_sve_convert_to_svbool ||
512 IntrinsicCursor->getIntrinsicID() ==
513 Intrinsic::aarch64_sve_convert_from_svbool))
514 break;
515
516 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
517 Cursor = IntrinsicCursor->getOperand(0);
518 }
519
520 // If no viable replacement in the conversion chain was found, there is
521 // nothing to do.
522 if (!EarliestReplacement)
523 return None;
524
525 return IC.replaceInstUsesWith(II, EarliestReplacement);
526}
527
528static Optional<Instruction *> instCombineSVESel(InstCombiner &IC,
529 IntrinsicInst &II) {
530 IRBuilder<> Builder(&II);
531 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
532 II.getOperand(2));
533 return IC.replaceInstUsesWith(II, Select);
534}
535
536static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
537 IntrinsicInst &II) {
538 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
539 if (!Pg)
540 return None;
541
542 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
543 return None;
544
545 const auto PTruePattern =
546 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
547 if (PTruePattern != AArch64SVEPredPattern::vl1)
548 return None;
549
550 // The intrinsic is inserting into lane zero so use an insert instead.
551 auto *IdxTy = Type::getInt64Ty(II.getContext());
552 auto *Insert = InsertElementInst::Create(
553 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
554 Insert->insertBefore(&II);
555 Insert->takeName(&II);
556
557 return IC.replaceInstUsesWith(II, Insert);
558}
559
560static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
561 IntrinsicInst &II) {
562 // Replace DupX with a regular IR splat.
563 IRBuilder<> Builder(II.getContext());
564 Builder.SetInsertPoint(&II);
565 auto *RetTy = cast<ScalableVectorType>(II.getType());
566 Value *Splat =
567 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
568 Splat->takeName(&II);
569 return IC.replaceInstUsesWith(II, Splat);
570}
571
572static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
573 IntrinsicInst &II) {
574 LLVMContext &Ctx = II.getContext();
575 IRBuilder<> Builder(Ctx);
576 Builder.SetInsertPoint(&II);
577
578 // Check that the predicate is all active
579 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
580 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
581 return None;
582
583 const auto PTruePattern =
584 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
585 if (PTruePattern != AArch64SVEPredPattern::all)
586 return None;
587
588 // Check that we have a compare of zero..
589 auto *SplatValue =
590 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
591 if (!SplatValue || !SplatValue->isZero())
592 return None;
593
594 // ..against a dupq
595 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
596 if (!DupQLane ||
597 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
598 return None;
599
600 // Where the dupq is a lane 0 replicate of a vector insert
601 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
602 return None;
603
604 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
605 if (!VecIns ||
606 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
607 return None;
608
609 // Where the vector insert is a fixed constant vector insert into undef at
610 // index zero
611 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
612 return None;
613
614 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
615 return None;
616
617 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
618 if (!ConstVec)
619 return None;
620
621 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
622 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
623 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
624 return None;
625
626 unsigned NumElts = VecTy->getNumElements();
627 unsigned PredicateBits = 0;
628
629 // Expand intrinsic operands to a 16-bit byte level predicate
630 for (unsigned I = 0; I < NumElts; ++I) {
631 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
632 if (!Arg)
633 return None;
634 if (!Arg->isZero())
635 PredicateBits |= 1 << (I * (16 / NumElts));
636 }
637
638 // If all bits are zero bail early with an empty predicate
639 if (PredicateBits == 0) {
640 auto *PFalse = Constant::getNullValue(II.getType());
641 PFalse->takeName(&II);
642 return IC.replaceInstUsesWith(II, PFalse);
643 }
644
645 // Calculate largest predicate type used (where byte predicate is largest)
646 unsigned Mask = 8;
647 for (unsigned I = 0; I < 16; ++I)
648 if ((PredicateBits & (1 << I)) != 0)
649 Mask |= (I % 8);
650
651 unsigned PredSize = Mask & -Mask;
652 auto *PredType = ScalableVectorType::get(
653 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
654
655 // Ensure all relevant bits are set
656 for (unsigned I = 0; I < 16; I += PredSize)
657 if ((PredicateBits & (1 << I)) == 0)
658 return None;
659
660 auto *PTruePat =
661 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
662 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
663 {PredType}, {PTruePat});
664 auto *ConvertToSVBool = Builder.CreateIntrinsic(
665 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
666 auto *ConvertFromSVBool =
667 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
668 {II.getType()}, {ConvertToSVBool});
669
670 ConvertFromSVBool->takeName(&II);
671 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
672}
673
674static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
675 IntrinsicInst &II) {
676 IRBuilder<> Builder(II.getContext());
677 Builder.SetInsertPoint(&II);
678 Value *Pg = II.getArgOperand(0);
679 Value *Vec = II.getArgOperand(1);
680 auto IntrinsicID = II.getIntrinsicID();
681 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
682
683 // lastX(splat(X)) --> X
684 if (auto *SplatVal = getSplatValue(Vec))
685 return IC.replaceInstUsesWith(II, SplatVal);
686
687 // If x and/or y is a splat value then:
688 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
689 Value *LHS, *RHS;
690 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
691 if (isSplatValue(LHS) || isSplatValue(RHS)) {
692 auto *OldBinOp = cast<BinaryOperator>(Vec);
693 auto OpC = OldBinOp->getOpcode();
694 auto *NewLHS =
695 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
696 auto *NewRHS =
697 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
698 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
699 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
700 return IC.replaceInstUsesWith(II, NewBinOp);
701 }
702 }
703
704 auto *C = dyn_cast<Constant>(Pg);
705 if (IsAfter && C && C->isNullValue()) {
706 // The intrinsic is extracting lane 0 so use an extract instead.
707 auto *IdxTy = Type::getInt64Ty(II.getContext());
708 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
709 Extract->insertBefore(&II);
710 Extract->takeName(&II);
711 return IC.replaceInstUsesWith(II, Extract);
712 }
713
714 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
715 if (!IntrPG)
716 return None;
717
718 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
719 return None;
720
721 const auto PTruePattern =
722 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
723
724 // Can the intrinsic's predicate be converted to a known constant index?
725 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
726 if (!MinNumElts)
727 return None;
728
729 unsigned Idx = MinNumElts - 1;
730 // Increment the index if extracting the element after the last active
731 // predicate element.
732 if (IsAfter)
733 ++Idx;
734
735 // Ignore extracts whose index is larger than the known minimum vector
736 // length. NOTE: This is an artificial constraint where we prefer to
737 // maintain what the user asked for until an alternative is proven faster.
738 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
739 if (Idx >= PgVTy->getMinNumElements())
740 return None;
741
742 // The intrinsic is extracting a fixed lane so use an extract instead.
743 auto *IdxTy = Type::getInt64Ty(II.getContext());
744 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
745 Extract->insertBefore(&II);
746 Extract->takeName(&II);
747 return IC.replaceInstUsesWith(II, Extract);
748}
749
750static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
751 IntrinsicInst &II) {
752 LLVMContext &Ctx = II.getContext();
753 IRBuilder<> Builder(Ctx);
754 Builder.SetInsertPoint(&II);
755 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
756 // can work with RDFFR_PP for ptest elimination.
757 auto *AllPat =
758 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
759 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
760 {II.getType()}, {AllPat});
761 auto *RDFFR =
762 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
763 RDFFR->takeName(&II);
764 return IC.replaceInstUsesWith(II, RDFFR);
765}
766
767static Optional<Instruction *>
768instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
769 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
770
771 if (Pattern == AArch64SVEPredPattern::all) {
772 LLVMContext &Ctx = II.getContext();
773 IRBuilder<> Builder(Ctx);
774 Builder.SetInsertPoint(&II);
775
776 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
777 auto *VScale = Builder.CreateVScale(StepVal);
778 VScale->takeName(&II);
779 return IC.replaceInstUsesWith(II, VScale);
780 }
781
782 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
783
784 return MinNumElts && NumElts >= MinNumElts
785 ? Optional<Instruction *>(IC.replaceInstUsesWith(
786 II, ConstantInt::get(II.getType(), MinNumElts)))
787 : None;
788}
789
790static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
791 IntrinsicInst &II) {
792 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
793 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
794
795 if (Op1 && Op2 &&
796 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
797 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
798 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
799
800 IRBuilder<> Builder(II.getContext());
801 Builder.SetInsertPoint(&II);
802
803 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
804 Type *Tys[] = {Op1->getArgOperand(0)->getType()};
805
806 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
807
808 PTest->takeName(&II);
809 return IC.replaceInstUsesWith(II, PTest);
810 }
811
812 return None;
813}
814
815static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
816 IntrinsicInst &II) {
817 // fold (fadd p a (fmul p b c)) -> (fma p a b c)
818 Value *P = II.getOperand(0);
819 Value *A = II.getOperand(1);
820 auto FMul = II.getOperand(2);
821 Value *B, *C;
822 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
823 m_Specific(P), m_Value(B), m_Value(C))))
824 return None;
825
826 if (!FMul->hasOneUse())
827 return None;
828
829 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
830 // Stop the combine when the flags on the inputs differ in case dropping flags
831 // would lead to us missing out on more beneficial optimizations.
832 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
833 return None;
834 if (!FAddFlags.allowContract())
835 return None;
836
837 IRBuilder<> Builder(II.getContext());
838 Builder.SetInsertPoint(&II);
839 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
840 {II.getType()}, {P, A, B, C}, &II);
841 FMLA->setFastMathFlags(FAddFlags);
842 return IC.replaceInstUsesWith(II, FMLA);
843}
844
845static bool isAllActivePredicate(Value *Pred) {
846 // Look through convert.from.svbool(convert.to.svbool(...) chain.
847 Value *UncastedPred;
848 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
849 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
850 m_Value(UncastedPred)))))
851 // If the predicate has the same or less lanes than the uncasted
852 // predicate then we know the casting has no effect.
853 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
854 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
855 Pred = UncastedPred;
856
857 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
858 m_ConstantInt<AArch64SVEPredPattern::all>()));
859}
860
861static Optional<Instruction *>
862instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
863 IRBuilder<> Builder(II.getContext());
864 Builder.SetInsertPoint(&II);
865
866 Value *Pred = II.getOperand(0);
867 Value *PtrOp = II.getOperand(1);
868 Type *VecTy = II.getType();
869 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
870
871 if (isAllActivePredicate(Pred)) {
872 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
873 Load->copyMetadata(II);
874 return IC.replaceInstUsesWith(II, Load);
875 }
876
877 CallInst *MaskedLoad =
878 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
879 Pred, ConstantAggregateZero::get(VecTy));
880 MaskedLoad->copyMetadata(II);
881 return IC.replaceInstUsesWith(II, MaskedLoad);
882}
883
884static Optional<Instruction *>
885instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
886 IRBuilder<> Builder(II.getContext());
887 Builder.SetInsertPoint(&II);
888
889 Value *VecOp = II.getOperand(0);
890 Value *Pred = II.getOperand(1);
891 Value *PtrOp = II.getOperand(2);
892 Value *VecPtr =
893 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
894
895 if (isAllActivePredicate(Pred)) {
896 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
897 Store->copyMetadata(II);
898 return IC.eraseInstFromFunction(II);
899 }
900
901 CallInst *MaskedStore = Builder.CreateMaskedStore(
902 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
903 MaskedStore->copyMetadata(II);
904 return IC.eraseInstFromFunction(II);
905}
906
907static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
908 switch (Intrinsic) {
909 case Intrinsic::aarch64_sve_fmul:
910 return Instruction::BinaryOps::FMul;
911 case Intrinsic::aarch64_sve_fadd:
912 return Instruction::BinaryOps::FAdd;
913 case Intrinsic::aarch64_sve_fsub:
914 return Instruction::BinaryOps::FSub;
915 default:
916 return Instruction::BinaryOpsEnd;
917 }
918}
919
920static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
921 IntrinsicInst &II) {
922 auto *OpPredicate = II.getOperand(0);
923 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
924 if (BinOpCode == Instruction::BinaryOpsEnd ||
925 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
926 m_ConstantInt<AArch64SVEPredPattern::all>())))
927 return None;
928 IRBuilder<> Builder(II.getContext());
929 Builder.SetInsertPoint(&II);
930 Builder.setFastMathFlags(II.getFastMathFlags());
931 auto BinOp =
932 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
933 return IC.replaceInstUsesWith(II, BinOp);
934}
935
936static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
937 IntrinsicInst &II) {
938 if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
939 return FMLA;
940 return instCombineSVEVectorBinOp(IC, II);
941}
942
943static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
944 IntrinsicInst &II) {
945 auto *OpPredicate = II.getOperand(0);
946 auto *OpMultiplicand = II.getOperand(1);
947 auto *OpMultiplier = II.getOperand(2);
948
949 IRBuilder<> Builder(II.getContext());
950 Builder.SetInsertPoint(&II);
951
952 // Return true if a given instruction is a unit splat value, false otherwise.
953 auto IsUnitSplat = [](auto *I) {
954 auto *SplatValue = getSplatValue(I);
955 if (!SplatValue)
956 return false;
957 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
958 };
959
960 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
961 // with a unit splat value, false otherwise.
962 auto IsUnitDup = [](auto *I) {
963 auto *IntrI = dyn_cast<IntrinsicInst>(I);
964 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
965 return false;
966
967 auto *SplatValue = IntrI->getOperand(2);
968 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
969 };
970
971 if (IsUnitSplat(OpMultiplier)) {
972 // [f]mul pg %n, (dupx 1) => %n
973 OpMultiplicand->takeName(&II);
974 return IC.replaceInstUsesWith(II, OpMultiplicand);
975 } else if (IsUnitDup(OpMultiplier)) {
976 // [f]mul pg %n, (dup pg 1) => %n
977 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
978 auto *DupPg = DupInst->getOperand(1);
979 // TODO: this is naive. The optimization is still valid if DupPg
980 // 'encompasses' OpPredicate, not only if they're the same predicate.
981 if (OpPredicate == DupPg) {
982 OpMultiplicand->takeName(&II);
983 return IC.replaceInstUsesWith(II, OpMultiplicand);
984 }
985 }
986
987 return instCombineSVEVectorBinOp(IC, II);
988}
989
990static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
991 IntrinsicInst &II) {
992 IRBuilder<> Builder(II.getContext());
993 Builder.SetInsertPoint(&II);
994 Value *UnpackArg = II.getArgOperand(0);
995 auto *RetTy = cast<ScalableVectorType>(II.getType());
996 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
997 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
998
999 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1000 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1001 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1002 ScalarArg =
1003 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1004 Value *NewVal =
1005 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1006 NewVal->takeName(&II);
1007 return IC.replaceInstUsesWith(II, NewVal);
1008 }
1009
1010 return None;
1011}
1012static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1013 IntrinsicInst &II) {
1014 auto *OpVal = II.getOperand(0);
1015 auto *OpIndices = II.getOperand(1);
1016 VectorType *VTy = cast<VectorType>(II.getType());
1017
1018 // Check whether OpIndices is a constant splat value < minimal element count
1019 // of result.
1020 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1021 if (!SplatValue ||
1022 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1023 return None;
1024
1025 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1026 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1027 IRBuilder<> Builder(II.getContext());
1028 Builder.SetInsertPoint(&II);
1029 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1030 auto *VectorSplat =
1031 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1032
1033 VectorSplat->takeName(&II);
1034 return IC.replaceInstUsesWith(II, VectorSplat);
1035}
1036
1037static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
1038 IntrinsicInst &II) {
1039 // Try to remove sequences of tuple get/set.
1040 Value *SetTuple, *SetIndex, *SetValue;
1041 auto *GetTuple = II.getArgOperand(0);
1042 auto *GetIndex = II.getArgOperand(1);
1043 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
1044 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
1045 // Make sure that the types of the current intrinsic and SetValue match
1046 // in order to safely remove the sequence.
1047 if (!match(GetTuple,
1048 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
1049 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
1050 SetValue->getType() != II.getType())
1051 return None;
1052 // Case where we get the same index right after setting it.
1053 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
1054 if (GetIndex == SetIndex)
1055 return IC.replaceInstUsesWith(II, SetValue);
1056 // If we are getting a different index than what was set in the tuple_set
1057 // intrinsic, we can just set the input tuple to the one up in the chain.
1058 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
1059 // --> tuple_get(SetTuple, GetIndex)
1060 return IC.replaceOperand(II, 0, SetTuple);
1061}
1062
1063static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1064 IntrinsicInst &II) {
1065 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1066 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1067 Value *A, *B;
1068 if (match(II.getArgOperand(0),
1069 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1070 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1071 m_Specific(A), m_Specific(B))))
1072 return IC.replaceInstUsesWith(
1073 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1074
1075 return None;
1076}
1077
1078static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
1079 IntrinsicInst &II) {
1080 Value *Mask = II.getOperand(0);
1081 Value *BasePtr = II.getOperand(1);
1082 Value *Index = II.getOperand(2);
1083 Type *Ty = II.getType();
1084 Value *PassThru = ConstantAggregateZero::get(Ty);
1085
1086 // Contiguous gather => masked load.
1087 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1088 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1089 Value *IndexBase;
1090 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1091 m_Value(IndexBase), m_SpecificInt(1)))) {
1092 IRBuilder<> Builder(II.getContext());
1093 Builder.SetInsertPoint(&II);
1094
1095 Align Alignment =
1096 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1097
1098 Type *VecPtrTy = PointerType::getUnqual(Ty);
1099 Value *Ptr = Builder.CreateGEP(
1100 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
1101 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1102 CallInst *MaskedLoad =
1103 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1104 MaskedLoad->takeName(&II);
1105 return IC.replaceInstUsesWith(II, MaskedLoad);
1106 }
1107
1108 return None;
1109}
1110
1111static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
1112 IntrinsicInst &II) {
1113 Value *Val = II.getOperand(0);
1114 Value *Mask = II.getOperand(1);
1115 Value *BasePtr = II.getOperand(2);
1116 Value *Index = II.getOperand(3);
1117 Type *Ty = Val->getType();
1118
1119 // Contiguous scatter => masked store.
1120 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1121 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1122 Value *IndexBase;
1123 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1124 m_Value(IndexBase), m_SpecificInt(1)))) {
1125 IRBuilder<> Builder(II.getContext());
1126 Builder.SetInsertPoint(&II);
1127
1128 Align Alignment =
1129 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1130
1131 Value *Ptr = Builder.CreateGEP(
1132 cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
1133 Type *VecPtrTy = PointerType::getUnqual(Ty);
1134 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1135
1136 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1137
1138 return IC.eraseInstFromFunction(II);
1139 }
1140
1141 return None;
1142}
1143
1144static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1145 IntrinsicInst &II) {
1146 IRBuilder<> Builder(II.getContext());
1147 Builder.SetInsertPoint(&II);
1148 Type *Int32Ty = Builder.getInt32Ty();
1149 Value *Pred = II.getOperand(0);
1150 Value *Vec = II.getOperand(1);
1151 Value *DivVec = II.getOperand(2);
1152
1153 Value *SplatValue = getSplatValue(DivVec);
1154 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1155 if (!SplatConstantInt)
1156 return None;
1157 APInt Divisor = SplatConstantInt->getValue();
1158
1159 if (Divisor.isPowerOf2()) {
1160 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1161 auto ASRD = Builder.CreateIntrinsic(
1162 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1163 return IC.replaceInstUsesWith(II, ASRD);
1164 }
1165 if (Divisor.isNegatedPowerOf2()) {
1166 Divisor.negate();
1167 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1168 auto ASRD = Builder.CreateIntrinsic(
1169 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1170 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1171 {ASRD->getType()}, {ASRD, Pred, ASRD});
1172 return IC.replaceInstUsesWith(II, NEG);
1173 }
1174
1175 return None;
1176}
1177
1178Optional<Instruction *>
1179AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1180 IntrinsicInst &II) const {
1181 Intrinsic::ID IID = II.getIntrinsicID();
1182 switch (IID) {
1183 default:
1184 break;
1185 case Intrinsic::aarch64_sve_convert_from_svbool:
1186 return instCombineConvertFromSVBool(IC, II);
1187 case Intrinsic::aarch64_sve_dup:
1188 return instCombineSVEDup(IC, II);
1189 case Intrinsic::aarch64_sve_dup_x:
1190 return instCombineSVEDupX(IC, II);
1191 case Intrinsic::aarch64_sve_cmpne:
1192 case Intrinsic::aarch64_sve_cmpne_wide:
1193 return instCombineSVECmpNE(IC, II);
1194 case Intrinsic::aarch64_sve_rdffr:
1195 return instCombineRDFFR(IC, II);
1196 case Intrinsic::aarch64_sve_lasta:
1197 case Intrinsic::aarch64_sve_lastb:
1198 return instCombineSVELast(IC, II);
1199 case Intrinsic::aarch64_sve_cntd:
1200 return instCombineSVECntElts(IC, II, 2);
1201 case Intrinsic::aarch64_sve_cntw:
1202 return instCombineSVECntElts(IC, II, 4);
1203 case Intrinsic::aarch64_sve_cnth:
1204 return instCombineSVECntElts(IC, II, 8);
1205 case Intrinsic::aarch64_sve_cntb:
1206 return instCombineSVECntElts(IC, II, 16);
1207 case Intrinsic::aarch64_sve_ptest_any:
1208 case Intrinsic::aarch64_sve_ptest_first:
1209 case Intrinsic::aarch64_sve_ptest_last:
1210 return instCombineSVEPTest(IC, II);
1211 case Intrinsic::aarch64_sve_mul:
1212 case Intrinsic::aarch64_sve_fmul:
1213 return instCombineSVEVectorMul(IC, II);
1214 case Intrinsic::aarch64_sve_fadd:
1215 return instCombineSVEVectorFAdd(IC, II);
1216 case Intrinsic::aarch64_sve_fsub:
1217 return instCombineSVEVectorBinOp(IC, II);
1218 case Intrinsic::aarch64_sve_tbl:
1219 return instCombineSVETBL(IC, II);
1220 case Intrinsic::aarch64_sve_uunpkhi:
1221 case Intrinsic::aarch64_sve_uunpklo:
1222 case Intrinsic::aarch64_sve_sunpkhi:
1223 case Intrinsic::aarch64_sve_sunpklo:
1224 return instCombineSVEUnpack(IC, II);
1225 case Intrinsic::aarch64_sve_tuple_get:
1226 return instCombineSVETupleGet(IC, II);
1227 case Intrinsic::aarch64_sve_zip1:
1228 case Intrinsic::aarch64_sve_zip2:
1229 return instCombineSVEZip(IC, II);
1230 case Intrinsic::aarch64_sve_ld1_gather_index:
1231 return instCombineLD1GatherIndex(IC, II);
1232 case Intrinsic::aarch64_sve_st1_scatter_index:
1233 return instCombineST1ScatterIndex(IC, II);
1234 case Intrinsic::aarch64_sve_ld1:
1235 return instCombineSVELD1(IC, II, DL);
1236 case Intrinsic::aarch64_sve_st1:
1237 return instCombineSVEST1(IC, II, DL);
1238 case Intrinsic::aarch64_sve_sdiv:
1239 return instCombineSVESDIV(IC, II);
1240 case Intrinsic::aarch64_sve_sel:
1241 return instCombineSVESel(IC, II);
1242 }
1243
1244 return None;
1245}
1246
1247Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1248 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1249 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1250 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1251 SimplifyAndSetOp) const {
1252 switch (II.getIntrinsicID()) {
1253 default:
1254 break;
1255 case Intrinsic::aarch64_neon_fcvtxn:
1256 case Intrinsic::aarch64_neon_rshrn:
1257 case Intrinsic::aarch64_neon_sqrshrn:
1258 case Intrinsic::aarch64_neon_sqrshrun:
1259 case Intrinsic::aarch64_neon_sqshrn:
1260 case Intrinsic::aarch64_neon_sqshrun:
1261 case Intrinsic::aarch64_neon_sqxtn:
1262 case Intrinsic::aarch64_neon_sqxtun:
1263 case Intrinsic::aarch64_neon_uqrshrn:
1264 case Intrinsic::aarch64_neon_uqshrn:
1265 case Intrinsic::aarch64_neon_uqxtn:
1266 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1267 break;
1268 }
1269
1270 return None;
1271}
1272
1273bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1274 ArrayRef<const Value *> Args) {
1275
1276 // A helper that returns a vector type from the given type. The number of
1277 // elements in type Ty determine the vector width.
1278 auto toVectorTy = [&](Type *ArgTy) {
1279 return VectorType::get(ArgTy->getScalarType(),
1280 cast<VectorType>(DstTy)->getElementCount());
1281 };
1282
1283 // Exit early if DstTy is not a vector type whose elements are at least
1284 // 16-bits wide.
1285 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
1286 return false;
1287
1288 // Determine if the operation has a widening variant. We consider both the
1289 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1290 // instructions.
1291 //
1292 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1293 // verify that their extending operands are eliminated during code
1294 // generation.
1295 switch (Opcode) {
1296 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1297 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1298 case Instruction::Mul: // SMULL(2), UMULL(2)
1299 break;
1300 default:
1301 return false;
1302 }
1303
1304 // To be a widening instruction (either the "wide" or "long" versions), the
1305 // second operand must be a sign- or zero extend.
1306 if (Args.size() != 2 ||
1307 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1308 return false;
1309 auto *Extend = cast<CastInst>(Args[1]);
1310 auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1311
1312 // A mul only has a mull version (not like addw). Both operands need to be
1313 // extending and the same type.
1314 if (Opcode == Instruction::Mul &&
1315 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1316 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1317 return false;
1318
1319 // Legalize the destination type and ensure it can be used in a widening
1320 // operation.
1321 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
1322 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1323 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1324 return false;
1325
1326 // Legalize the source type and ensure it can be used in a widening
1327 // operation.
1328 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1329 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
1330 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1331 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1332 return false;
1333
1334 // Get the total number of vector elements in the legalized types.
1335 InstructionCost NumDstEls =
1336 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1337 InstructionCost NumSrcEls =
1338 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1339
1340 // Return true if the legalized types have the same number of vector elements
1341 // and the destination element type size is twice that of the source type.
1342 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1343}
1344
1345InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1346 Type *Src,
1347 TTI::CastContextHint CCH,
1348 TTI::TargetCostKind CostKind,
1349 const Instruction *I) {
1350 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1351 assert(ISD && "Invalid opcode");
1352
1353 // If the cast is observable, and it is used by a widening instruction (e.g.,
1354 // uaddl, saddw, etc.), it may be free.
1355 if (I && I->hasOneUser()) {
1356 auto *SingleUser = cast<Instruction>(*I->user_begin());
1357 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1358 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1359 // If the cast is the second operand, it is free. We will generate either
1360 // a "wide" or "long" version of the widening instruction.
1361 if (I == SingleUser->getOperand(1))
1362 return 0;
1363 // If the cast is not the second operand, it will be free if it looks the
1364 // same as the second operand. In this case, we will generate a "long"
1365 // version of the widening instruction.
1366 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1367 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1368 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1369 return 0;
1370 }
1371 }
1372
1373 // TODO: Allow non-throughput costs that aren't binary.
1374 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1375 if (CostKind != TTI::TCK_RecipThroughput)
1376 return Cost == 0 ? 0 : 1;
1377 return Cost;
1378 };
1379
1380 EVT SrcTy = TLI->getValueType(DL, Src);
1381 EVT DstTy = TLI->getValueType(DL, Dst);
1382
1383 if (!SrcTy.isSimple() || !DstTy.isSimple())
1384 return AdjustCost(
1385 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1386
1387 static const TypeConversionCostTblEntry
1388 ConversionTbl[] = {
1389 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1390 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
1391 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1392 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
1393
1394 // Truncations on nxvmiN
1395 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1396 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1397 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1398 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1399 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1400 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1401 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1402 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1403 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1404 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1405 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1406 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1407 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1408 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1409 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1410 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1411
1412 // The number of shll instructions for the extension.
1413 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1414 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1415 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1416 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1417 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1418 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1419 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1420 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1421 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1422 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1423 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1424 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1425 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1426 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1427 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1428 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1429
1430 // LowerVectorINT_TO_FP:
1431 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1432 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1433 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1434 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1435 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1436 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1437
1438 // Complex: to v2f32
1439 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1440 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1441 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1442 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1443 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1444 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1445
1446 // Complex: to v4f32
1447 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1448 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1449 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1450 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1451
1452 // Complex: to v8f32
1453 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1454 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1455 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1456 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1457
1458 // Complex: to v16f32
1459 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1460 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1461
1462 // Complex: to v2f64
1463 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1464 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1465 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1466 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1467 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1468 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1469
1470
1471 // LowerVectorFP_TO_INT
1472 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1473 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1474 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1475 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1476 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1477 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1478
1479 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1480 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1481 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1482 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1483 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1484 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1485 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1486
1487 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1488 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1489 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1490 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1491 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1492
1493 // Complex, from nxv2f32.
1494 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1495 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1496 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1497 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1498 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1499 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1500 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1501 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1502
1503 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1504 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1505 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1506 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1507 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1508 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1509 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1510
1511 // Complex, from nxv2f64.
1512 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1513 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1514 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1515 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1516 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1517 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1518 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1519 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1520
1521 // Complex, from nxv4f32.
1522 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1523 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1524 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1525 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1526 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1527 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1528 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1529 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1530
1531 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1532 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1533 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1534 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1535 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1536
1537 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1538 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1539 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1540 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1541 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1542 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1543 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1544
1545 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1546 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1547 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1548 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1549 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1550
1551 // Complex, from nxv8f16.
1552 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1553 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1554 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1555 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1556 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1557 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1558 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1559 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1560
1561 // Complex, from nxv4f16.
1562 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1563 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1564 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1565 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1566 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1567 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1568 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1569 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1570
1571 // Complex, from nxv2f16.
1572 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1573 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1574 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1575 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1576 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1577 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1578 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1579 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1580
1581 // Truncate from nxvmf32 to nxvmf16.
1582 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
1583 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
1584 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
1585
1586 // Truncate from nxvmf64 to nxvmf16.
1587 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
1588 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
1589 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
1590
1591 // Truncate from nxvmf64 to nxvmf32.
1592 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
1593 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
1594 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
1595
1596 // Extend from nxvmf16 to nxvmf32.
1597 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
1598 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
1599 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
1600
1601 // Extend from nxvmf16 to nxvmf64.
1602 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
1603 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
1604 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
1605
1606 // Extend from nxvmf32 to nxvmf64.
1607 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
1608 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
1609 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
1610
1611 // Bitcasts from float to integer
1612 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
1613 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
1614 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
1615
1616 // Bitcasts from integer to float
1617 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
1618 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
1619 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
1620 };
1621
1622 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1623 DstTy.getSimpleVT(),
1624 SrcTy.getSimpleVT()))
1625 return AdjustCost(Entry->Cost);
1626
1627 static const TypeConversionCostTblEntry FP16Tbl[] = {
1628 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
1629 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
1630 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
1631 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
1632 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
1633 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
1634 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
1635 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
1636 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
1637 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
1638 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
1639 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
1640 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
1641 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
1642 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
1643 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
1644 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
1645 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
1646 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
1647 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
1648 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
1649 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
1650 };
1651
1652 if (ST->hasFullFP16())
1653 if (const auto *Entry = ConvertCostTableLookup(
1654 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
1655 return AdjustCost(Entry->Cost);
1656
1657 return AdjustCost(
1658 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1659}
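
The cast costing above is table driven: ConvertCostTableLookup returns the first row whose (opcode, destination, source) triple matches, and anything not covered falls back to the base implementation. A minimal sketch of that lookup pattern (not the LLVM implementation; the three rows are copied from ConversionTbl purely for illustration):

#include <cstdio>
#include <string>

struct ConvEntry { const char *Op, *Dst, *Src; int Cost; };

static const ConvEntry Tbl[] = {
    {"SIGN_EXTEND", "v8i32", "v8i8", 3},  // three shll instructions
    {"TRUNCATE", "v4i16", "v4i32", 1},
    {"FP_ROUND", "nxv2f32", "nxv2f64", 1},
};

// First matching row wins; a negative default stands in for "ask the base
// class" in the real code.
static int lookupCost(const std::string &Op, const std::string &Dst,
                      const std::string &Src, int Default) {
  for (const ConvEntry &E : Tbl)
    if (Op == E.Op && Dst == E.Dst && Src == E.Src)
      return E.Cost;
  return Default;
}

int main() {
  std::printf("sext v8i8 -> v8i32 costs %d\n",
              lookupCost("SIGN_EXTEND", "v8i32", "v8i8", -1));
  return 0;
}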
1660
1661InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1662 Type *Dst,
1663 VectorType *VecTy,
1664 unsigned Index) {
1665
1666 // Make sure we were given a valid extend opcode.
1667 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1668        "Invalid opcode");
1669
1670 // We are extending an element we extract from a vector, so the source type
1671 // of the extend is the element type of the vector.
1672 auto *Src = VecTy->getElementType();
1673
1674 // Sign- and zero-extends are for integer types only.
1675 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1676
1677 // Get the cost for the extract. We compute the cost (if any) for the extend
1678 // below.
1679 InstructionCost Cost =
1680 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1681
1682 // Legalize the types.
1683 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1684 auto DstVT = TLI->getValueType(DL, Dst);
1685 auto SrcVT = TLI->getValueType(DL, Src);
1686 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1687
1688 // If the resulting type is still a vector and the destination type is legal,
1689 // we may get the extension for free. If not, get the default cost for the
1690 // extend.
1691 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1692 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1693 CostKind);
1694
1695 // The destination type should be larger than the element type. If not, get
1696 // the default cost for the extend.
1697 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1698 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1699 CostKind);
1700
1701 switch (Opcode) {
1702 default:
1703 llvm_unreachable("Opcode should be either SExt or ZExt")::llvm::llvm_unreachable_internal("Opcode should be either SExt or ZExt"
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 1703
)
;
1704
1705 // For sign-extends, we only need a smov, which performs the extension
1706 // automatically.
1707 case Instruction::SExt:
1708 return Cost;
1709
1710 // For zero-extends, the extend is performed automatically by a umov unless
1711 // the destination type is i64 and the element type is i8 or i16.
1712 case Instruction::ZExt:
1713 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1714 return Cost;
1715 }
1716
1717 // If we are unable to perform the extend for free, get the default cost.
1718 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1719 CostKind);
1720}
1721
1722InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1723 TTI::TargetCostKind CostKind,
1724 const Instruction *I) {
1725 if (CostKind != TTI::TCK_RecipThroughput)
1726 return Opcode == Instruction::PHI ? 0 : 1;
1727 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1728 // Branches are assumed to be predicted.
1729 return 0;
1730}
1731
1732InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1733 unsigned Index) {
1734 assert(Val->isVectorTy() && "This must be a vector type");
1735
1736 if (Index != -1U) {
1737 // Legalize the type.
1738 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1739
1740 // This type is legalized to a scalar type.
1741 if (!LT.second.isVector())
1742 return 0;
1743
1744 // The type may be split. For fixed-width vectors we can normalize the
1745 // index to the new type.
1746 if (LT.second.isFixedLengthVector()) {
1747 unsigned Width = LT.second.getVectorNumElements();
1748 Index = Index % Width;
1749 }
1750
1751 // The element at index zero is already inside the vector.
1752 if (Index == 0)
1753 return 0;
1754 }
1755
1756 // All other insert/extracts cost this much.
1757 return ST->getVectorInsertExtractBaseCost();
1758}
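
The insert/extract costing above boils down to: normalize the index into the legalized type, treat lane 0 as free, and charge the subtarget's base cost for every other lane. A minimal sketch, assuming a base cost of 2 (the real value comes from ST->getVectorInsertExtractBaseCost() and differs between CPUs):

#include <cassert>

static unsigned vectorInstrCost(unsigned Index, unsigned LegalWidth,
                                unsigned BaseCost = 2) {
  Index %= LegalWidth;              // normalize into the (possibly split) legal type
  return Index == 0 ? 0 : BaseCost; // lane 0 is already in place
}

int main() {
  assert(vectorInstrCost(0, 4) == 0); // extracting lane 0 of <4 x i32> is free
  assert(vectorInstrCost(2, 4) == 2); // other lanes pay the base cost
  assert(vectorInstrCost(4, 4) == 0); // lane 4 of a split <8 x i32> maps to lane 0
  return 0;
}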
1759
1760InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1761 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1762 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1763 TTI::OperandValueProperties Opd1PropInfo,
1764 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1765 const Instruction *CxtI) {
1766 // TODO: Handle more cost kinds.
1767 if (CostKind != TTI::TCK_RecipThroughput)
1768 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1769 Opd2Info, Opd1PropInfo,
1770 Opd2PropInfo, Args, CxtI);
1771
1772 // Legalize the type.
1773 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1774 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1775
1776 switch (ISD) {
1777 default:
1778 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1779 Opd2Info, Opd1PropInfo, Opd2PropInfo);
1780 case ISD::SDIV:
1781 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1782 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1783 // On AArch64, scalar signed division by a power-of-two constant is
1784 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1785 // The OperandValue properties may not be the same as those of the
1786 // previous operation; conservatively assume OP_None.
1787 InstructionCost Cost = getArithmeticInstrCost(
1788 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1789 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1790 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
1791 Opd2Info, TargetTransformInfo::OP_None,
1792 TargetTransformInfo::OP_None);
1793 Cost += getArithmeticInstrCost(
1794 Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
1795 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1796 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
1797 Opd2Info, TargetTransformInfo::OP_None,
1798 TargetTransformInfo::OP_None);
1799 return Cost;
1800 }
1801 LLVM_FALLTHROUGH;
1802 case ISD::UDIV: {
1803 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1804 auto VT = TLI->getValueType(DL, Ty);
1805 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1806 // Vector signed division by constant is expanded to the
1807 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1808 // to MULHS + SUB + SRL + ADD + SRL.
1809 InstructionCost MulCost = getArithmeticInstrCost(
1810 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1811 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1812 InstructionCost AddCost = getArithmeticInstrCost(
1813 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1814 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1815 InstructionCost ShrCost = getArithmeticInstrCost(
1816 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1817 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1818 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1819 }
1820 }
1821
1822 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1823 Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
1824 if (Ty->isVectorTy()) {
1825 // On AArch64, vector divisions are not supported natively and are
1826 // expanded into a scalar division for each pair of elements.
1827 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1828 Opd1Info, Opd2Info, Opd1PropInfo,
1829 Opd2PropInfo);
1830 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1831 Opd1Info, Opd2Info, Opd1PropInfo,
1832 Opd2PropInfo);
1833 // TODO: if one of the arguments is scalar, then it's not necessary to
1834 // double the cost of handling the vector elements.
1835 Cost += Cost;
1836 }
1837 return Cost;
1838 }
1839 case ISD::MUL:
1840 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1841 // as elements are extracted from the vectors and the muls scalarized.
1842 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1843 // cost for an i64 vector directly here, which is:
1844 // - four 2-cost i64 extracts,
1845 // - two 2-cost i64 inserts, and
1846 // - two 1-cost muls.
1847 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
1848 // LT.first = 2 the cost is 28. If both operands are extensions it will not
1849 // need to scalarize so the cost can be cheaper (smull or umull).
1850 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
1851 return LT.first;
1852 return LT.first * 14;
1853 case ISD::ADD:
1854 case ISD::XOR:
1855 case ISD::OR:
1856 case ISD::AND:
1857 case ISD::SRL:
1858 case ISD::SRA:
1859 case ISD::SHL:
1860 // These nodes are marked as 'custom' for combining purposes only.
1861 // We know that they are legal. See LowerAdd in ISelLowering.
1862 return LT.first;
1863
1864 case ISD::FADD:
1865 case ISD::FSUB:
1866 case ISD::FMUL:
1867 case ISD::FDIV:
1868 case ISD::FNEG:
1869 // These nodes are marked as 'custom' just to lower them to SVE.
1870 // We know said lowering will incur no additional cost.
1871 if (!Ty->getScalarType()->isFP128Ty())
1872 return 2 * LT.first;
1873
1874 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1875 Opd2Info, Opd1PropInfo, Opd2PropInfo);
1876 }
1877}
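
The v2i64 multiply estimate in the ISD::MUL case can be checked by hand. A short worked example of the arithmetic described in the comment (plain C++, no LLVM dependencies):

#include <cassert>

int main() {
  const int ExtractCost = 2, InsertCost = 2, ScalarMulCost = 1;
  const int PerV2I64 = 4 * ExtractCost + 2 * InsertCost + 2 * ScalarMulCost;
  assert(PerV2I64 == 14);                   // one legalized <2 x i64> part
  const int LTFirstForV4I64 = 2;            // <4 x i64> splits into two parts
  assert(LTFirstForV4I64 * PerV2I64 == 28); // matches the comment above
  return 0;
}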
1878
1879InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1880 ScalarEvolution *SE,
1881 const SCEV *Ptr) {
1882 // Address computations in vectorized code with non-consecutive addresses will
1883 // likely result in more instructions compared to scalar code where the
1884 // computation can more often be merged into the index mode. The resulting
1885 // extra micro-ops can significantly decrease throughput.
1886 unsigned NumVectorInstToHideOverhead = 10;
1887 int MaxMergeDistance = 64;
1888
1889 if (Ty->isVectorTy() && SE &&
1890 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1891 return NumVectorInstToHideOverhead;
1892
1893 // In many cases the address computation is not merged into the instruction
1894 // addressing mode.
1895 return 1;
1896}
1897
1898InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1899 Type *CondTy,
1900 CmpInst::Predicate VecPred,
1901 TTI::TargetCostKind CostKind,
1902 const Instruction *I) {
1903 // TODO: Handle other cost kinds.
1904 if (CostKind != TTI::TCK_RecipThroughput)
1905 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1906 I);
1907
1908 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1909 // We don't lower some vector selects that are wider than the register
1910 // width very well.
1911 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1912 // We would need this many instructions to hide the scalarization happening.
1913 const int AmortizationCost = 20;
1914
1915 // If VecPred is not set, check if we can get a predicate from the context
1916 // instruction, if its type matches the requested ValTy.
1917 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1918 CmpInst::Predicate CurrentPred;
1919 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1920 m_Value())))
1921 VecPred = CurrentPred;
1922 }
1923 // Check if we have a compare/select chain that can be lowered using
1924 // a (F)CMxx & BFI pair.
1925 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
1926 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
1927 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
1928 VecPred == CmpInst::FCMP_UNE) {
1929 static const auto ValidMinMaxTys = {
1930 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1931 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
1932 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
1933
1934 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1935 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
1936 (ST->hasFullFP16() &&
1937 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
1938 return LT.first;
1939 }
1940
1941 static const TypeConversionCostTblEntry
1942 VectorSelectTbl[] = {
1943 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1944 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1945 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1946 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1947 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1948 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1949 };
1950
1951 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1952 EVT SelValTy = TLI->getValueType(DL, ValTy);
1953 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1954 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1955 SelCondTy.getSimpleVT(),
1956 SelValTy.getSimpleVT()))
1957 return Entry->Cost;
1958 }
1959 }
1960 // The base case handles scalable vectors fine for now, since it treats the
1961 // cost as 1 * legalization cost.
1962 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1963}
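
For the scalarized selects in VectorSelectTbl, each entry is simply the lane count multiplied by the AmortizationCost of 20 used above. A quick worked check of those products:

#include <cassert>

int main() {
  const int AmortizationCost = 20; // instructions needed to hide the scalarization
  assert(4 * AmortizationCost == 80);   // ISD::SELECT v4i1 / v4i64
  assert(8 * AmortizationCost == 160);  // ISD::SELECT v8i1 / v8i64
  assert(16 * AmortizationCost == 320); // ISD::SELECT v16i1 / v16i64
  return 0;
}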
1964
1965AArch64TTIImpl::TTI::MemCmpExpansionOptions
1966AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1967 TTI::MemCmpExpansionOptions Options;
1968 if (ST->requiresStrictAlign()) {
1969 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1970 // a bunch of instructions when strict align is enabled.
1971 return Options;
1972 }
1973 Options.AllowOverlappingLoads = true;
1974 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1975 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1976 // TODO: Though vector loads usually perform well on AArch64, in some targets
1977 // they may wake up the FP unit, which raises the power consumption. Perhaps
1978 // they could be used with no holds barred (-O3).
1979 Options.LoadSizes = {8, 4, 2, 1};
1980 return Options;
1981}
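
One way to see why AllowOverlappingLoads matters: once the block being compared is at least as large as the biggest LoadSizes entry, overlapping loads let the expansion use ceil(N / 8) eight-byte loads per buffer rather than chaining 8/4/2/1-byte loads for the tail. This is only a counting sketch of that idea; the actual lowering is done later by the MemCmpExpansion pass, not by this hook:

#include <cassert>

static unsigned loadsWithOverlap(unsigned N, unsigned MaxLoad) {
  assert(N >= MaxLoad && "smaller tails would use the smaller LoadSizes entries");
  return (N + MaxLoad - 1) / MaxLoad; // the final load may overlap the previous one
}

int main() {
  assert(loadsWithOverlap(15, 8) == 2); // loads at offsets 0 and 7
  assert(loadsWithOverlap(16, 8) == 2);
  assert(loadsWithOverlap(17, 8) == 3);
  return 0;
}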
1982
1983InstructionCost
1984AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1985 Align Alignment, unsigned AddressSpace,
1986 TTI::TargetCostKind CostKind) {
1987 if (useNeonVector(Src))
1988 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1989 CostKind);
1990 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1991 if (!LT.first.isValid())
1992 return InstructionCost::getInvalid();
1993
1994 // The code-generator is currently not able to handle scalable vectors
1995 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1996 // it. This change will be removed when code-generation for these types is
1997 // sufficiently reliable.
1998 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1999 return InstructionCost::getInvalid();
2000
2001 return LT.first * 2;
2002}
2003
2004static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2005 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2006}
2007
2008InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2009 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2010 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2011 if (useNeonVector(DataTy))
2012 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2013 Alignment, CostKind, I);
2014 auto *VT = cast<VectorType>(DataTy);
2015 auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
2016 if (!LT.first.isValid())
2017 return InstructionCost::getInvalid();
2018
2019 // The code-generator is currently not able to handle scalable vectors
2020 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2021 // it. This change will be removed when code-generation for these types is
2022 // sufficiently reliable.
2023 if (cast<VectorType>(DataTy)->getElementCount() ==
2024 ElementCount::getScalable(1))
2025 return InstructionCost::getInvalid();
2026
2027 ElementCount LegalVF = LT.second.getVectorElementCount();
2028 InstructionCost MemOpCost =
2029 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
2030 // Add on an overhead cost for using gathers/scatters.
2031 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2032 // point we may want a per-CPU overhead.
2033 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2034 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2035}
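
The gather/scatter cost above has the shape LT.first * (per-element memory cost * overhead factor) * number-of-elements. The concrete numbers below are assumptions chosen only to make that shape visible; in the real code the overhead comes from the SVEGatherOverhead/SVEScatterOverhead options and the per-element cost from getMemoryOpCost:

#include <cassert>

int main() {
  const int LTFirst = 1;         // e.g. a type that is already legal
  const int ElementMemCost = 1;  // assumed scalar load cost
  const int GatherOverhead = 10; // assumed per-element gather overhead factor
  const int NumElements = 4;     // assumed element count of the legal type
  assert(LTFirst * (ElementMemCost * GatherOverhead) * NumElements == 40);
  return 0;
}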
2036
2037bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2038 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2039}
2040
2041InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2042 MaybeAlign Alignment,
2043 unsigned AddressSpace,
2044 TTI::TargetCostKind CostKind,
2045 const Instruction *I) {
2046 EVT VT = TLI->getValueType(DL, Ty, true);
2047 // Type legalization can't handle structs
2048 if (VT == MVT::Other)
2049 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2050 CostKind);
2051
2052 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
2053 if (!LT.first.isValid())
2054 return InstructionCost::getInvalid();
2055
2056 // The code-generator is currently not able to handle scalable vectors
2057 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2058 // it. This change will be removed when code-generation for these types is
2059 // sufficiently reliable.
2060 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2061 if (VTy->getElementCount() == ElementCount::getScalable(1))
2062 return InstructionCost::getInvalid();
2063
2064 // TODO: consider latency as well for TCK_SizeAndLatency.
2065 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2066 return LT.first;
2067
2068 if (CostKind != TTI::TCK_RecipThroughput)
2069 return 1;
2070
2071 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2072 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2073 // Unaligned stores are extremely inefficient. We don't split all
2074 // unaligned 128-bit stores because of the negative impact that has been
2075 // shown in practice on inlined block copy code.
2076 // We make such stores expensive so that we will only vectorize if there
2077 // are 6 other instructions getting vectorized.
2078 const int AmortizationCost = 6;
2079
2080 return LT.first * 2 * AmortizationCost;
2081 }
2082
2083 // Check truncating stores and extending loads.
2084 if (useNeonVector(Ty) &&
2085 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2086 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2087 if (VT == MVT::v4i8)
2088 return 2;
2089 // Otherwise we need to scalarize.
2090 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2091 }
2092
2093 return LT.first;
2094}
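
Two of the special cases above, worked through with the constants from the comments (the values are model parameters, not measured latencies):

#include <cassert>

int main() {
  // Slow misaligned 128-bit store on affected subtargets:
  // LT.first * 2 * AmortizationCost for a single legal part.
  const int AmortizationCost = 6;
  assert(1 * 2 * AmortizationCost == 12);

  // Truncating store / extending load of an unsupported NEON layout is
  // scalarized: two operations per element, e.g. eight elements -> 16.
  assert(8 * 2 == 16);
  return 0;
}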
2095
2096InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2097 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2098 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2099 bool UseMaskForCond, bool UseMaskForGaps) {
2100 assert(Factor >= 2 && "Invalid interleave factor");
2101 auto *VecVTy = cast<FixedVectorType>(VecTy);
2102
2103 if (!UseMaskForCond && !UseMaskForGaps &&
2104 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2105 unsigned NumElts = VecVTy->getNumElements();
2106 auto *SubVecTy =
2107 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2108
2109 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2110 // Accesses having vector types that are a multiple of 128 bits can be
2111 // matched to more than one ldN/stN instruction.
2112 bool UseScalable;
2113 if (NumElts % Factor == 0 &&
2114 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2115 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2116 }
2117
2118 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2119 Alignment, AddressSpace, CostKind,
2120 UseMaskForCond, UseMaskForGaps);
2121}
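
A worked example of the interleaved-access cost above, for an assumed ld2 (Factor = 2): a <8 x i32> group has a <4 x i32> per-lane subvector, which is a single legal 128-bit access, so the cost is Factor * 1 = 2; a <16 x i32> group needs two 128-bit accesses per lane, so the cost is 4. A small sketch restricted to 128-bit multiples:

#include <cassert>

static unsigned interleavedCost(unsigned NumElts, unsigned EltBits, unsigned Factor) {
  assert(NumElts % Factor == 0);
  unsigned SubVecBits = (NumElts / Factor) * EltBits;
  assert(SubVecBits % 128 == 0 && "sketch only handles 128-bit multiples");
  return Factor * (SubVecBits / 128); // Factor * number of ldN/stN accesses
}

int main() {
  assert(interleavedCost(8, 32, 2) == 2);
  assert(interleavedCost(16, 32, 2) == 4);
  return 0;
}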
2122
2123InstructionCost
2124AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2125 InstructionCost Cost = 0;
2126 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2127 for (auto *I : Tys) {
2128 if (!I->isVectorTy())
2129 continue;
2130 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2131 128)
2132 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2133 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2134 }
2135 return Cost;
2136}
2137
2138unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2139 return ST->getMaxInterleaveFactor();
2140}
2141
2142// For Falkor, we want to avoid having too many strided loads in a loop since
2143// that can exhaust the HW prefetcher resources. We adjust the unroller
2144// MaxCount preference below to attempt to ensure unrolling doesn't create too
2145// many strided loads.
2146static void
2147getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2148 TargetTransformInfo::UnrollingPreferences &UP) {
2149 enum { MaxStridedLoads = 7 };
2150 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2151 int StridedLoads = 0;
2152 // FIXME? We could make this more precise by looking at the CFG and
2153 // e.g. not counting loads in each side of an if-then-else diamond.
2154 for (const auto BB : L->blocks()) {
2155 for (auto &I : *BB) {
2156 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2157 if (!LMemI)
2158 continue;
2159
2160 Value *PtrValue = LMemI->getPointerOperand();
2161 if (L->isLoopInvariant(PtrValue))
2162 continue;
2163
2164 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2165 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2166 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2167 continue;
2168
2169 // FIXME? We could take pairing of unrolled load copies into account
2170 // by looking at the AddRec, but we would probably have to limit this
2171 // to loops with no stores or other memory optimization barriers.
2172 ++StridedLoads;
2173 // We've seen enough strided loads that seeing more won't make a
2174 // difference.
2175 if (StridedLoads > MaxStridedLoads / 2)
2176 return StridedLoads;
2177 }
2178 }
2179 return StridedLoads;
2180 };
2181
2182 int StridedLoads = countStridedLoads(L, SE);
2183 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
 7. Assuming 'DebugFlag' is false
 8. Loop condition is false. Exiting loop
2184                   << " strided loads\n");
2185 // Pick the largest power of 2 unroll count that won't result in too many
2186 // strided loads.
2187 if (StridedLoads) {
 9. Assuming 'StridedLoads' is not equal to 0
 10. Taking true branch
2188 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
 11. Calling 'Log2_32'
 13. Returning from 'Log2_32'
 14. The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
2189 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2190                   << UP.MaxCount << '\n');
2191 }
2192}
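
The analyzer's complaint at line 2188 is that if MaxStridedLoads / StridedLoads were ever 0, Log2_32 would return unsigned(-1) and shifting an int by that amount is undefined. Below is a minimal standalone sketch of the pattern together with one possible defensive rewrite; log2Floor stands in for llvm::Log2_32, and the clamp is an illustration of how the shift amount can be kept in range, not the upstream fix:

#include <algorithm>
#include <cassert>
#include <climits>

static unsigned log2Floor(unsigned V) {
  // Like llvm::Log2_32: floor(log2(V)), and UINT_MAX for an input of 0.
  return V == 0 ? UINT_MAX : 31u - static_cast<unsigned>(__builtin_clz(V));
}

static unsigned pickMaxCount(unsigned MaxStridedLoads, unsigned StridedLoads) {
  assert(StridedLoads != 0);
  // Clamp the quotient so the shift amount always stays in [0, 31].
  unsigned Quotient = std::max(1u, MaxStridedLoads / StridedLoads);
  return 1u << log2Floor(Quotient);
}

int main() {
  assert(pickMaxCount(7, 3) == 2);  // floor(log2(7/3)) == 1 -> MaxCount 2
  assert(pickMaxCount(7, 4) == 1);  // quotient 1 -> no extra unrolling
  assert(pickMaxCount(7, 10) == 1); // quotient 0 is clamped, shift stays defined
  return 0;
}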
2193
2194void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2195 TTI::UnrollingPreferences &UP,
2196 OptimizationRemarkEmitter *ORE) {
2197 // Enable partial unrolling and runtime unrolling.
2198 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2199
2200 UP.UpperBound = true;
2201
2202 // An inner loop is more likely to be hot, and the runtime check can be
2203 // hoisted out by the LICM pass, so the overhead is lower; try a larger
2204 // threshold to unroll more loops.
2205 if (L->getLoopDepth() > 1)
 1. Assuming the condition is false
 2. Taking false branch
2206 UP.PartialThreshold *= 2;
2207
2208 // Disable partial & runtime unrolling on -Os.
2209 UP.PartialOptSizeThreshold = 0;
2210
2211 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
 3. Assuming the condition is true
 5. Taking true branch
2212 EnableFalkorHWPFUnrollFix)
 4. Assuming the condition is true
2213 getFalkorUnrollingPreferences(L, SE, UP);
 6. Calling 'getFalkorUnrollingPreferences'
2214
2215 // Scan the loop: don't unroll loops with calls as this could prevent
2216 // inlining. Don't unroll vector loops either, as they don't benefit much from
2217 // unrolling.
2218 for (auto *BB : L->getBlocks()) {
2219 for (auto &I : *BB) {
2220 // Don't unroll vectorised loop.
2221 if (I.getType()->isVectorTy())
2222 return;
2223
2224 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2225 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2226 if (!isLoweredToCall(F))
2227 continue;
2228 }
2229 return;
2230 }
2231 }
2232 }
2233
2234 // Enable runtime unrolling for in-order models.
2235 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2236 // checking for that case, we can ensure that the default behaviour is
2237 // unchanged.
2238 if (ST->getProcFamily() != AArch64Subtarget::Others &&
2239 !ST->getSchedModel().isOutOfOrder()) {
2240 UP.Runtime = true;
2241 UP.Partial = true;
2242 UP.UnrollRemainder = true;
2243 UP.DefaultUnrollRuntimeCount = 4;
2244
2245 UP.UnrollAndJam = true;
2246 UP.UnrollAndJamInnerLoopThreshold = 60;
2247 }
2248}
2249
2250void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2251 TTI::PeelingPreferences &PP) {
2252 BaseT::getPeelingPreferences(L, SE, PP);
2253}
2254
2255Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2256 Type *ExpectedType) {
2257 switch (Inst->getIntrinsicID()) {
2258 default:
2259 return nullptr;
2260 case Intrinsic::aarch64_neon_st2:
2261 case Intrinsic::aarch64_neon_st3:
2262 case Intrinsic::aarch64_neon_st4: {
2263 // Create a struct type
2264 StructType *ST = dyn_cast<StructType>(ExpectedType);
2265 if (!ST)
2266 return nullptr;
2267 unsigned NumElts = Inst->arg_size() - 1;
2268 if (ST->getNumElements() != NumElts)
2269 return nullptr;
2270 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2271 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2272 return nullptr;
2273 }
2274 Value *Res = UndefValue::get(ExpectedType);
2275 IRBuilder<> Builder(Inst);
2276 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2277 Value *L = Inst->getArgOperand(i);
2278 Res = Builder.CreateInsertValue(Res, L, i);
2279 }
2280 return Res;
2281 }
2282 case Intrinsic::aarch64_neon_ld2:
2283 case Intrinsic::aarch64_neon_ld3:
2284 case Intrinsic::aarch64_neon_ld4:
2285 if (Inst->getType() == ExpectedType)
2286 return Inst;
2287 return nullptr;
2288 }
2289}
2290
2291bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2292 MemIntrinsicInfo &Info) {
2293 switch (Inst->getIntrinsicID()) {
2294 default:
2295 break;
2296 case Intrinsic::aarch64_neon_ld2:
2297 case Intrinsic::aarch64_neon_ld3:
2298 case Intrinsic::aarch64_neon_ld4:
2299 Info.ReadMem = true;
2300 Info.WriteMem = false;
2301 Info.PtrVal = Inst->getArgOperand(0);
2302 break;
2303 case Intrinsic::aarch64_neon_st2:
2304 case Intrinsic::aarch64_neon_st3:
2305 case Intrinsic::aarch64_neon_st4:
2306 Info.ReadMem = false;
2307 Info.WriteMem = true;
2308 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2309 break;
2310 }
2311
2312 switch (Inst->getIntrinsicID()) {
2313 default:
2314 return false;
2315 case Intrinsic::aarch64_neon_ld2:
2316 case Intrinsic::aarch64_neon_st2:
2317 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2318 break;
2319 case Intrinsic::aarch64_neon_ld3:
2320 case Intrinsic::aarch64_neon_st3:
2321 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2322 break;
2323 case Intrinsic::aarch64_neon_ld4:
2324 case Intrinsic::aarch64_neon_st4:
2325 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2326 break;
2327 }
2328 return true;
2329}
2330
2331/// See if \p I should be considered for address type promotion. We check if \p
2332 /// I is a sext with the right type that is used in memory accesses. If it is
2333 /// used in a "complex" getelementptr, we allow it to be promoted without
2334 /// finding other sext instructions that sign-extended the same initial value.
2335 /// A getelementptr is considered "complex" if it has more than 2 operands.
2336bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2337 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2338 bool Considerable = false;
2339 AllowPromotionWithoutCommonHeader = false;
2340 if (!isa<SExtInst>(&I))
2341 return false;
2342 Type *ConsideredSExtType =
2343 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2344 if (I.getType() != ConsideredSExtType)
2345 return false;
2346 // See if the sext is the one with the right type and used in at least one
2347 // GetElementPtrInst.
2348 for (const User *U : I.users()) {
2349 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2350 Considerable = true;
2351 // A getelementptr is considered as "complex" if it has more than 2
2352 // operands. We will promote a SExt used in such a complex GEP, as we
2353 // expect some of the computation to be merged if it is done on 64 bits.
2354 if (GEPInst->getNumOperands() > 2) {
2355 AllowPromotionWithoutCommonHeader = true;
2356 break;
2357 }
2358 }
2359 }
2360 return Considerable;
2361}
2362
2363bool AArch64TTIImpl::isLegalToVectorizeReduction(
2364 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2365 if (!VF.isScalable())
2366 return true;
2367
2368 Type *Ty = RdxDesc.getRecurrenceType();
2369 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2370 return false;
2371
2372 switch (RdxDesc.getRecurrenceKind()) {
2373 case RecurKind::Add:
2374 case RecurKind::FAdd:
2375 case RecurKind::And:
2376 case RecurKind::Or:
2377 case RecurKind::Xor:
2378 case RecurKind::SMin:
2379 case RecurKind::SMax:
2380 case RecurKind::UMin:
2381 case RecurKind::UMax:
2382 case RecurKind::FMin:
2383 case RecurKind::FMax:
2384 case RecurKind::SelectICmp:
2385 case RecurKind::SelectFCmp:
2386 case RecurKind::FMulAdd:
2387 return true;
2388 default:
2389 return false;
2390 }
2391}
2392
2393InstructionCost
2394AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2395 bool IsUnsigned,
2396 TTI::TargetCostKind CostKind) {
2397 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
2398
2399 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2400 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2401
2402 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2403        "Both vector needs to be equally scalable");
2404
2405 InstructionCost LegalizationCost = 0;
2406 if (LT.first > 1) {
2407 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2408 unsigned MinMaxOpcode =
2409 Ty->isFPOrFPVectorTy()
2410 ? Intrinsic::maxnum
2411 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2412 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2413 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2414 }
2415
2416 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2417}
2418
2419InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2420 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2421 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2422 InstructionCost LegalizationCost = 0;
2423 if (LT.first > 1) {
2424 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2425 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2426 LegalizationCost *= LT.first - 1;
2427 }
2428
2429 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2430 assert(ISD && "Invalid opcode");
2431 // Add the final reduction cost for the legal horizontal reduction
2432 switch (ISD) {
2433 case ISD::ADD:
2434 case ISD::AND:
2435 case ISD::OR:
2436 case ISD::XOR:
2437 case ISD::FADD:
2438 return LegalizationCost + 2;
2439 default:
2440 return InstructionCost::getInvalid();
2441 }
2442}
2443
2444InstructionCost
2445AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2446 Optional<FastMathFlags> FMF,
2447 TTI::TargetCostKind CostKind) {
2448 if (TTI::requiresOrderedReduction(FMF)) {
2449 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2450 InstructionCost BaseCost =
2451 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2452 // Add on extra cost to reflect the extra overhead on some CPUs. We still
2453 // end up vectorizing for more computationally intensive loops.
2454 return BaseCost + FixedVTy->getNumElements();
2455 }
2456
2457 if (Opcode != Instruction::FAdd)
2458 return InstructionCost::getInvalid();
2459
2460 auto *VTy = cast<ScalableVectorType>(ValTy);
2461 InstructionCost Cost =
2462 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2463 Cost *= getMaxNumElements(VTy->getElementCount());
2464 return Cost;
2465 }
2466
2467 if (isa<ScalableVectorType>(ValTy))
2468 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2469
2470 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2471 MVT MTy = LT.second;
2472 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2473 assert(ISD && "Invalid opcode");
2474
2475 // Horizontal adds can use the 'addv' instruction. We model the cost of these
2476 // instructions as twice a normal vector add, plus 1 for each legalization
2477 // step (LT.first). This is the only arithmetic vector reduction operation for
2478 // which we have an instruction.
2479 // OR, XOR and AND costs should match the codegen from:
2480 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2481 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2482 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
2483 static const CostTblEntry CostTblNoPairwise[]{
2484 {ISD::ADD, MVT::v8i8, 2},
2485 {ISD::ADD, MVT::v16i8, 2},
2486 {ISD::ADD, MVT::v4i16, 2},
2487 {ISD::ADD, MVT::v8i16, 2},
2488 {ISD::ADD, MVT::v4i32, 2},
2489 {ISD::OR, MVT::v8i8, 15},
2490 {ISD::OR, MVT::v16i8, 17},
2491 {ISD::OR, MVT::v4i16, 7},
2492 {ISD::OR, MVT::v8i16, 9},
2493 {ISD::OR, MVT::v2i32, 3},
2494 {ISD::OR, MVT::v4i32, 5},
2495 {ISD::OR, MVT::v2i64, 3},
2496 {ISD::XOR, MVT::v8i8, 15},
2497 {ISD::XOR, MVT::v16i8, 17},
2498 {ISD::XOR, MVT::v4i16, 7},
2499 {ISD::XOR, MVT::v8i16, 9},
2500 {ISD::XOR, MVT::v2i32, 3},
2501 {ISD::XOR, MVT::v4i32, 5},
2502 {ISD::XOR, MVT::v2i64, 3},
2503 {ISD::AND, MVT::v8i8, 15},
2504 {ISD::AND, MVT::v16i8, 17},
2505 {ISD::AND, MVT::v4i16, 7},
2506 {ISD::AND, MVT::v8i16, 9},
2507 {ISD::AND, MVT::v2i32, 3},
2508 {ISD::AND, MVT::v4i32, 5},
2509 {ISD::AND, MVT::v2i64, 3},
2510 };
2511 switch (ISD) {
2512 default:
2513 break;
2514 case ISD::ADD:
2515 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2516 return (LT.first - 1) + Entry->Cost;
2517 break;
2518 case ISD::XOR:
2519 case ISD::AND:
2520 case ISD::OR:
2521 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2522 if (!Entry)
2523 break;
2524 auto *ValVTy = cast<FixedVectorType>(ValTy);
2525 if (!ValVTy->getElementType()->isIntegerTy(1) &&
2526 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2527 isPowerOf2_32(ValVTy->getNumElements())) {
2528 InstructionCost ExtraCost = 0;
2529 if (LT.first != 1) {
2530 // Type needs to be split, so there is an extra cost of LT.first - 1
2531 // arithmetic ops.
2532 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2533 MTy.getVectorNumElements());
2534 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2535 ExtraCost *= LT.first - 1;
2536 }
2537 return Entry->Cost + ExtraCost;
2538 }
2539 break;
2540 }
2541 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2542}
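
For the integer add reduction handled above, the final cost is the CostTblNoPairwise entry plus one per extra legalization step. A quick worked check against the {ISD::ADD, MVT::v8i16, 2} entry:

#include <cassert>

int main() {
  const int AddvEntryCost = 2;          // {ISD::ADD, MVT::v8i16, 2}
  assert((1 - 1) + AddvEntryCost == 2); // <8 x i16>: already legal (LT.first = 1)
  assert((2 - 1) + AddvEntryCost == 3); // <16 x i16>: split once (LT.first = 2)
  return 0;
}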
2543
2544InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2545 static const CostTblEntry ShuffleTbl[] = {
2546 { TTI::SK_Splice, MVT::nxv16i8, 1 },
2547 { TTI::SK_Splice, MVT::nxv8i16, 1 },
2548 { TTI::SK_Splice, MVT::nxv4i32, 1 },
2549 { TTI::SK_Splice, MVT::nxv2i64, 1 },
2550 { TTI::SK_Splice, MVT::nxv2f16, 1 },
2551 { TTI::SK_Splice, MVT::nxv4f16, 1 },
2552 { TTI::SK_Splice, MVT::nxv8f16, 1 },
2553 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2554 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2555 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2556 { TTI::SK_Splice, MVT::nxv2f32, 1 },
2557 { TTI::SK_Splice, MVT::nxv4f32, 1 },
2558 { TTI::SK_Splice, MVT::nxv2f64, 1 },
2559 };
2560
2561 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2562 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2563 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2564 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2565 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2566 : LT.second;
2567 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2568 InstructionCost LegalizationCost = 0;
2569 if (Index < 0) {
2570 LegalizationCost =
2571 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2572 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2573 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2574 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2575 }
2576
2577 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
2578 // The cost is computed on the promoted type.
2579 if (LT.second.getScalarType() == MVT::i1) {
2580 LegalizationCost +=
2581 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2582 TTI::CastContextHint::None, CostKind) +
2583 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2584 TTI::CastContextHint::None, CostKind);
2585 }
2586 const auto *Entry =
2587 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2588 assert(Entry && "Illegal Type for Splice");
2589 LegalizationCost += Entry->Cost;
2590 return LegalizationCost * LT.first;
2591}
2592
2593InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2594 VectorType *Tp,
2595 ArrayRef<int> Mask, int Index,
2596 VectorType *SubTp,
2597 ArrayRef<Value *> Args) {
2598 Kind = improveShuffleKindFromMask(Kind, Mask);
2599 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2600 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2601 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2602 Kind == TTI::SK_Reverse) {
2603 static const CostTblEntry ShuffleTbl[] = {
2604 // Broadcast shuffle kinds can be performed with 'dup'.
2605 { TTI::SK_Broadcast, MVT::v8i8, 1 },
2606 { TTI::SK_Broadcast, MVT::v16i8, 1 },
2607 { TTI::SK_Broadcast, MVT::v4i16, 1 },
2608 { TTI::SK_Broadcast, MVT::v8i16, 1 },
2609 { TTI::SK_Broadcast, MVT::v2i32, 1 },
2610 { TTI::SK_Broadcast, MVT::v4i32, 1 },
2611 { TTI::SK_Broadcast, MVT::v2i64, 1 },
2612 { TTI::SK_Broadcast, MVT::v2f32, 1 },
2613 { TTI::SK_Broadcast, MVT::v4f32, 1 },
2614 { TTI::SK_Broadcast, MVT::v2f64, 1 },
2615 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2616 // 'zip1/zip2' instructions.
2617 { TTI::SK_Transpose, MVT::v8i8, 1 },
2618 { TTI::SK_Transpose, MVT::v16i8, 1 },
2619 { TTI::SK_Transpose, MVT::v4i16, 1 },
2620 { TTI::SK_Transpose, MVT::v8i16, 1 },
2621 { TTI::SK_Transpose, MVT::v2i32, 1 },
2622 { TTI::SK_Transpose, MVT::v4i32, 1 },
2623 { TTI::SK_Transpose, MVT::v2i64, 1 },
2624 { TTI::SK_Transpose, MVT::v2f32, 1 },
2625 { TTI::SK_Transpose, MVT::v4f32, 1 },
2626 { TTI::SK_Transpose, MVT::v2f64, 1 },
2627 // Select shuffle kinds.
2628 // TODO: handle vXi8/vXi16.
2629 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2630 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2631 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2632 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2633 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2634 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2635 // PermuteSingleSrc shuffle kinds.
2636 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2637 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2638 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2639 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2640 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2641 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2642 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2643 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2644 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2645 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2646 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2647 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2648 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2649 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2650 // Reverse can be lowered with `rev`.
2651 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2652 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2653 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2654 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2655 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2656 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2657 // Broadcast shuffle kinds for scalable vectors
2658 { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2659 { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2660 { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2661 { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2662 { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2663 { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2664 { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2665 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2666 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2667 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2668 { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2669 { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2670 { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2671 { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2672 { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2673 { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2674 { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2675 // Handle the cases for vector.reverse with scalable vectors
2676 { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2677 { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2678 { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2679 { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2680 { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2681 { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2682 { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2683 { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2684 { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2685 { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2686 { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2687 { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2688 { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2689 { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2690 { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2691 { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2692 { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2693 };
2694 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2695 return LT.first * Entry->Cost;
2696 }
2697
2698 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2699 return getSpliceCost(Tp, Index);
2700
2701 // Inserting a subvector can often be done with either a D, S or H register
2702 // move, so long as the inserted vector is "aligned".
2703 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
2704 LT.second.getSizeInBits() <= 128 && SubTp) {
2705 std::pair<InstructionCost, MVT> SubLT =
2706 TLI->getTypeLegalizationCost(DL, SubTp);
2707 if (SubLT.second.isVector()) {
2708 int NumElts = LT.second.getVectorNumElements();
2709 int NumSubElts = SubLT.second.getVectorNumElements();
2710 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
2711 return SubLT.first;
2712 }
2713 }
2714
2715 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2716}
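
The function above resolves a (shuffle kind, legalized MVT) pair against a static cost table via CostTableLookup. The standalone sketch below illustrates only that table-lookup pattern; ShuffleKind, SimpleVT and CostEntry here are illustrative stand-ins, not LLVM's CostTblEntry API.

// Illustrative sketch of the table-driven cost lookup used above.
// Names and values are placeholders, not LLVM's real types.
#include <cassert>

enum class ShuffleKind { Broadcast, Transpose, Reverse };
enum class SimpleVT { v8i8, v16i8, v4i32 };

struct CostEntry {
  ShuffleKind Kind;
  SimpleVT Ty;
  unsigned Cost;
};

static const CostEntry Table[] = {
    {ShuffleKind::Broadcast, SimpleVT::v8i8, 1},
    {ShuffleKind::Broadcast, SimpleVT::v16i8, 1},
    {ShuffleKind::Reverse, SimpleVT::v4i32, 1},
};

// Linear scan, mirroring the lookup pattern: return the first matching
// entry or nullptr if the (kind, type) pair is not covered by the table.
static const CostEntry *lookup(ShuffleKind K, SimpleVT Ty) {
  for (const CostEntry &E : Table)
    if (E.Kind == K && E.Ty == Ty)
      return &E;
  return nullptr;
}

int main() {
  assert(lookup(ShuffleKind::Reverse, SimpleVT::v4i32)->Cost == 1);
  assert(lookup(ShuffleKind::Transpose, SimpleVT::v4i32) == nullptr);
  return 0;
}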

/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count the number of 0s from the least significant bit to the most,
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count the number of 0s from the most significant bit to the least,
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
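
A minimal usage sketch for the two counting helpers above (not part of the listed source), assuming LLVM's headers are on the include path; the expected values follow directly from the definitions.

// Usage sketch: countTrailingZeros / countLeadingZeros.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::countTrailingZeros(uint32_t(8)) == 3);   // 0b1000
  assert(llvm::countLeadingZeros(uint32_t(1)) == 31);
  // With the default ZB_Width behavior, an input of 0 yields the bit width.
  assert(llvm::countTrailingZeros(uint32_t(0)) == 32);
  assert(llvm::countLeadingZeros(uint64_t(0)) == 64);
  return 0;
}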
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
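
A short sketch of the four mask builders above (an illustrative addition, assuming the LLVM include path is available); the constants are worked out from the definitions.

// Usage sketch: the mask builders.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maskTrailingOnes<uint32_t>(8) == 0x000000FFu);
  assert(llvm::maskLeadingOnes<uint32_t>(4) == 0xF0000000u);
  assert(llvm::maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u);
  assert(llvm::maskLeadingZeros<uint32_t>(4) == 0x0FFFFFFFu);
  return 0;
}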
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
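
The following sketch (not from the source) exercises findFirstSet and findLastSet, including the default ZB_Max behavior on an input of zero.

// Usage sketch: bit-index queries.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  assert(llvm::findFirstSet(uint32_t(0x12)) == 1);   // 0b10010
  assert(llvm::findLastSet(uint32_t(0x12)) == 4);
  // ZB_Max (the default) maps an input of 0 to the maximum value of T.
  assert(llvm::findFirstSet(uint32_t(0)) ==
         std::numeric_limits<uint32_t>::max());
  return 0;
}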
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
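
A compile-time sketch of the 32/64-bit splitting helpers above; the static_asserts are illustrative additions, not part of the header.

// Usage sketch: Hi_32 / Lo_32 / Make_64.
#include "llvm/Support/MathExtras.h"

static_assert(llvm::Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u, "high half");
static_assert(llvm::Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u, "low half");
static_assert(llvm::Make_64(0x12345678u, 0x9ABCDEF0u) == 0x123456789ABCDEF0ULL,
              "round trip");
int main() { return 0; }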
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number).");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
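
A compile-time sketch of isInt and isShiftedInt (illustrative, not from the source); 508 is chosen because it equals 127 shifted left by 2.

// Usage sketch: signed range checks.
#include "llvm/Support/MathExtras.h"

static_assert(llvm::isInt<8>(127) && !llvm::isInt<8>(128), "int8 range");
static_assert(llvm::isInt<16>(-32768) && !llvm::isInt<16>(-32769), "int16 range");
// 508 == 127 << 2, so it is an 8-bit value shifted left by 2; 2 is not.
static_assert(llvm::isShiftedInt<8, 2>(508) && !llvm::isShiftedInt<8, 2>(2),
              "shifted int");
int main() { return 0; }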
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
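
A runtime sketch of the dynamic-width limits and checks above (maxUIntN, minIntN, maxIntN, isUIntN, isIntN); the values are derived from the definitions and the example is not part of the header.

// Usage sketch: dynamic-width range limits.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maxUIntN(8) == 255);
  assert(llvm::minIntN(8) == -128);
  assert(llvm::maxIntN(8) == 127);
  assert(llvm::maxUIntN(64) == UINT64_MAX);
  assert(llvm::isUIntN(12, 4095) && !llvm::isUIntN(12, 4096));
  assert(llvm::isIntN(12, -2048) && !llvm::isIntN(12, 2048));
  return 0;
}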
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
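
A compile-time sketch of the mask and power-of-two predicates above; the test constants are illustrative.

// Usage sketch: mask and power-of-two predicates.
#include "llvm/Support/MathExtras.h"

static_assert(llvm::isMask_32(0x0000FFFFu) && !llvm::isMask_32(0x0000FF01u),
              "contiguous low mask");
static_assert(llvm::isShiftedMask_32(0x0000FF00u) &&
                  !llvm::isShiftedMask_32(0x0000FF0Fu),
              "contiguous shifted mask");
static_assert(llvm::isPowerOf2_32(0x00100000u) && !llvm::isPowerOf2_64(0),
              "powers of two");
int main() { return 0; }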
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Return true if the argument contains a non-empty sequence of ones with the
575/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
576/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
577/// MaskLen is updated to specify the length of the mask, else neither are
578/// updated.
579inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
580 unsigned &MaskLen) {
581 if (!isShiftedMask_32(Value))
582 return false;
583 MaskIdx = countTrailingZeros(Value);
584 MaskLen = countPopulation(Value);
585 return true;
586}
587
588/// Return true if the argument contains a non-empty sequence of ones with the
589/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
590/// of the lowest set bit and \p MaskLen is updated to specify the length of the
591/// mask, else neither are updated.
592inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
593 unsigned &MaskLen) {
594 if (!isShiftedMask_64(Value))
595 return false;
596 MaskIdx = countTrailingZeros(Value);
597 MaskLen = countPopulation(Value);
598 return true;
599}
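
A short sketch combining countPopulation with the mask-decomposing isShiftedMask_32 overload above; the example and its constants are illustrative additions.

// Usage sketch: population count and mask decomposition.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::countPopulation(uint32_t(0xF000F000)) == 8);
  unsigned MaskIdx = 0, MaskLen = 0;
  // 0x0000FF00 is eight ones starting at bit 8.
  assert(llvm::isShiftedMask_32(0x0000FF00u, MaskIdx, MaskLen));
  assert(MaskIdx == 8 && MaskLen == 8);
  return 0;
}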
600
601/// Compile time Log2.
602/// Valid only for positive powers of two.
603template <size_t kValue> constexpr inline size_t CTLog2() {
604 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
605 "Value is not a valid power of 2");
606 return 1 + CTLog2<kValue / 2>();
607}
608
609template <> constexpr inline size_t CTLog2<1>() { return 0; }
610
611/// Return the log base 2 of the specified value.
612inline double Log2(double Value) {
613#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
614 return __builtin_log(Value) / __builtin_log(2.0);
615#else
616 return log2(Value);
617#endif
618}
619
620/// Return the floor log base 2 of the specified value, -1 if the value is zero.
621/// (32 bit edition.)
622/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
623inline unsigned Log2_32(uint32_t Value) {
624 return 31 - countLeadingZeros(Value);
12. Returning the value 4294967295
625}
626
627/// Return the floor log base 2 of the specified value, -1 if the value is zero.
628/// (64 bit edition.)
629inline unsigned Log2_64(uint64_t Value) {
630 return 63 - countLeadingZeros(Value);
631}
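
These two floor-log helpers are where the wraparound recorded in the analyzer step above (the value 4294967295) originates: for an input of 0, countLeadingZeros returns the full bit width and the subtraction wraps. A small illustrative sketch follows; the Width variable and the zero guard are assumptions for the example, not code from the source.

// Usage sketch: floor log2 and its zero-input hazard.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::Log2_32(32) == 5);
  assert(llvm::Log2_32(6) == 2);
  assert(llvm::Log2_64(1) == 0);
  // For an input of 0 the "-1" wraps to 4294967295 in the unsigned return
  // type, so callers should guard against zero before reusing the result
  // (for example as a shift count).
  uint32_t Width = 16;                               // hypothetical value
  unsigned Shift = Width ? llvm::Log2_32(Width) : 0; // assumed guard
  assert((1u << Shift) == 16u);
  return 0;
}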
632
633/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
634/// (32 bit edition).
635/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
636inline unsigned Log2_32_Ceil(uint32_t Value) {
637 return 32 - countLeadingZeros(Value - 1);
638}
639
640/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
641/// (64 bit edition.)
642inline unsigned Log2_64_Ceil(uint64_t Value) {
643 return 64 - countLeadingZeros(Value - 1);
644}
645
646/// Return the greatest common divisor of the values using Euclid's algorithm.
647template <typename T>
648inline T greatestCommonDivisor(T A, T B) {
649 while (B) {
650 T Tmp = B;
651 B = A % B;
652 A = Tmp;
653 }
654 return A;
655}
656
657inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
658 return greatestCommonDivisor<uint64_t>(A, B);
659}
660
661/// This function takes a 64-bit integer and returns the bit equivalent double.
662inline double BitsToDouble(uint64_t Bits) {
663 double D;
664 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
665 memcpy(&D, &Bits, sizeof(Bits));
666 return D;
667}
668
669/// This function takes a 32-bit integer and returns the bit equivalent float.
670inline float BitsToFloat(uint32_t Bits) {
671 float F;
672 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
673 memcpy(&F, &Bits, sizeof(Bits));
674 return F;
675}
676
677/// This function takes a double and returns the bit equivalent 64-bit integer.
678/// Note that copying doubles around changes the bits of NaNs on some hosts,
679/// notably x86, so this routine cannot be used if these bits are needed.
680inline uint64_t DoubleToBits(double Double) {
681 uint64_t Bits;
682 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
683 memcpy(&Bits, &Double, sizeof(Double));
684 return Bits;
685}
686
687/// This function takes a float and returns the bit equivalent 32-bit integer.
688/// Note that copying floats around changes the bits of NaNs on some hosts,
689/// notably x86, so this routine cannot be used if these bits are needed.
690inline uint32_t FloatToBits(float Float) {
691 uint32_t Bits;
692 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
693 memcpy(&Bits, &Float, sizeof(Float));
694 return Bits;
695}
696
697/// A and B are either alignments or offsets. Return the minimum alignment that
698/// may be assumed after adding the two together.
699constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
700 // The largest power of 2 that divides both A and B.
701 //
702 // Replace "-Value" by "1+~Value" in the following commented code to avoid
703 // MSVC warning C4146
704 // return (A | B) & -(A | B);
705 return (A | B) & (1 + ~(A | B));
706}
707
708/// Returns the next power of two (in 64-bits) that is strictly greater than A.
709/// Returns zero on overflow.
710constexpr inline uint64_t NextPowerOf2(uint64_t A) {
711 A |= (A >> 1);
712 A |= (A >> 2);
713 A |= (A >> 4);
714 A |= (A >> 8);
715 A |= (A >> 16);
716 A |= (A >> 32);
717 return A + 1;
718}
719
720/// Returns the power of two which is less than or equal to the given value.
721/// Essentially, it is a floor operation across the domain of powers of two.
722inline uint64_t PowerOf2Floor(uint64_t A) {
723 if (!A) return 0;
724 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
725}
726
727/// Returns the power of two which is greater than or equal to the given value.
728/// Essentially, it is a ceil operation across the domain of powers of two.
729inline uint64_t PowerOf2Ceil(uint64_t A) {
730 if (!A)
731 return 0;
732 return NextPowerOf2(A - 1);
733}
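
A runtime sketch of the alignment and power-of-two rounding helpers above (MinAlign, NextPowerOf2, PowerOf2Floor, PowerOf2Ceil); not part of the header.

// Usage sketch: power-of-two rounding.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::MinAlign(8, 12) == 4);   // largest power of 2 dividing both
  assert(llvm::NextPowerOf2(5) == 8);   // strictly greater than the input
  assert(llvm::NextPowerOf2(8) == 16);
  assert(llvm::PowerOf2Floor(5) == 4);
  assert(llvm::PowerOf2Ceil(5) == 8);
  assert(llvm::PowerOf2Ceil(8) == 8);   // already a power of two
  return 0;
}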
734
735/// Returns the next integer (mod 2**64) that is greater than or equal to
736/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
737///
738/// If non-zero \p Skew is specified, the return value will be a minimal
739/// integer that is greater than or equal to \p Value and equal to
740/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
741/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
742///
743/// Examples:
744/// \code
745/// alignTo(5, 8) = 8
746/// alignTo(17, 8) = 24
747/// alignTo(~0LL, 8) = 0
748/// alignTo(321, 255) = 510
749///
750/// alignTo(5, 8, 7) = 7
751/// alignTo(17, 8, 1) = 17
752/// alignTo(~0LL, 8, 3) = 3
753/// alignTo(321, 255, 42) = 552
754/// \endcode
755inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
756 assert(Align != 0u && "Align can't be 0.");
757 Skew %= Align;
758 return (Value + Align - 1 - Skew) / Align * Align + Skew;
759}
760
761/// Returns the next integer (mod 2**64) that is greater than or equal to
762/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
763template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
764 static_assert(Align != 0u, "Align must be non-zero");
765 return (Value + Align - 1) / Align * Align;
766}
767
768/// Returns the integer ceil(Numerator / Denominator).
769inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
770 return alignTo(Numerator, Denominator) / Denominator;
771}
772
773/// Returns the integer nearest(Numerator / Denominator).
774inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
775 return (Numerator + (Denominator / 2)) / Denominator;
776}
777
778/// Returns the largest uint64_t less than or equal to \p Value that is
779/// \p Skew mod \p Align. \p Align must be non-zero.
780inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
781 assert(Align != 0u && "Align can't be 0.");
782 Skew %= Align;
783 return (Value - Skew) / Align * Align + Skew;
784}
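
A sketch of the alignTo family, reusing the values from the doc comment above plus a few derived ones; illustrative only.

// Usage sketch: alignment rounding.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(17, 8) == 24);
  assert(llvm::alignTo(17, 8, 1) == 17);    // skewed alignment
  assert(llvm::alignTo<8>(5) == 8);         // compile-time alignment
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::divideCeil(17, 8) == 3);
  assert(llvm::divideNearest(17, 8) == 2);  // (17 + 4) / 8
  return 0;
}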
785
786/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
787/// Requires 0 < B <= 32.
788template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
789 static_assert(B > 0, "Bit width can't be 0.");
790 static_assert(B <= 32, "Bit width out of range.");
791 return int32_t(X << (32 - B)) >> (32 - B);
792}
793
794/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
795/// Requires 0 < B <= 32.
796inline int32_t SignExtend32(uint32_t X, unsigned B) {
797 assert(B > 0 && "Bit width can't be 0.");
798 assert(B <= 32 && "Bit width out of range.");
799 return int32_t(X << (32 - B)) >> (32 - B);
800}
801
802/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
803/// Requires 0 < B <= 64.
804template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
805 static_assert(B > 0, "Bit width can't be 0.");
806 static_assert(B <= 64, "Bit width out of range.");
807 return int64_t(x << (64 - B)) >> (64 - B);
808}
809
810/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
811/// Requires 0 < B <= 64.
812inline int64_t SignExtend64(uint64_t X, unsigned B) {
813 assert(B > 0 && "Bit width can't be 0.");
814 assert(B <= 64 && "Bit width out of range.");
815 return int64_t(X << (64 - B)) >> (64 - B);
816}
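
A sketch of the sign-extension helpers above, covering both the compile-time and runtime bit-width forms; illustrative only.

// Usage sketch: sign extension from a narrow field.
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::SignExtend32<8>(0xFF) == -1);
  assert(llvm::SignExtend32<8>(0x7F) == 127);
  assert(llvm::SignExtend32(0x800, 12) == -2048);   // runtime bit width
  assert(llvm::SignExtend64<16>(0x8000) == -32768);
  return 0;
}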
817
818/// Subtract two unsigned integers, X and Y, of type T and return the absolute
819/// value of the result.
820template <typename T>
821std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
822 return X > Y ? (X - Y) : (Y - X);
823}
824
825/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
826/// maximum representable value of T on overflow. ResultOverflowed indicates if
827/// the result is larger than the maximum representable value of type T.
828template <typename T>
829std::enable_if_t<std::is_unsigned<T>::value, T>
830SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
831 bool Dummy;
832 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
833 // Hacker's Delight, p. 29
834 T Z = X + Y;
835 Overflowed = (Z < X || Z < Y);
836 if (Overflowed)
837 return std::numeric_limits<T>::max();
838 else
839 return Z;
840}
841
842/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
843/// maximum representable value of T on overflow. ResultOverflowed indicates if
844/// the result is larger than the maximum representable value of type T.
845template <typename T>
846std::enable_if_t<std::is_unsigned<T>::value, T>
847SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
848 bool Dummy;
849 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
850
851 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
852 // because it fails for uint16_t (where multiplication can have undefined
853 // behavior due to promotion to int), and requires a division in addition
854 // to the multiplication.
855
856 Overflowed = false;
857
858 // Log2(Z) would be either Log2Z or Log2Z + 1.
859 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
860 // will necessarily be less than Log2Max as desired.
861 int Log2Z = Log2_64(X) + Log2_64(Y);
862 const T Max = std::numeric_limits<T>::max();
863 int Log2Max = Log2_64(Max);
864 if (Log2Z < Log2Max) {
865 return X * Y;
866 }
867 if (Log2Z > Log2Max) {
868 Overflowed = true;
869 return Max;
870 }
871
872 // We're going to use the top bit, and maybe overflow one
873 // bit past it. Multiply all but the bottom bit then add
874 // that on at the end.
875 T Z = (X >> 1) * Y;
876 if (Z & ~(Max >> 1)) {
877 Overflowed = true;
878 return Max;
879 }
880 Z <<= 1;
881 if (X & 1)
882 return SaturatingAdd(Z, Y, ResultOverflowed);
883
884 return Z;
885}
886
887/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
888/// the product. Clamp the result to the maximum representable value of T on
889/// overflow. ResultOverflowed indicates if the result is larger than the
890/// maximum representable value of type T.
891template <typename T>
892std::enable_if_t<std::is_unsigned<T>::value, T>
893SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
894 bool Dummy;
895 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
896
897 T Product = SaturatingMultiply(X, Y, &Overflowed);
898 if (Overflowed)
899 return Product;
900
901 return SaturatingAdd(A, Product, &Overflowed);
902}
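
A sketch of the saturating helpers above, instantiated at uint8_t so that overflow is easy to trigger; the explicit template argument is needed because integer literals would otherwise deduce a signed T and fail the enable_if constraint.

// Usage sketch: saturating unsigned arithmetic.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  bool Overflowed = false;
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 &&
         Overflowed);
  assert(llvm::SaturatingAdd<uint8_t>(20, 30) == 50);
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 &&
         Overflowed);
  assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 10, 50, &Overflowed) == 150 &&
         !Overflowed);
  return 0;
}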
903
904/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
905extern const float huge_valf;
906
907
908/// Add two signed integers, computing the two's complement truncated result,
909/// returning true if overflow occurred.
910template <typename T>
911std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
912#if __has_builtin(__builtin_add_overflow)
913 return __builtin_add_overflow(X, Y, &Result);
914#else
915 // Perform the unsigned addition.
916 using U = std::make_unsigned_t<T>;
917 const U UX = static_cast<U>(X);
918 const U UY = static_cast<U>(Y);
919 const U UResult = UX + UY;
920
921 // Convert to signed.
922 Result = static_cast<T>(UResult);
923
924 // Adding two positive numbers should result in a positive number.
925 if (X > 0 && Y > 0)
926 return Result <= 0;
927 // Adding two negatives should result in a negative number.
928 if (X < 0 && Y < 0)
929 return Result >= 0;
930 return false;
931#endif
932}
933
934/// Subtract two signed integers, computing the two's complement truncated
935/// result, returning true if an overflow occurred.
936template <typename T>
937std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
938#if __has_builtin(__builtin_sub_overflow)
939 return __builtin_sub_overflow(X, Y, &Result);
940#else
941 // Perform the unsigned addition.
942 using U = std::make_unsigned_t<T>;
943 const U UX = static_cast<U>(X);
944 const U UY = static_cast<U>(Y);
945 const U UResult = UX - UY;
946
947 // Convert to signed.
948 Result = static_cast<T>(UResult);
949
950 // Subtracting a positive number from a negative results in a negative number.
951 if (X <= 0 && Y > 0)
952 return Result >= 0;
953 // Subtracting a negative number from a positive results in a positive number.
954 if (X >= 0 && Y < 0)
955 return Result <= 0;
956 return false;
957#endif
958}
959
960/// Multiply two signed integers, computing the two's complement truncated
961/// result, returning true if an overflow occurred.
962template <typename T>
963std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
964 // Perform the unsigned multiplication on absolute values.
965 using U = std::make_unsigned_t<T>;
966 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
967 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
968 const U UResult = UX * UY;
969
970 // Convert to signed.
971 const bool IsNegative = (X < 0) ^ (Y < 0);
972 Result = IsNegative ? (0 - UResult) : UResult;
973
974 // If any of the args was 0, result is 0 and no overflow occurs.
975 if (UX == 0 || UY == 0)
976 return false;
977
978 // UX and UY are in [1, 2^n], where n is the number of digits.
979 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
980 // positive) divided by an argument compares to the other.
981 if (IsNegative)
982 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
983 else
984 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
985}
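
A sketch of the overflow-checked signed helpers above; note that they return a value of T that is nonzero exactly when the truncated result overflowed, so the return value can be used directly as a boolean.

// Usage sketch: overflow-checked signed arithmetic.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <limits>

int main() {
  int Result = 0;
  assert(!llvm::AddOverflow(3, 4, Result) && Result == 7);
  assert(llvm::AddOverflow(std::numeric_limits<int>::max(), 1, Result));
  assert(llvm::SubOverflow(std::numeric_limits<int>::min(), 1, Result));
  assert(!llvm::MulOverflow(-3, 4, Result) && Result == -12);
  assert(llvm::MulOverflow(std::numeric_limits<int>::max(), 2, Result));
  return 0;
}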
986
987} // End llvm namespace
988
989#endif