Bug Summary

File: /build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 2741, column 21
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
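
For orientation, here is a minimal sketch of the undefined behaviour the checker describes (illustrative only, not taken from the analyzed file; the flagged expression is at line 2741, outside the excerpt below): shifting by an amount greater than or equal to the type's bit width, such as 4294967295 (~0U). The diagnostic above concerns 'int'; the sketch uses unsigned so the example itself stays fully defined, and shows the kind of guard the analyzer wants to see.

// toy_shift.cpp -- hypothetical example, not from the analyzed file
#include <climits>
#include <cstdio>

unsigned shiftBit(unsigned Amount) {
  // The analyzer's complaint is about a shift count >= the type's width.
  if (Amount >= sizeof(unsigned) * CHAR_BIT)
    return 0;
  return 1u << Amount; // in range only because of the guard above
}

int main() {
  std::printf("%u\n", shiftBit(3u));  // prints 8
  std::printf("%u\n", shiftBit(~0u)); // without the guard this count would be oversized
}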

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/source/llvm/lib/Target/AArch64 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1680300532 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-04-01-083001-16331-1 -x c++ /build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

/build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "MCTargetDesc/AArch64AddressingModes.h"
13#include "llvm/Analysis/IVDescriptors.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/Analysis/TargetTransformInfo.h"
16#include "llvm/CodeGen/BasicTTIImpl.h"
17#include "llvm/CodeGen/CostTable.h"
18#include "llvm/CodeGen/TargetLowering.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
22#include "llvm/IR/PatternMatch.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/Transforms/InstCombine/InstCombiner.h"
25#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42namespace {
43class TailFoldingKind {
44private:
45 uint8_t Bits = 0; // Currently defaults to disabled.
46
47public:
48 enum TailFoldingOpts {
49 TFDisabled = 0x0,
50 TFReductions = 0x01,
51 TFRecurrences = 0x02,
52 TFReverse = 0x04,
53 TFSimple = 0x80,
54 TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
55 };
56
57 void operator=(const std::string &Val) {
58 if (Val.empty())
59 return;
60 SmallVector<StringRef, 6> TailFoldTypes;
61 StringRef(Val).split(TailFoldTypes, '+', -1, false);
62 for (auto TailFoldType : TailFoldTypes) {
63 if (TailFoldType == "disabled")
64 Bits = 0;
65 else if (TailFoldType == "all")
66 Bits = TFAll;
67 else if (TailFoldType == "default")
68 Bits = 0; // Currently defaults to never tail-folding.
69 else if (TailFoldType == "simple")
70 add(TFSimple);
71 else if (TailFoldType == "reductions")
72 add(TFReductions);
73 else if (TailFoldType == "recurrences")
74 add(TFRecurrences);
75 else if (TailFoldType == "reverse")
76 add(TFReverse);
77 else if (TailFoldType == "noreductions")
78 remove(TFReductions);
79 else if (TailFoldType == "norecurrences")
80 remove(TFRecurrences);
81 else if (TailFoldType == "noreverse")
82 remove(TFReverse);
83 else {
84 errs()
85 << "invalid argument " << TailFoldType.str()
86 << " to -sve-tail-folding=; each element must be one of: disabled, "
87 "all, default, simple, reductions, noreductions, recurrences, "
88 "norecurrences\n";
89 }
90 }
91 }
92
93 operator uint8_t() const { return Bits; }
94
95 void add(uint8_t Flag) { Bits |= Flag; }
96 void remove(uint8_t Flag) { Bits &= ~Flag; }
97};
98} // namespace
99
100TailFoldingKind TailFoldingKindLoc;
101
102cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
103 "sve-tail-folding",
104 cl::desc(
105 "Control the use of vectorisation using tail-folding for SVE:"
106 "\ndisabled No loop types will vectorize using tail-folding"
107 "\ndefault Uses the default tail-folding settings for the target "
108 "CPU"
109 "\nall All legal loop types will vectorize using tail-folding"
110 "\nsimple Use tail-folding for simple loops (not reductions or "
111 "recurrences)"
112 "\nreductions Use tail-folding for loops containing reductions"
113 "\nrecurrences Use tail-folding for loops containing fixed order "
114 "recurrences"
115 "\nreverse Use tail-folding for loops requiring reversed "
116 "predicates"),
117 cl::location(TailFoldingKindLoc));
118
119// Experimental option that will only be fully functional when the
120// code-generator is changed to use SVE instead of NEON for all fixed-width
121// operations.
122static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
123 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
124
125// Experimental option that will only be fully functional when the cost-model
126// and code-generator have been changed to avoid using scalable vector
127// instructions that are not legal in streaming SVE mode.
128static cl::opt<bool> EnableScalableAutovecInStreamingMode(
129 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
130
131bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
132 const Function *Callee) const {
133 SMEAttrs CallerAttrs(*Caller);
134 SMEAttrs CalleeAttrs(*Callee);
135 if (CallerAttrs.requiresSMChange(CalleeAttrs,
136 /*BodyOverridesInterface=*/true) ||
137 CallerAttrs.requiresLazySave(CalleeAttrs) ||
138 CalleeAttrs.hasNewZAInterface())
139 return false;
140
141 const TargetMachine &TM = getTLI()->getTargetMachine();
142
143 const FeatureBitset &CallerBits =
144 TM.getSubtargetImpl(*Caller)->getFeatureBits();
145 const FeatureBitset &CalleeBits =
146 TM.getSubtargetImpl(*Callee)->getFeatureBits();
147
148 // Inline a callee if its target-features are a subset of the callers
149 // target-features.
150 return (CallerBits & CalleeBits) == CalleeBits;
151}
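
A toy illustration (hypothetical, not part of the analyzed file) of the subset test on line 150 above: with plain bitmasks standing in for FeatureBitset, the callee is considered inline-compatible only when every feature bit it requires is also set for the caller.

// subset_check.cpp -- hypothetical stand-in for the FeatureBitset comparison
#include <cassert>
#include <cstdint>

constexpr bool isSubset(uint64_t CallerBits, uint64_t CalleeBits) {
  // Same shape as the check in areInlineCompatible: intersecting the two
  // feature sets must reproduce the callee's set exactly.
  return (CallerBits & CalleeBits) == CalleeBits;
}

int main() {
  constexpr uint64_t NEON = 1u << 0, SVE = 1u << 1, SVE2 = 1u << 2;
  assert(isSubset(NEON | SVE | SVE2, NEON | SVE)); // callee needs a subset: OK
  assert(!isSubset(NEON | SVE, NEON | SVE2));      // callee needs SVE2: reject
}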
152
153bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
154 TargetTransformInfo::RegisterKind K) const {
155 assert(K != TargetTransformInfo::RGK_Scalar);
156 return K == TargetTransformInfo::RGK_FixedWidthVector;
157}
158
159/// Calculate the cost of materializing a 64-bit value. This helper
160/// method might only calculate a fraction of a larger immediate. Therefore it
161/// is valid to return a cost of ZERO.
162InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
163 // Check if the immediate can be encoded within an instruction.
164 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
165 return 0;
166
167 if (Val < 0)
168 Val = ~Val;
169
170 // Calculate how many moves we will need to materialize this constant.
171 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
172 AArch64_IMM::expandMOVImm(Val, 64, Insn);
173 return Insn.size();
174}
175
176/// Calculate the cost of materializing the given constant.
177InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
178 TTI::TargetCostKind CostKind) {
179 assert(Ty->isIntegerTy());
180
181 unsigned BitSize = Ty->getPrimitiveSizeInBits();
182 if (BitSize == 0)
183 return ~0U;
184
185 // Sign-extend all constants to a multiple of 64-bit.
186 APInt ImmVal = Imm;
187 if (BitSize & 0x3f)
188 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
189
190 // Split the constant into 64-bit chunks and calculate the cost for each
191 // chunk.
192 InstructionCost Cost = 0;
193 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
194 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
195 int64_t Val = Tmp.getSExtValue();
196 Cost += getIntImmCost(Val);
197 }
198 // We need at least one instruction to materialize the constant.
199 return std::max<InstructionCost>(1, Cost);
200}
201
202InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
203 const APInt &Imm, Type *Ty,
204 TTI::TargetCostKind CostKind,
205 Instruction *Inst) {
206 assert(Ty->isIntegerTy());
207
208 unsigned BitSize = Ty->getPrimitiveSizeInBits();
209 // There is no cost model for constants with a bit size of 0. Return TCC_Free
210 // here, so that constant hoisting will ignore this constant.
211 if (BitSize == 0)
212 return TTI::TCC_Free;
213
214 unsigned ImmIdx = ~0U;
215 switch (Opcode) {
216 default:
217 return TTI::TCC_Free;
218 case Instruction::GetElementPtr:
219 // Always hoist the base address of a GetElementPtr.
220 if (Idx == 0)
221 return 2 * TTI::TCC_Basic;
222 return TTI::TCC_Free;
223 case Instruction::Store:
224 ImmIdx = 0;
225 break;
226 case Instruction::Add:
227 case Instruction::Sub:
228 case Instruction::Mul:
229 case Instruction::UDiv:
230 case Instruction::SDiv:
231 case Instruction::URem:
232 case Instruction::SRem:
233 case Instruction::And:
234 case Instruction::Or:
235 case Instruction::Xor:
236 case Instruction::ICmp:
237 ImmIdx = 1;
238 break;
239 // Always return TCC_Free for the shift value of a shift instruction.
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 if (Idx == 1)
244 return TTI::TCC_Free;
245 break;
246 case Instruction::Trunc:
247 case Instruction::ZExt:
248 case Instruction::SExt:
249 case Instruction::IntToPtr:
250 case Instruction::PtrToInt:
251 case Instruction::BitCast:
252 case Instruction::PHI:
253 case Instruction::Call:
254 case Instruction::Select:
255 case Instruction::Ret:
256 case Instruction::Load:
257 break;
258 }
259
260 if (Idx == ImmIdx) {
261 int NumConstants = (BitSize + 63) / 64;
262 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
263 return (Cost <= NumConstants * TTI::TCC_Basic)
264 ? static_cast<int>(TTI::TCC_Free)
265 : Cost;
266 }
267 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
268}
269
270InstructionCost
271AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
272 const APInt &Imm, Type *Ty,
273 TTI::TargetCostKind CostKind) {
274 assert(Ty->isIntegerTy());
275
276 unsigned BitSize = Ty->getPrimitiveSizeInBits();
277 // There is no cost model for constants with a bit size of 0. Return TCC_Free
278 // here, so that constant hoisting will ignore this constant.
279 if (BitSize == 0)
280 return TTI::TCC_Free;
281
282 // Most (all?) AArch64 intrinsics do not support folding immediates into the
283 // selected instruction, so we compute the materialization cost for the
284 // immediate directly.
285 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
286 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
287
288 switch (IID) {
289 default:
290 return TTI::TCC_Free;
291 case Intrinsic::sadd_with_overflow:
292 case Intrinsic::uadd_with_overflow:
293 case Intrinsic::ssub_with_overflow:
294 case Intrinsic::usub_with_overflow:
295 case Intrinsic::smul_with_overflow:
296 case Intrinsic::umul_with_overflow:
297 if (Idx == 1) {
298 int NumConstants = (BitSize + 63) / 64;
299 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
300 return (Cost <= NumConstants * TTI::TCC_Basic)
301 ? static_cast<int>(TTI::TCC_Free)
302 : Cost;
303 }
304 break;
305 case Intrinsic::experimental_stackmap:
306 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
307 return TTI::TCC_Free;
308 break;
309 case Intrinsic::experimental_patchpoint_void:
310 case Intrinsic::experimental_patchpoint_i64:
311 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
312 return TTI::TCC_Free;
313 break;
314 case Intrinsic::experimental_gc_statepoint:
315 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
316 return TTI::TCC_Free;
317 break;
318 }
319 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
320}
321
322TargetTransformInfo::PopcntSupportKind
323AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
324 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
325 if (TyWidth == 32 || TyWidth == 64)
326 return TTI::PSK_FastHardware;
327 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
328 return TTI::PSK_Software;
329}
330
331InstructionCost
332AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
333 TTI::TargetCostKind CostKind) {
334 auto *RetTy = ICA.getReturnType();
335 switch (ICA.getID()) {
336 case Intrinsic::umin:
337 case Intrinsic::umax:
338 case Intrinsic::smin:
339 case Intrinsic::smax: {
340 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
341 MVT::v8i16, MVT::v2i32, MVT::v4i32};
342 auto LT = getTypeLegalizationCost(RetTy);
343 // v2i64 types get converted to cmp+bif hence the cost of 2
344 if (LT.second == MVT::v2i64)
345 return LT.first * 2;
346 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
347 return LT.first;
348 break;
349 }
350 case Intrinsic::sadd_sat:
351 case Intrinsic::ssub_sat:
352 case Intrinsic::uadd_sat:
353 case Intrinsic::usub_sat: {
354 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
355 MVT::v8i16, MVT::v2i32, MVT::v4i32,
356 MVT::v2i64};
357 auto LT = getTypeLegalizationCost(RetTy);
358 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
359 // need to extend the type, as it uses shr(qadd(shl, shl)).
360 unsigned Instrs =
361 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
362 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
363 return LT.first * Instrs;
364 break;
365 }
366 case Intrinsic::abs: {
367 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
368 MVT::v8i16, MVT::v2i32, MVT::v4i32,
369 MVT::v2i64};
370 auto LT = getTypeLegalizationCost(RetTy);
371 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
372 return LT.first;
373 break;
374 }
375 case Intrinsic::experimental_stepvector: {
376 InstructionCost Cost = 1; // Cost of the `index' instruction
377 auto LT = getTypeLegalizationCost(RetTy);
378 // Legalisation of illegal vectors involves an `index' instruction plus
379 // (LT.first - 1) vector adds.
380 if (LT.first > 1) {
381 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
382 InstructionCost AddCost =
383 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
384 Cost += AddCost * (LT.first - 1);
385 }
386 return Cost;
387 }
388 case Intrinsic::bitreverse: {
389 static const CostTblEntry BitreverseTbl[] = {
390 {Intrinsic::bitreverse, MVT::i32, 1},
391 {Intrinsic::bitreverse, MVT::i64, 1},
392 {Intrinsic::bitreverse, MVT::v8i8, 1},
393 {Intrinsic::bitreverse, MVT::v16i8, 1},
394 {Intrinsic::bitreverse, MVT::v4i16, 2},
395 {Intrinsic::bitreverse, MVT::v8i16, 2},
396 {Intrinsic::bitreverse, MVT::v2i32, 2},
397 {Intrinsic::bitreverse, MVT::v4i32, 2},
398 {Intrinsic::bitreverse, MVT::v1i64, 2},
399 {Intrinsic::bitreverse, MVT::v2i64, 2},
400 };
401 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
402 const auto *Entry =
403 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
404 if (Entry) {
405 // The cost model uses the legal type (i32) that i8 and i16 will be
406 // promoted to, plus 1 so that we match the actual lowering cost.
407 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
408 TLI->getValueType(DL, RetTy, true) == MVT::i16)
409 return LegalisationCost.first * Entry->Cost + 1;
410
411 return LegalisationCost.first * Entry->Cost;
412 }
413 break;
414 }
415 case Intrinsic::ctpop: {
416 if (!ST->hasNEON()) {
417 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
418 return getTypeLegalizationCost(RetTy).first * 12;
419 }
420 static const CostTblEntry CtpopCostTbl[] = {
421 {ISD::CTPOP, MVT::v2i64, 4},
422 {ISD::CTPOP, MVT::v4i32, 3},
423 {ISD::CTPOP, MVT::v8i16, 2},
424 {ISD::CTPOP, MVT::v16i8, 1},
425 {ISD::CTPOP, MVT::i64, 4},
426 {ISD::CTPOP, MVT::v2i32, 3},
427 {ISD::CTPOP, MVT::v4i16, 2},
428 {ISD::CTPOP, MVT::v8i8, 1},
429 {ISD::CTPOP, MVT::i32, 5},
430 };
431 auto LT = getTypeLegalizationCost(RetTy);
432 MVT MTy = LT.second;
433 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
434 // Extra cost of +1 when illegal vector types are legalized by promoting
435 // the integer type.
436 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
437 RetTy->getScalarSizeInBits()
438 ? 1
439 : 0;
440 return LT.first * Entry->Cost + ExtraCost;
441 }
442 break;
443 }
444 case Intrinsic::sadd_with_overflow:
445 case Intrinsic::uadd_with_overflow:
446 case Intrinsic::ssub_with_overflow:
447 case Intrinsic::usub_with_overflow:
448 case Intrinsic::smul_with_overflow:
449 case Intrinsic::umul_with_overflow: {
450 static const CostTblEntry WithOverflowCostTbl[] = {
451 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
452 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
453 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
454 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
455 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
456 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
457 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
458 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
459 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
460 {Intrinsic::usub_with_overflow, MVT::i8, 3},
461 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
462 {Intrinsic::usub_with_overflow, MVT::i16, 3},
463 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
464 {Intrinsic::usub_with_overflow, MVT::i32, 1},
465 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
466 {Intrinsic::usub_with_overflow, MVT::i64, 1},
467 {Intrinsic::smul_with_overflow, MVT::i8, 5},
468 {Intrinsic::umul_with_overflow, MVT::i8, 4},
469 {Intrinsic::smul_with_overflow, MVT::i16, 5},
470 {Intrinsic::umul_with_overflow, MVT::i16, 4},
471 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
472 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
473 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
474 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
475 };
476 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
477 if (MTy.isSimple())
478 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
479 MTy.getSimpleVT()))
480 return Entry->Cost;
481 break;
482 }
483 case Intrinsic::fptosi_sat:
484 case Intrinsic::fptoui_sat: {
485 if (ICA.getArgTypes().empty())
486 break;
487 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
488 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
489 EVT MTy = TLI->getValueType(DL, RetTy);
490 // Check for the legal types, which are where the size of the input and the
491 // output are the same, or we are using cvt f64->i32 or f32->i64.
492 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
493 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
494 LT.second == MVT::v2f64) &&
495 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
496 (LT.second == MVT::f64 && MTy == MVT::i32) ||
497 (LT.second == MVT::f32 && MTy == MVT::i64)))
498 return LT.first;
499 // Similarly for fp16 sizes
500 if (ST->hasFullFP16() &&
501 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
502 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
503 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
504 return LT.first;
505
506 // Otherwise we use a legal convert followed by a min+max
507 if ((LT.second.getScalarType() == MVT::f32 ||
508 LT.second.getScalarType() == MVT::f64 ||
509 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
510 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
511 Type *LegalTy =
512 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
513 if (LT.second.isVector())
514 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
515 InstructionCost Cost = 1;
516 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
517 LegalTy, {LegalTy, LegalTy});
518 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
519 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
520 LegalTy, {LegalTy, LegalTy});
521 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
522 return LT.first * Cost;
523 }
524 break;
525 }
526 default:
527 break;
528 }
529 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
530}
531
532/// The function removes redundant reinterpret casts in the presence of
533/// control flow.
534static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
535 IntrinsicInst &II) {
536 SmallVector<Instruction *, 32> Worklist;
537 auto RequiredType = II.getType();
538
539 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
540 assert(PN && "Expected Phi Node!");
541
542 // Don't create a new Phi unless we can remove the old one.
543 if (!PN->hasOneUse())
544 return std::nullopt;
545
546 for (Value *IncValPhi : PN->incoming_values()) {
547 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
548 if (!Reinterpret ||
549 Reinterpret->getIntrinsicID() !=
550 Intrinsic::aarch64_sve_convert_to_svbool ||
551 RequiredType != Reinterpret->getArgOperand(0)->getType())
552 return std::nullopt;
553 }
554
555 // Create the new Phi
556 LLVMContext &Ctx = PN->getContext();
557 IRBuilder<> Builder(Ctx);
558 Builder.SetInsertPoint(PN);
559 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
560 Worklist.push_back(PN);
561
562 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
563 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
564 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
565 Worklist.push_back(Reinterpret);
566 }
567
568 // Cleanup Phi Node and reinterprets
569 return IC.replaceInstUsesWith(II, NPN);
570}
571
572// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
573// => (binop (pred) (from_svbool _) (from_svbool _))
574//
575// The above transformation eliminates a `to_svbool` in the predicate
576// operand of bitwise operation `binop` by narrowing the vector width of
577// the operation. For example, it would convert a `<vscale x 16 x i1>
578// and` into a `<vscale x 4 x i1> and`. This is profitable because
579// to_svbool must zero the new lanes during widening, whereas
580// from_svbool is free.
581static std::optional<Instruction *>
582tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
583 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
584 if (!BinOp)
585 return std::nullopt;
586
587 auto IntrinsicID = BinOp->getIntrinsicID();
588 switch (IntrinsicID) {
589 case Intrinsic::aarch64_sve_and_z:
590 case Intrinsic::aarch64_sve_bic_z:
591 case Intrinsic::aarch64_sve_eor_z:
592 case Intrinsic::aarch64_sve_nand_z:
593 case Intrinsic::aarch64_sve_nor_z:
594 case Intrinsic::aarch64_sve_orn_z:
595 case Intrinsic::aarch64_sve_orr_z:
596 break;
597 default:
598 return std::nullopt;
599 }
600
601 auto BinOpPred = BinOp->getOperand(0);
602 auto BinOpOp1 = BinOp->getOperand(1);
603 auto BinOpOp2 = BinOp->getOperand(2);
604
605 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
606 if (!PredIntr ||
607 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
608 return std::nullopt;
609
610 auto PredOp = PredIntr->getOperand(0);
611 auto PredOpTy = cast<VectorType>(PredOp->getType());
612 if (PredOpTy != II.getType())
613 return std::nullopt;
614
615 IRBuilder<> Builder(II.getContext());
616 Builder.SetInsertPoint(&II);
617
618 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
619 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
620 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
621 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
622 if (BinOpOp1 == BinOpOp2)
623 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
624 else
625 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
626 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
627
628 auto NarrowedBinOp =
629 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
630 return IC.replaceInstUsesWith(II, NarrowedBinOp);
631}
632
633static std::optional<Instruction *>
634instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
635 // If the reinterpret instruction operand is a PHI Node
636 if (isa<PHINode>(II.getArgOperand(0)))
637 return processPhiNode(IC, II);
638
639 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
640 return BinOpCombine;
641
642 SmallVector<Instruction *, 32> CandidatesForRemoval;
643 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
644
645 const auto *IVTy = cast<VectorType>(II.getType());
646
647 // Walk the chain of conversions.
648 while (Cursor) {
649 // If the type of the cursor has fewer lanes than the final result, zeroing
650 // must take place, which breaks the equivalence chain.
651 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
652 if (CursorVTy->getElementCount().getKnownMinValue() <
653 IVTy->getElementCount().getKnownMinValue())
654 break;
655
656 // If the cursor has the same type as I, it is a viable replacement.
657 if (Cursor->getType() == IVTy)
658 EarliestReplacement = Cursor;
659
660 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
661
662 // If this is not an SVE conversion intrinsic, this is the end of the chain.
663 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
664 Intrinsic::aarch64_sve_convert_to_svbool ||
665 IntrinsicCursor->getIntrinsicID() ==
666 Intrinsic::aarch64_sve_convert_from_svbool))
667 break;
668
669 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
670 Cursor = IntrinsicCursor->getOperand(0);
671 }
672
673 // If no viable replacement in the conversion chain was found, there is
674 // nothing to do.
675 if (!EarliestReplacement)
676 return std::nullopt;
677
678 return IC.replaceInstUsesWith(II, EarliestReplacement);
679}
680
681static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
682 IntrinsicInst &II) {
683 IRBuilder<> Builder(&II);
684 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
685 II.getOperand(2));
686 return IC.replaceInstUsesWith(II, Select);
687}
688
689static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
690 IntrinsicInst &II) {
691 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
692 if (!Pg)
693 return std::nullopt;
694
695 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
696 return std::nullopt;
697
698 const auto PTruePattern =
699 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
700 if (PTruePattern != AArch64SVEPredPattern::vl1)
701 return std::nullopt;
702
703 // The intrinsic is inserting into lane zero so use an insert instead.
704 auto *IdxTy = Type::getInt64Ty(II.getContext());
705 auto *Insert = InsertElementInst::Create(
706 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
707 Insert->insertBefore(&II);
708 Insert->takeName(&II);
709
710 return IC.replaceInstUsesWith(II, Insert);
711}
712
713static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
714 IntrinsicInst &II) {
715 // Replace DupX with a regular IR splat.
716 IRBuilder<> Builder(II.getContext());
717 Builder.SetInsertPoint(&II);
718 auto *RetTy = cast<ScalableVectorType>(II.getType());
719 Value *Splat =
720 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
721 Splat->takeName(&II);
722 return IC.replaceInstUsesWith(II, Splat);
723}
724
725static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
726 IntrinsicInst &II) {
727 LLVMContext &Ctx = II.getContext();
728 IRBuilder<> Builder(Ctx);
729 Builder.SetInsertPoint(&II);
730
731 // Check that the predicate is all active
732 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
733 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
734 return std::nullopt;
735
736 const auto PTruePattern =
737 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
738 if (PTruePattern != AArch64SVEPredPattern::all)
739 return std::nullopt;
740
741 // Check that we have a compare of zero..
742 auto *SplatValue =
743 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
744 if (!SplatValue || !SplatValue->isZero())
745 return std::nullopt;
746
747 // ..against a dupq
748 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
749 if (!DupQLane ||
750 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
751 return std::nullopt;
752
753 // Where the dupq is a lane 0 replicate of a vector insert
754 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
755 return std::nullopt;
756
757 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
758 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
759 return std::nullopt;
760
761 // Where the vector insert is a fixed constant vector insert into undef at
762 // index zero
763 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
764 return std::nullopt;
765
766 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
767 return std::nullopt;
768
769 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
770 if (!ConstVec)
771 return std::nullopt;
772
773 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
774 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
775 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
776 return std::nullopt;
777
778 unsigned NumElts = VecTy->getNumElements();
779 unsigned PredicateBits = 0;
780
781 // Expand intrinsic operands to a 16-bit byte level predicate
782 for (unsigned I = 0; I < NumElts; ++I) {
783 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
784 if (!Arg)
785 return std::nullopt;
786 if (!Arg->isZero())
787 PredicateBits |= 1 << (I * (16 / NumElts));
788 }
789
790 // If all bits are zero bail early with an empty predicate
791 if (PredicateBits == 0) {
792 auto *PFalse = Constant::getNullValue(II.getType());
793 PFalse->takeName(&II);
794 return IC.replaceInstUsesWith(II, PFalse);
795 }
796
797 // Calculate largest predicate type used (where byte predicate is largest)
798 unsigned Mask = 8;
799 for (unsigned I = 0; I < 16; ++I)
800 if ((PredicateBits & (1 << I)) != 0)
801 Mask |= (I % 8);
802
803 unsigned PredSize = Mask & -Mask;
804 auto *PredType = ScalableVectorType::get(
805 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
806
807 // Ensure all relevant bits are set
808 for (unsigned I = 0; I < 16; I += PredSize)
809 if ((PredicateBits & (1 << I)) == 0)
810 return std::nullopt;
811
812 auto *PTruePat =
813 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
814 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
815 {PredType}, {PTruePat});
816 auto *ConvertToSVBool = Builder.CreateIntrinsic(
817 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
818 auto *ConvertFromSVBool =
819 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
820 {II.getType()}, {ConvertToSVBool});
821
822 ConvertFromSVBool->takeName(&II);
823 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
824}
825
826static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
827 IntrinsicInst &II) {
828 IRBuilder<> Builder(II.getContext());
829 Builder.SetInsertPoint(&II);
830 Value *Pg = II.getArgOperand(0);
831 Value *Vec = II.getArgOperand(1);
832 auto IntrinsicID = II.getIntrinsicID();
833 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
834
835 // lastX(splat(X)) --> X
836 if (auto *SplatVal = getSplatValue(Vec))
837 return IC.replaceInstUsesWith(II, SplatVal);
838
839 // If x and/or y is a splat value then:
840 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
841 Value *LHS, *RHS;
842 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
843 if (isSplatValue(LHS) || isSplatValue(RHS)) {
844 auto *OldBinOp = cast<BinaryOperator>(Vec);
845 auto OpC = OldBinOp->getOpcode();
846 auto *NewLHS =
847 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
848 auto *NewRHS =
849 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
850 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
851 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
852 return IC.replaceInstUsesWith(II, NewBinOp);
853 }
854 }
855
856 auto *C = dyn_cast<Constant>(Pg);
857 if (IsAfter && C && C->isNullValue()) {
858 // The intrinsic is extracting lane 0 so use an extract instead.
859 auto *IdxTy = Type::getInt64Ty(II.getContext());
860 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
861 Extract->insertBefore(&II);
862 Extract->takeName(&II);
863 return IC.replaceInstUsesWith(II, Extract);
864 }
865
866 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
867 if (!IntrPG)
868 return std::nullopt;
869
870 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
871 return std::nullopt;
872
873 const auto PTruePattern =
874 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
875
876 // Can the intrinsic's predicate be converted to a known constant index?
877 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
878 if (!MinNumElts)
879 return std::nullopt;
880
881 unsigned Idx = MinNumElts - 1;
882 // Increment the index if extracting the element after the last active
883 // predicate element.
884 if (IsAfter)
885 ++Idx;
886
887 // Ignore extracts whose index is larger than the known minimum vector
888 // length. NOTE: This is an artificial constraint where we prefer to
889 // maintain what the user asked for until an alternative is proven faster.
890 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
891 if (Idx >= PgVTy->getMinNumElements())
892 return std::nullopt;
893
894 // The intrinsic is extracting a fixed lane so use an extract instead.
895 auto *IdxTy = Type::getInt64Ty(II.getContext());
896 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
897 Extract->insertBefore(&II);
898 Extract->takeName(&II);
899 return IC.replaceInstUsesWith(II, Extract);
900}
901
902static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
903 IntrinsicInst &II) {
904 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
905 // integer variant across a variety of micro-architectures. Replace scalar
906 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
907 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
908 // depending on the micro-architecture, but has been observed as generally
909 // being faster, particularly when the CLAST[AB] op is a loop-carried
910 // dependency.
911 IRBuilder<> Builder(II.getContext());
912 Builder.SetInsertPoint(&II);
913 Value *Pg = II.getArgOperand(0);
914 Value *Fallback = II.getArgOperand(1);
915 Value *Vec = II.getArgOperand(2);
916 Type *Ty = II.getType();
917
918 if (!Ty->isIntegerTy())
919 return std::nullopt;
920
921 Type *FPTy;
922 switch (cast<IntegerType>(Ty)->getBitWidth()) {
923 default:
924 return std::nullopt;
925 case 16:
926 FPTy = Builder.getHalfTy();
927 break;
928 case 32:
929 FPTy = Builder.getFloatTy();
930 break;
931 case 64:
932 FPTy = Builder.getDoubleTy();
933 break;
934 }
935
936 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
937 auto *FPVTy = VectorType::get(
938 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
939 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
940 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
941 {Pg, FPFallBack, FPVec});
942 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
943 return IC.replaceInstUsesWith(II, FPIItoInt);
944}
945
946static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
947 IntrinsicInst &II) {
948 LLVMContext &Ctx = II.getContext();
949 IRBuilder<> Builder(Ctx);
950 Builder.SetInsertPoint(&II);
951 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
952 // can work with RDFFR_PP for ptest elimination.
953 auto *AllPat =
954 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
955 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
956 {II.getType()}, {AllPat});
957 auto *RDFFR =
958 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
959 RDFFR->takeName(&II);
960 return IC.replaceInstUsesWith(II, RDFFR);
961}
962
963static std::optional<Instruction *>
964instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
965 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
966
967 if (Pattern == AArch64SVEPredPattern::all) {
968 LLVMContext &Ctx = II.getContext();
969 IRBuilder<> Builder(Ctx);
970 Builder.SetInsertPoint(&II);
971
972 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
973 auto *VScale = Builder.CreateVScale(StepVal);
974 VScale->takeName(&II);
975 return IC.replaceInstUsesWith(II, VScale);
976 }
977
978 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
979
980 return MinNumElts && NumElts >= MinNumElts
981 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
982 II, ConstantInt::get(II.getType(), MinNumElts)))
983 : std::nullopt;
984}
985
986static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
987 IntrinsicInst &II) {
988 Value *PgVal = II.getArgOperand(0);
989 Value *OpVal = II.getArgOperand(1);
990
991 IRBuilder<> Builder(II.getContext());
992 Builder.SetInsertPoint(&II);
993
994 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
995 // Later optimizations prefer this form.
996 if (PgVal == OpVal &&
997 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
998 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
999 Value *Ops[] = {PgVal, OpVal};
1000 Type *Tys[] = {PgVal->getType()};
1001
1002 auto *PTest =
1003 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1004 PTest->takeName(&II);
1005
1006 return IC.replaceInstUsesWith(II, PTest);
1007 }
1008
1009 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1010 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1011
1012 if (!Pg || !Op)
1013 return std::nullopt;
1014
1015 Intrinsic::ID OpIID = Op->getIntrinsicID();
1016
1017 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1018 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1019 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1020 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1021 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1022
1023 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1024
1025 PTest->takeName(&II);
1026 return IC.replaceInstUsesWith(II, PTest);
1027 }
1028
1029 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1030 // Later optimizations may rewrite sequence to use the flag-setting variant
1031 // of instruction X to remove PTEST.
1032 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1033 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1034 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1035 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1036 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1037 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1038 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1039 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1040 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1041 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1042 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1043 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1044 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1045 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1046 Type *Tys[] = {Pg->getType()};
1047
1048 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1049 PTest->takeName(&II);
1050
1051 return IC.replaceInstUsesWith(II, PTest);
1052 }
1053
1054 return std::nullopt;
1055}
1056
1057template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1058static std::optional<Instruction *>
1059instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1060 bool MergeIntoAddendOp) {
1061 Value *P = II.getOperand(0);
1062 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1063 if (MergeIntoAddendOp) {
1064 AddendOp = II.getOperand(1);
1065 Mul = II.getOperand(2);
1066 } else {
1067 AddendOp = II.getOperand(2);
1068 Mul = II.getOperand(1);
1069 }
1070
1071 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1072 m_Value(MulOp1))))
1073 return std::nullopt;
1074
1075 if (!Mul->hasOneUse())
1076 return std::nullopt;
1077
1078 Instruction *FMFSource = nullptr;
1079 if (II.getType()->isFPOrFPVectorTy()) {
1080 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1081 // Stop the combine when the flags on the inputs differ in case dropping
1082 // flags would lead to us missing out on more beneficial optimizations.
1083 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1084 return std::nullopt;
1085 if (!FAddFlags.allowContract())
1086 return std::nullopt;
1087 FMFSource = &II;
1088 }
1089
1090 IRBuilder<> Builder(II.getContext());
1091 Builder.SetInsertPoint(&II);
1092
1093 CallInst *Res;
1094 if (MergeIntoAddendOp)
1095 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1096 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1097 else
1098 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1099 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1100
1101 return IC.replaceInstUsesWith(II, Res);
1102}
1103
1104static bool isAllActivePredicate(Value *Pred) {
1105 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1106 Value *UncastedPred;
1107 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1108 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1109 m_Value(UncastedPred)))))
1110 // If the predicate has the same or less lanes than the uncasted
1111 // predicate then we know the casting has no effect.
1112 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1113 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1114 Pred = UncastedPred;
1115
1116 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1117 m_ConstantInt<AArch64SVEPredPattern::all>()));
1118}
1119
1120static std::optional<Instruction *>
1121instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1122 IRBuilder<> Builder(II.getContext());
1123 Builder.SetInsertPoint(&II);
1124
1125 Value *Pred = II.getOperand(0);
1126 Value *PtrOp = II.getOperand(1);
1127 Type *VecTy = II.getType();
1128 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1129
1130 if (isAllActivePredicate(Pred)) {
1131 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1132 Load->copyMetadata(II);
1133 return IC.replaceInstUsesWith(II, Load);
1134 }
1135
1136 CallInst *MaskedLoad =
1137 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1138 Pred, ConstantAggregateZero::get(VecTy));
1139 MaskedLoad->copyMetadata(II);
1140 return IC.replaceInstUsesWith(II, MaskedLoad);
1141}
1142
1143static std::optional<Instruction *>
1144instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1145 IRBuilder<> Builder(II.getContext());
1146 Builder.SetInsertPoint(&II);
1147
1148 Value *VecOp = II.getOperand(0);
1149 Value *Pred = II.getOperand(1);
1150 Value *PtrOp = II.getOperand(2);
1151 Value *VecPtr =
1152 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1153
1154 if (isAllActivePredicate(Pred)) {
1155 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1156 Store->copyMetadata(II);
1157 return IC.eraseInstFromFunction(II);
1158 }
1159
1160 CallInst *MaskedStore = Builder.CreateMaskedStore(
1161 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1162 MaskedStore->copyMetadata(II);
1163 return IC.eraseInstFromFunction(II);
1164}
1165
1166static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1167 switch (Intrinsic) {
1168 case Intrinsic::aarch64_sve_fmul:
1169 return Instruction::BinaryOps::FMul;
1170 case Intrinsic::aarch64_sve_fadd:
1171 return Instruction::BinaryOps::FAdd;
1172 case Intrinsic::aarch64_sve_fsub:
1173 return Instruction::BinaryOps::FSub;
1174 default:
1175 return Instruction::BinaryOpsEnd;
1176 }
1177}
1178
1179static std::optional<Instruction *>
1180instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1181 auto *OpPredicate = II.getOperand(0);
1182 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1183 if (BinOpCode == Instruction::BinaryOpsEnd ||
1184 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1185 m_ConstantInt<AArch64SVEPredPattern::all>())))
1186 return std::nullopt;
1187 IRBuilder<> Builder(II.getContext());
1188 Builder.SetInsertPoint(&II);
1189 Builder.setFastMathFlags(II.getFastMathFlags());
1190 auto BinOp =
1191 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1192 return IC.replaceInstUsesWith(II, BinOp);
1193}
1194
1195static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1196 IntrinsicInst &II) {
1197 if (auto FMLA =
1198 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1199 Intrinsic::aarch64_sve_fmla>(IC, II,
1200 true))
1201 return FMLA;
1202 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1203 Intrinsic::aarch64_sve_mla>(
1204 IC, II, true))
1205 return MLA;
1206 if (auto FMAD =
1207 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1208 Intrinsic::aarch64_sve_fmad>(IC, II,
1209 false))
1210 return FMAD;
1211 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1212 Intrinsic::aarch64_sve_mad>(
1213 IC, II, false))
1214 return MAD;
1215 return instCombineSVEVectorBinOp(IC, II);
1216}
1217
1218static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1219 IntrinsicInst &II) {
1220 if (auto FMLS =
1221 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1222 Intrinsic::aarch64_sve_fmls>(IC, II,
1223 true))
1224 return FMLS;
1225 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1226 Intrinsic::aarch64_sve_mls>(
1227 IC, II, true))
1228 return MLS;
1229 if (auto FMSB =
1230 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1231 Intrinsic::aarch64_sve_fnmsb>(
1232 IC, II, false))
1233 return FMSB;
1234 return instCombineSVEVectorBinOp(IC, II);
1235}
1236
1237static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1238 IntrinsicInst &II) {
1239 auto *OpPredicate = II.getOperand(0);
1240 auto *OpMultiplicand = II.getOperand(1);
1241 auto *OpMultiplier = II.getOperand(2);
1242
1243 IRBuilder<> Builder(II.getContext());
1244 Builder.SetInsertPoint(&II);
1245
1246 // Return true if a given instruction is a unit splat value, false otherwise.
1247 auto IsUnitSplat = [](auto *I) {
1248 auto *SplatValue = getSplatValue(I);
1249 if (!SplatValue)
1250 return false;
1251 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1252 };
1253
1254 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1255 // with a unit splat value, false otherwise.
1256 auto IsUnitDup = [](auto *I) {
1257 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1258 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1259 return false;
1260
1261 auto *SplatValue = IntrI->getOperand(2);
1262 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1263 };
1264
1265 if (IsUnitSplat(OpMultiplier)) {
1266 // [f]mul pg %n, (dupx 1) => %n
1267 OpMultiplicand->takeName(&II);
1268 return IC.replaceInstUsesWith(II, OpMultiplicand);
1269 } else if (IsUnitDup(OpMultiplier)) {
1270 // [f]mul pg %n, (dup pg 1) => %n
1271 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1272 auto *DupPg = DupInst->getOperand(1);
1273 // TODO: this is naive. The optimization is still valid if DupPg
1274 // 'encompasses' OpPredicate, not only if they're the same predicate.
1275 if (OpPredicate == DupPg) {
1276 OpMultiplicand->takeName(&II);
1277 return IC.replaceInstUsesWith(II, OpMultiplicand);
1278 }
1279 }
1280
1281 return instCombineSVEVectorBinOp(IC, II);
1282}
1283
1284static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1285 IntrinsicInst &II) {
1286 IRBuilder<> Builder(II.getContext());
1287 Builder.SetInsertPoint(&II);
1288 Value *UnpackArg = II.getArgOperand(0);
1289 auto *RetTy = cast<ScalableVectorType>(II.getType());
1290 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1291 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1292
1293 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1294 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1295 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1296 ScalarArg =
1297 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1298 Value *NewVal =
1299 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1300 NewVal->takeName(&II);
1301 return IC.replaceInstUsesWith(II, NewVal);
1302 }
1303
1304 return std::nullopt;
1305}
1306static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1307 IntrinsicInst &II) {
1308 auto *OpVal = II.getOperand(0);
1309 auto *OpIndices = II.getOperand(1);
1310 VectorType *VTy = cast<VectorType>(II.getType());
1311
1312 // Check whether OpIndices is a constant splat value < minimal element count
1313 // of result.
1314 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1315 if (!SplatValue ||
1316 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1317 return std::nullopt;
1318
1319 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1320 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1321 IRBuilder<> Builder(II.getContext());
1322 Builder.SetInsertPoint(&II);
1323 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1324 auto *VectorSplat =
1325 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1326
1327 VectorSplat->takeName(&II);
1328 return IC.replaceInstUsesWith(II, VectorSplat);
1329}
1330
1331static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1332 IntrinsicInst &II) {
1333 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1334 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1335 Value *A, *B;
1336 if (match(II.getArgOperand(0),
1337 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1338 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1339 m_Specific(A), m_Specific(B))))
1340 return IC.replaceInstUsesWith(
1341 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1342
1343 return std::nullopt;
1344}
1345
1346static std::optional<Instruction *>
1347instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1348 Value *Mask = II.getOperand(0);
1349 Value *BasePtr = II.getOperand(1);
1350 Value *Index = II.getOperand(2);
1351 Type *Ty = II.getType();
1352 Value *PassThru = ConstantAggregateZero::get(Ty);
1353
1354 // Contiguous gather => masked load.
1355 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1356 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1357 Value *IndexBase;
1358 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1359 m_Value(IndexBase), m_SpecificInt(1)))) {
1360 IRBuilder<> Builder(II.getContext());
1361 Builder.SetInsertPoint(&II);
1362
1363 Align Alignment =
1364 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1365
1366 Type *VecPtrTy = PointerType::getUnqual(Ty);
1367 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1368 BasePtr, IndexBase);
1369 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1370 CallInst *MaskedLoad =
1371 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1372 MaskedLoad->takeName(&II);
1373 return IC.replaceInstUsesWith(II, MaskedLoad);
1374 }
1375
1376 return std::nullopt;
1377}
1378
1379static std::optional<Instruction *>
1380instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1381 Value *Val = II.getOperand(0);
1382 Value *Mask = II.getOperand(1);
1383 Value *BasePtr = II.getOperand(2);
1384 Value *Index = II.getOperand(3);
1385 Type *Ty = Val->getType();
1386
1387 // Contiguous scatter => masked store.
1388 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1389 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1390 Value *IndexBase;
1391 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1392 m_Value(IndexBase), m_SpecificInt(1)))) {
1393 IRBuilder<> Builder(II.getContext());
1394 Builder.SetInsertPoint(&II);
1395
1396 Align Alignment =
1397 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1398
1399 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1400 BasePtr, IndexBase);
1401 Type *VecPtrTy = PointerType::getUnqual(Ty);
1402 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1403
1404 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1405
1406 return IC.eraseInstFromFunction(II);
1407 }
1408
1409 return std::nullopt;
1410}
1411
1412static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1413 IntrinsicInst &II) {
1414 IRBuilder<> Builder(II.getContext());
1415 Builder.SetInsertPoint(&II);
1416 Type *Int32Ty = Builder.getInt32Ty();
1417 Value *Pred = II.getOperand(0);
1418 Value *Vec = II.getOperand(1);
1419 Value *DivVec = II.getOperand(2);
1420
1421 Value *SplatValue = getSplatValue(DivVec);
1422 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1423 if (!SplatConstantInt)
1424 return std::nullopt;
1425 APInt Divisor = SplatConstantInt->getValue();
1426
1427 if (Divisor.isPowerOf2()) {
1428 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1429 auto ASRD = Builder.CreateIntrinsic(
1430 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1431 return IC.replaceInstUsesWith(II, ASRD);
1432 }
1433 if (Divisor.isNegatedPowerOf2()) {
1434 Divisor.negate();
1435 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1436 auto ASRD = Builder.CreateIntrinsic(
1437 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1438 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1439 {ASRD->getType()}, {ASRD, Pred, ASRD});
1440 return IC.replaceInstUsesWith(II, NEG);
1441 }
1442
1443 return std::nullopt;
1444}
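
A small hedged sketch of the exponent computation used above (hypothetical helper): a divisor of +/-2^k is handled as asrd by k, with a follow-up neg for the negative case.

  #include "llvm/ADT/APInt.h"

  // Returns k for a divisor of +/-(1 << k); the negative case additionally
  // needs the sve.neg emitted above.
  static unsigned asrdShiftAmount(llvm::APInt Divisor) {
    if (Divisor.isNegatedPowerOf2())
      Divisor.negate();           // e.g. -16 -> 16
    return Divisor.logBase2();    // e.g. 16 -> 4
  }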
1445
1446bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1447 size_t VecSize = Vec.size();
1448 if (VecSize == 1)
1449 return true;
1450 if (!isPowerOf2_64(VecSize))
1451 return false;
1452 size_t HalfVecSize = VecSize / 2;
1453
1454 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1455 RHS != Vec.end(); LHS++, RHS++) {
1456 if (*LHS != nullptr && *RHS != nullptr) {
1457 if (*LHS == *RHS)
1458 continue;
1459 else
1460 return false;
1461 }
1462 if (!AllowPoison)
1463 return false;
1464 if (*LHS == nullptr && *RHS != nullptr)
1465 *LHS = *RHS;
1466 }
1467
1468 Vec.resize(HalfVecSize);
1469 SimplifyValuePattern(Vec, AllowPoison);
1470 return true;
1471}
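
An illustrative standalone analogue of SimplifyValuePattern, using ints instead of Value pointers and ignoring the poison handling; it is only meant to show the repeated-halving idea ({a, b, a, b} collapses to {a, b}).

  #include <cstddef>
  #include <vector>

  static bool halveIfRepeated(std::vector<int> &V) {
    size_t N = V.size();
    if (N == 1)
      return true;
    if (N == 0 || (N & (N - 1)) != 0)   // require a power-of-two length
      return false;
    size_t Half = N / 2;
    for (size_t I = 0; I != Half; ++I)
      if (V[I] != V[I + Half])
        return false;
    V.resize(Half);
    halveIfRepeated(V);                 // keep halving, as the code above does
    return true;
  }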
1472
1473// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1474// to dupqlane(f64(C)) where C is A concatenated with B
1475static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1476 IntrinsicInst &II) {
1477 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1478 if (!match(II.getOperand(0),
1479 m_Intrinsic<Intrinsic::vector_insert>(
1480 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1481 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1482 return std::nullopt;
1483 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1484
1485 // Insert the scalars into a container ordered by InsertElement index
1486 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1487 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1488 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1489 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1490 CurrentInsertElt = InsertElt->getOperand(0);
1491 }
1492
1493 bool AllowPoison =
1494 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1495 if (!SimplifyValuePattern(Elts, AllowPoison))
1496 return std::nullopt;
1497
1498 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1499 IRBuilder<> Builder(II.getContext());
1500 Builder.SetInsertPoint(&II);
1501 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1502 for (size_t I = 0; I < Elts.size(); I++) {
1503 if (Elts[I] == nullptr)
1504 continue;
1505 InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
1506 Builder.getInt64(I));
1507 }
1508 if (InsertEltChain == nullptr)
1509 return std::nullopt;
1510
1511 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1512 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1513 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1514 // be narrowed back to the original type.
1515 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1516 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1517 IIScalableTy->getMinNumElements() /
1518 PatternWidth;
1519
1520 IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
1521 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1522 auto *WideShuffleMaskTy =
1523 ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
1524
1525 auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
1526 auto InsertSubvector = Builder.CreateInsertVector(
1527 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1528 auto WideBitcast =
1529 Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1530 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1531 auto WideShuffle = Builder.CreateShuffleVector(
1532 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1533 auto NarrowBitcast =
1534 Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1535
1536 return IC.replaceInstUsesWith(II, NarrowBitcast);
1537}
1538
1539static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1540 IntrinsicInst &II) {
1541 Value *A = II.getArgOperand(0);
1542 Value *B = II.getArgOperand(1);
1543 if (A == B)
1544 return IC.replaceInstUsesWith(II, A);
1545
1546 return std::nullopt;
1547}
1548
1549static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1550 IntrinsicInst &II) {
1551 IRBuilder<> Builder(&II);
1552 Value *Pred = II.getOperand(0);
1553 Value *Vec = II.getOperand(1);
1554 Value *Shift = II.getOperand(2);
1555
1556 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1557 Value *AbsPred, *MergedValue;
1558 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1559 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1560 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1561 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1562
1563 return std::nullopt;
1564
1565 // Transform is valid if any of the following are true:
1566 // * The ABS merge value is an undef or non-negative
1567 // * The ABS predicate is all active
1568 // * The ABS predicate and the SRSHL predicates are the same
1569 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1570 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1571 return std::nullopt;
1572
1573 // Only valid when the shift amount is non-negative, otherwise the rounding
1574 // behaviour of SRSHL cannot be ignored.
1575 if (!match(Shift, m_NonNegative()))
1576 return std::nullopt;
1577
1578 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1579 {Pred, Vec, Shift});
1580
1581 return IC.replaceInstUsesWith(II, LSL);
1582}
1583
1584std::optional<Instruction *>
1585AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1586 IntrinsicInst &II) const {
1587 Intrinsic::ID IID = II.getIntrinsicID();
1588 switch (IID) {
1589 default:
1590 break;
1591 case Intrinsic::aarch64_neon_fmaxnm:
1592 case Intrinsic::aarch64_neon_fminnm:
1593 return instCombineMaxMinNM(IC, II);
1594 case Intrinsic::aarch64_sve_convert_from_svbool:
1595 return instCombineConvertFromSVBool(IC, II);
1596 case Intrinsic::aarch64_sve_dup:
1597 return instCombineSVEDup(IC, II);
1598 case Intrinsic::aarch64_sve_dup_x:
1599 return instCombineSVEDupX(IC, II);
1600 case Intrinsic::aarch64_sve_cmpne:
1601 case Intrinsic::aarch64_sve_cmpne_wide:
1602 return instCombineSVECmpNE(IC, II);
1603 case Intrinsic::aarch64_sve_rdffr:
1604 return instCombineRDFFR(IC, II);
1605 case Intrinsic::aarch64_sve_lasta:
1606 case Intrinsic::aarch64_sve_lastb:
1607 return instCombineSVELast(IC, II);
1608 case Intrinsic::aarch64_sve_clasta_n:
1609 case Intrinsic::aarch64_sve_clastb_n:
1610 return instCombineSVECondLast(IC, II);
1611 case Intrinsic::aarch64_sve_cntd:
1612 return instCombineSVECntElts(IC, II, 2);
1613 case Intrinsic::aarch64_sve_cntw:
1614 return instCombineSVECntElts(IC, II, 4);
1615 case Intrinsic::aarch64_sve_cnth:
1616 return instCombineSVECntElts(IC, II, 8);
1617 case Intrinsic::aarch64_sve_cntb:
1618 return instCombineSVECntElts(IC, II, 16);
1619 case Intrinsic::aarch64_sve_ptest_any:
1620 case Intrinsic::aarch64_sve_ptest_first:
1621 case Intrinsic::aarch64_sve_ptest_last:
1622 return instCombineSVEPTest(IC, II);
1623 case Intrinsic::aarch64_sve_mul:
1624 case Intrinsic::aarch64_sve_fmul:
1625 return instCombineSVEVectorMul(IC, II);
1626 case Intrinsic::aarch64_sve_fadd:
1627 case Intrinsic::aarch64_sve_add:
1628 return instCombineSVEVectorAdd(IC, II);
1629 case Intrinsic::aarch64_sve_fadd_u:
1630 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1631 Intrinsic::aarch64_sve_fmla_u>(
1632 IC, II, true);
1633 case Intrinsic::aarch64_sve_fsub:
1634 case Intrinsic::aarch64_sve_sub:
1635 return instCombineSVEVectorSub(IC, II);
1636 case Intrinsic::aarch64_sve_fsub_u:
1637 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1638 Intrinsic::aarch64_sve_fmls_u>(
1639 IC, II, true);
1640 case Intrinsic::aarch64_sve_tbl:
1641 return instCombineSVETBL(IC, II);
1642 case Intrinsic::aarch64_sve_uunpkhi:
1643 case Intrinsic::aarch64_sve_uunpklo:
1644 case Intrinsic::aarch64_sve_sunpkhi:
1645 case Intrinsic::aarch64_sve_sunpklo:
1646 return instCombineSVEUnpack(IC, II);
1647 case Intrinsic::aarch64_sve_zip1:
1648 case Intrinsic::aarch64_sve_zip2:
1649 return instCombineSVEZip(IC, II);
1650 case Intrinsic::aarch64_sve_ld1_gather_index:
1651 return instCombineLD1GatherIndex(IC, II);
1652 case Intrinsic::aarch64_sve_st1_scatter_index:
1653 return instCombineST1ScatterIndex(IC, II);
1654 case Intrinsic::aarch64_sve_ld1:
1655 return instCombineSVELD1(IC, II, DL);
1656 case Intrinsic::aarch64_sve_st1:
1657 return instCombineSVEST1(IC, II, DL);
1658 case Intrinsic::aarch64_sve_sdiv:
1659 return instCombineSVESDIV(IC, II);
1660 case Intrinsic::aarch64_sve_sel:
1661 return instCombineSVESel(IC, II);
1662 case Intrinsic::aarch64_sve_srshl:
1663 return instCombineSVESrshl(IC, II);
1664 case Intrinsic::aarch64_sve_dupq_lane:
1665 return instCombineSVEDupqLane(IC, II);
1666 }
1667
1668 return std::nullopt;
1669}
1670
1671std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1672 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1673 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1674 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1675 SimplifyAndSetOp) const {
1676 switch (II.getIntrinsicID()) {
1677 default:
1678 break;
1679 case Intrinsic::aarch64_neon_fcvtxn:
1680 case Intrinsic::aarch64_neon_rshrn:
1681 case Intrinsic::aarch64_neon_sqrshrn:
1682 case Intrinsic::aarch64_neon_sqrshrun:
1683 case Intrinsic::aarch64_neon_sqshrn:
1684 case Intrinsic::aarch64_neon_sqshrun:
1685 case Intrinsic::aarch64_neon_sqxtn:
1686 case Intrinsic::aarch64_neon_sqxtun:
1687 case Intrinsic::aarch64_neon_uqrshrn:
1688 case Intrinsic::aarch64_neon_uqshrn:
1689 case Intrinsic::aarch64_neon_uqxtn:
1690 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1691 break;
1692 }
1693
1694 return std::nullopt;
1695}
1696
1697TypeSize
1698AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1699 switch (K) {
1700 case TargetTransformInfo::RGK_Scalar:
1701 return TypeSize::getFixed(64);
1702 case TargetTransformInfo::RGK_FixedWidthVector:
1703 if (!ST->isStreamingSVEModeDisabled() &&
1704 !EnableFixedwidthAutovecInStreamingMode)
1705 return TypeSize::getFixed(0);
1706
1707 if (ST->hasSVE())
1708 return TypeSize::getFixed(
1709 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1710
1711 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1712 case TargetTransformInfo::RGK_ScalableVector:
1713 if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode)
1714 return TypeSize::getScalable(0);
1715
1716 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1717 }
1718 llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind"
, "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 1718
)
;
1719}
1720
1721bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1722 ArrayRef<const Value *> Args) {
1723
1724 // A helper that returns a vector type from the given type. The number of
1725 // elements in type Ty determines the vector width.
1726 auto toVectorTy = [&](Type *ArgTy) {
1727 return VectorType::get(ArgTy->getScalarType(),
1728 cast<VectorType>(DstTy)->getElementCount());
1729 };
1730
1731 // Exit early if DstTy is not a vector type whose elements are at least
1732 // 16-bits wide. SVE doesn't generally have the same set of instructions to
1733 // perform an extend with the add/sub/mul. There are SMULLB style
1734 // instructions, but they operate on top/bottom, requiring some sort of lane
1735 // interleaving to be used with zext/sext.
1736 if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1737 return false;
1738
1739 // Determine if the operation has a widening variant. We consider both the
1740 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1741 // instructions.
1742 //
1743 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1744 // verify that their extending operands are eliminated during code
1745 // generation.
1746 switch (Opcode) {
1747 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1748 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1749 case Instruction::Mul: // SMULL(2), UMULL(2)
1750 break;
1751 default:
1752 return false;
1753 }
1754
1755 // To be a widening instruction (either the "wide" or "long" versions), the
1756 // second operand must be a sign- or zero extend.
1757 if (Args.size() != 2 ||
1758 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1759 return false;
1760 auto *Extend = cast<CastInst>(Args[1]);
1761 auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1762
1763 // A mul only has a mull version (not like addw). Both operands need to be
1764 // extending and the same type.
1765 if (Opcode == Instruction::Mul &&
1766 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1767 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1768 return false;
1769
1770 // Legalize the destination type and ensure it can be used in a widening
1771 // operation.
1772 auto DstTyL = getTypeLegalizationCost(DstTy);
1773 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1774 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1775 return false;
1776
1777 // Legalize the source type and ensure it can be used in a widening
1778 // operation.
1779 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1780 auto SrcTyL = getTypeLegalizationCost(SrcTy);
1781 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1782 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1783 return false;
1784
1785 // Get the total number of vector elements in the legalized types.
1786 InstructionCost NumDstEls =
1787 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1788 InstructionCost NumSrcEls =
1789 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1790
1791 // Return true if the legalized types have the same number of vector elements
1792 // and the destination element type size is twice that of the source type.
1793 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1794}
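
As a hedged illustration of what the check above accepts (hypothetical helper, not part of this file): an add whose second operand is a zero- or sign-extend from a vector with half-width elements maps to uaddw/uaddl-style instructions.

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Build the "wide" form: add(WideA, zext(NarrowB)). With both operands
  // extended from the same narrow type, the "long" (uaddl) form applies.
  static Value *buildWideningAdd(IRBuilder<> &B, Value *WideA, Value *NarrowB) {
    auto *WideTy = cast<VectorType>(WideA->getType()); // e.g. <8 x i16>
    Value *Ext = B.CreateZExt(NarrowB, WideTy);        // e.g. <8 x i8> -> <8 x i16>
    return B.CreateAdd(WideA, Ext);                    // second operand is the extend
  }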
1795
1796InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1797 Type *Src,
1798 TTI::CastContextHint CCH,
1799 TTI::TargetCostKind CostKind,
1800 const Instruction *I) {
1801 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1802   assert(ISD && "Invalid opcode");
1803
1804 // If the cast is observable, and it is used by a widening instruction (e.g.,
1805 // uaddl, saddw, etc.), it may be free.
1806 if (I && I->hasOneUser()) {
1807 auto *SingleUser = cast<Instruction>(*I->user_begin());
1808 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1809 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1810 // If the cast is the second operand, it is free. We will generate either
1811 // a "wide" or "long" version of the widening instruction.
1812 if (I == SingleUser->getOperand(1))
1813 return 0;
1814 // If the cast is not the second operand, it will be free if it looks the
1815 // same as the second operand. In this case, we will generate a "long"
1816 // version of the widening instruction.
1817 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1818 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1819 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1820 return 0;
1821 }
1822 }
1823
1824 // TODO: Allow non-throughput costs that aren't binary.
1825 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1826 if (CostKind != TTI::TCK_RecipThroughput)
1827 return Cost == 0 ? 0 : 1;
1828 return Cost;
1829 };
1830
1831 EVT SrcTy = TLI->getValueType(DL, Src);
1832 EVT DstTy = TLI->getValueType(DL, Dst);
1833
1834 if (!SrcTy.isSimple() || !DstTy.isSimple())
1835 return AdjustCost(
1836 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1837
1838 static const TypeConversionCostTblEntry
1839 ConversionTbl[] = {
1840 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
1841 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
1842 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
1843 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
1844 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
1845 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
1846 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
1847 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
1848 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
1849 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
1850 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
1851 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
1852 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
1853 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
1854 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
1855 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
1856 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
1857 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
1858 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
1859 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
1860
1861 // Truncations on nxvmiN
1862 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1863 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1864 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1865 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1866 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1867 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1868 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1869 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1870 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1871 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1872 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1873 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1874 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1875 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1876 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1877 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1878
1879 // The number of shll instructions for the extension.
1880 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1881 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1882 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1883 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1884 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1885 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1886 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1887 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1888 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1889 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1890 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1891 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1892 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1893 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1894 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1895 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1896
1897 // LowerVectorINT_TO_FP:
1898 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1899 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1900 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1901 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1902 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1903 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1904
1905 // Complex: to v2f32
1906 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1907 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1908 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1909 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1910 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1911 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1912
1913 // Complex: to v4f32
1914 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1915 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1916 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1917 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1918
1919 // Complex: to v8f32
1920 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1921 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1922 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1923 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1924
1925 // Complex: to v16f32
1926 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1927 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1928
1929 // Complex: to v2f64
1930 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1931 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1932 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1933 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1934 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1935 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1936
1937 // Complex: to v4f64
1938 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
1939 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
1940
1941 // LowerVectorFP_TO_INT
1942 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1943 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1944 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1945 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1946 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1947 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1948
1949 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1950 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1951 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1952 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1953 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1954 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1955 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1956
1957 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1958 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1959 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1960 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1961 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1962
1963 // Complex, from nxv2f32.
1964 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1965 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1966 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1967 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1968 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1969 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1970 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1971 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1972
1973 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1974 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1975 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1976 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1977 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1978 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1979 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1980
1981 // Complex, from nxv2f64.
1982 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1983 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1984 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1985 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1986 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1987 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1988 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1989 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1990
1991 // Complex, from nxv4f32.
1992 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1993 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1994 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1995 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1996 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1997 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1998 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1999 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2000
2001 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2002 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2003 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2004 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2005 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2006
2007 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2008 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2009 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2010 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2011 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2012 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2013 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2014
2015 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2016 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2017 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2018 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2019 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2020
2021 // Complex, from nxv8f16.
2022 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2023 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2024 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2025 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2026 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2027 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2028 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2029 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2030
2031 // Complex, from nxv4f16.
2032 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2033 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2034 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2035 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2036 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2037 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2038 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2039 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2040
2041 // Complex, from nxv2f16.
2042 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2043 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2044 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2045 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2046 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2047 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2048 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2049 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2050
2051 // Truncate from nxvmf32 to nxvmf16.
2052 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2053 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2054 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2055
2056 // Truncate from nxvmf64 to nxvmf16.
2057 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2058 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2059 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2060
2061 // Truncate from nxvmf64 to nxvmf32.
2062 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2063 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2064 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2065
2066 // Extend from nxvmf16 to nxvmf32.
2067 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2068 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2069 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2070
2071 // Extend from nxvmf16 to nxvmf64.
2072 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2073 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2074 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2075
2076 // Extend from nxvmf32 to nxvmf64.
2077 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2078 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2079 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2080
2081 // Bitcasts from float to integer
2082 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2083 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2084 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2085
2086 // Bitcasts from integer to float
2087 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2088 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2089 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2090 };
2091
2092 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2093 DstTy.getSimpleVT(),
2094 SrcTy.getSimpleVT()))
2095 return AdjustCost(Entry->Cost);
2096
2097 static const TypeConversionCostTblEntry FP16Tbl[] = {
2098 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2099 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2100 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2101 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2102 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2103 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2104 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2105 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2106 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2107 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2108 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2109 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2110 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2111 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2112 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2113 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2114 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2115 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2116 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2117 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2118 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2119 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2120 };
2121
2122 if (ST->hasFullFP16())
2123 if (const auto *Entry = ConvertCostTableLookup(
2124 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2125 return AdjustCost(Entry->Cost);
2126
2127 return AdjustCost(
2128 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2129}
2130
2131InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2132 Type *Dst,
2133 VectorType *VecTy,
2134 unsigned Index) {
2135
2136 // Make sure we were given a valid extend opcode.
2137   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2138          "Invalid opcode");
2139
2140 // We are extending an element we extract from a vector, so the source type
2141 // of the extend is the element type of the vector.
2142 auto *Src = VecTy->getElementType();
2143
2144 // Sign- and zero-extends are for integer types only.
2145   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2146
2147 // Get the cost for the extract. We compute the cost (if any) for the extend
2148 // below.
2149 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2150 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2151 CostKind, Index, nullptr, nullptr);
2152
2153 // Legalize the types.
2154 auto VecLT = getTypeLegalizationCost(VecTy);
2155 auto DstVT = TLI->getValueType(DL, Dst);
2156 auto SrcVT = TLI->getValueType(DL, Src);
2157
2158 // If the resulting type is still a vector and the destination type is legal,
2159 // we may get the extension for free. If not, get the default cost for the
2160 // extend.
2161 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2162 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2163 CostKind);
2164
2165 // The destination type should be larger than the element type. If not, get
2166 // the default cost for the extend.
2167 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2168 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2169 CostKind);
2170
2171 switch (Opcode) {
2172 default:
2173     llvm_unreachable("Opcode should be either SExt or ZExt");
2174
2175 // For sign-extends, we only need a smov, which performs the extension
2176 // automatically.
2177 case Instruction::SExt:
2178 return Cost;
2179
2180 // For zero-extends, the extend is performed automatically by a umov unless
2181 // the destination type is i64 and the element type is i8 or i16.
2182 case Instruction::ZExt:
2183 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2184 return Cost;
2185 }
2186
2187 // If we are unable to perform the extend for free, get the default cost.
2188 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2189 CostKind);
2190}
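
A hedged usage sketch (the TargetTransformInfo reference and context are assumed to exist; they are not defined in this file): per the logic above, sign-extending an extracted i8 lane to i32 is costed as the extract alone, while zero-extending the same lane to i64 also pays the cast cost.

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  static void queryExtractExtendCosts(const TargetTransformInfo &TTI,
                                      LLVMContext &Ctx) {
    auto *V16i8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
    // smov performs the sign extension as part of the lane move.
    InstructionCost SExtToI32 = TTI.getExtractWithExtendCost(
        Instruction::SExt, Type::getInt32Ty(Ctx), V16i8, /*Index=*/0);
    // i8 -> i64 zero-extend falls through to the default cast cost above.
    InstructionCost ZExtToI64 = TTI.getExtractWithExtendCost(
        Instruction::ZExt, Type::getInt64Ty(Ctx), V16i8, /*Index=*/0);
    (void)SExtToI32;
    (void)ZExtToI64;
  }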
2191
2192InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2193 TTI::TargetCostKind CostKind,
2194 const Instruction *I) {
2195 if (CostKind != TTI::TCK_RecipThroughput)
2196 return Opcode == Instruction::PHI ? 0 : 1;
2197   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2198 // Branches are assumed to be predicted.
2199 return 0;
2200}
2201
2202InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2203 Type *Val,
2204 unsigned Index,
2205 bool HasRealUse) {
2206   assert(Val->isVectorTy() && "This must be a vector type");
2207
2208 if (Index != -1U) {
2209 // Legalize the type.
2210 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2211
2212 // This type is legalized to a scalar type.
2213 if (!LT.second.isVector())
2214 return 0;
2215
2216 // The type may be split. For fixed-width vectors we can normalize the
2217 // index to the new type.
2218 if (LT.second.isFixedLengthVector()) {
2219 unsigned Width = LT.second.getVectorNumElements();
2220 Index = Index % Width;
2221 }
2222
2223 // The element at index zero is already inside the vector.
2224 // - For a physical (HasRealUse==true) insert-element or extract-element
2225 // instruction that extracts integers, an explicit FPR -> GPR move is
2226 // needed. So it has non-zero cost.
2227 // - For the rest of cases (virtual instruction or element type is float),
2228 // consider the instruction free.
2229 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2230 return 0;
2231
2232     // This is recognising an LD1 "single structure to one lane of one
2233     // register" instruction. I.e., if this is an `insertelement` instruction
2234     // and its second operand is a load, then we will generate an LD1, which
2235     // is an expensive instruction.
2236 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2237 return ST->getVectorInsertExtractBaseCost() + 1;
2238
2239 // FIXME:
2240 // If the extract-element and insert-element instructions could be
2241 // simplified away (e.g., could be combined into users by looking at use-def
2242 // context), they have no cost. This is not done in the first place for
2243 // compile-time considerations.
2244 }
2245
2246 // All other insert/extracts cost this much.
2247 return ST->getVectorInsertExtractBaseCost();
2248}
2249
2250InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2251 TTI::TargetCostKind CostKind,
2252 unsigned Index, Value *Op0,
2253 Value *Op1) {
2254 return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */);
2255}
2256
2257InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2258 Type *Val,
2259 TTI::TargetCostKind CostKind,
2260 unsigned Index) {
2261 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2262}
2263
2264InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2265 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2266 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2267 ArrayRef<const Value *> Args,
2268 const Instruction *CxtI) {
2269
2270 // TODO: Handle more cost kinds.
2271 if (CostKind != TTI::TCK_RecipThroughput)
2272 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2273 Op2Info, Args, CxtI);
2274
2275 // Legalize the type.
2276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2277 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2278
2279 switch (ISD) {
2280 default:
2281 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2282 Op2Info);
2283 case ISD::SDIV:
2284 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2285       // On AArch64, scalar signed division by a power-of-two constant is
2286       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2287       // The OperandValue properties may not be the same as those of the
2288       // previous operation; conservatively assume OP_None.
2289 InstructionCost Cost = getArithmeticInstrCost(
2290 Instruction::Add, Ty, CostKind,
2291 Op1Info.getNoProps(), Op2Info.getNoProps());
2292 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2293 Op1Info.getNoProps(), Op2Info.getNoProps());
2294 Cost += getArithmeticInstrCost(
2295 Instruction::Select, Ty, CostKind,
2296 Op1Info.getNoProps(), Op2Info.getNoProps());
2297 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2298 Op1Info.getNoProps(), Op2Info.getNoProps());
2299 return Cost;
2300 }
2301 [[fallthrough]];
2302 case ISD::UDIV: {
2303 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2304 auto VT = TLI->getValueType(DL, Ty);
2305 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2306         // Vector signed division by a constant is expanded to the
2307 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2308 // to MULHS + SUB + SRL + ADD + SRL.
2309 InstructionCost MulCost = getArithmeticInstrCost(
2310 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2311 InstructionCost AddCost = getArithmeticInstrCost(
2312 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2313 InstructionCost ShrCost = getArithmeticInstrCost(
2314 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2315 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2316 }
2317 }
2318
2319 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2320 Opcode, Ty, CostKind, Op1Info, Op2Info);
2321 if (Ty->isVectorTy()) {
2322 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2323         // If SDIV/UDIV operations are lowered using SVE, the cost can be
2324         // lower.
2325 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2326 ->getPrimitiveSizeInBits()
2327 .getFixedValue() < 128) {
2328 EVT VT = TLI->getValueType(DL, Ty);
2329 static const CostTblEntry DivTbl[]{
2330 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2331 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2332 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2333 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2334 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2335 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2336
2337 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2338 if (nullptr != Entry)
2339 return Entry->Cost;
2340 }
2341 // For 8/16-bit elements, the cost is higher because the type
2342 // requires promotion and possibly splitting:
2343 if (LT.second.getScalarType() == MVT::i8)
2344 Cost *= 8;
2345 else if (LT.second.getScalarType() == MVT::i16)
2346 Cost *= 4;
2347 return Cost;
2348 } else {
2349         // If one of the operands is a uniform constant then the cost for each
2350         // element is the cost of insertion, extraction and division:
2351         // insertion cost = 2, extraction cost = 2, division = cost of the
2352         // operation on the scalar type.
2353 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2354 (Op2Info.isConstant() && Op2Info.isUniform())) {
2355 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2356 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2357 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2358 return (4 + DivCost) * VTy->getNumElements();
2359 }
2360 }
2361 // On AArch64, without SVE, vector divisions are expanded
2362 // into scalar divisions of each pair of elements.
2363 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2364 CostKind, Op1Info, Op2Info);
2365 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2366 Op1Info, Op2Info);
2367 }
2368
2369 // TODO: if one of the arguments is scalar, then it's not necessary to
2370 // double the cost of handling the vector elements.
2371 Cost += Cost;
2372 }
2373 return Cost;
2374 }
2375 case ISD::MUL:
2376     // When SVE is available, we can lower the v2i64 operation using
2377 // the SVE mul instruction, which has a lower cost.
2378 if (LT.second == MVT::v2i64 && ST->hasSVE())
2379 return LT.first;
2380
2381 // When SVE is not available, there is no MUL.2d instruction,
2382 // which means mul <2 x i64> is expensive as elements are extracted
2383 // from the vectors and the muls scalarized.
2384 // As getScalarizationOverhead is a bit too pessimistic, we
2385 // estimate the cost for a i64 vector directly here, which is:
2386 // - four 2-cost i64 extracts,
2387 // - two 2-cost i64 inserts, and
2388 // - two 1-cost muls.
2389     // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2390     // LT.first = 2 the cost is 28. If both operands are extensions it will not
2391     // need to scalarize, so the cost can be cheaper (smull or umull).
2393 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2394 return LT.first;
2395 return LT.first * 14;
2396 case ISD::ADD:
2397 case ISD::XOR:
2398 case ISD::OR:
2399 case ISD::AND:
2400 case ISD::SRL:
2401 case ISD::SRA:
2402 case ISD::SHL:
2403 // These nodes are marked as 'custom' for combining purposes only.
2404 // We know that they are legal. See LowerAdd in ISelLowering.
2405 return LT.first;
2406
2407 case ISD::FADD:
2408 case ISD::FSUB:
2409 case ISD::FMUL:
2410 case ISD::FDIV:
2411 case ISD::FNEG:
2412 // These nodes are marked as 'custom' just to lower them to SVE.
2413 // We know said lowering will incur no additional cost.
2414 if (!Ty->getScalarType()->isFP128Ty())
2415 return 2 * LT.first;
2416
2417 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2418 Op2Info);
2419 }
2420}
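
For the SDIV case above, a hedged sketch of the ADD + CMP + SELECT + SRA expansion the comment refers to, for a scalar signed divide by 1 << K (the helper is hypothetical; codegen may use a different but equivalent sequence).

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // x / (1 << K), rounding toward zero: bias negative inputs by (1 << K) - 1
  // before the arithmetic shift.
  static Value *expandSDivByPow2(IRBuilder<> &B, Value *X, unsigned K) {
    Type *Ty = X->getType();
    Value *Biased = B.CreateAdd(X, ConstantInt::get(Ty, (uint64_t(1) << K) - 1));
    Value *IsNeg = B.CreateICmpSLT(X, ConstantInt::get(Ty, 0));
    Value *Adjusted = B.CreateSelect(IsNeg, Biased, X);
    return B.CreateAShr(Adjusted, K);
  }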
2421
2422InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2423 ScalarEvolution *SE,
2424 const SCEV *Ptr) {
2425 // Address computations in vectorized code with non-consecutive addresses will
2426 // likely result in more instructions compared to scalar code where the
2427 // computation can more often be merged into the index mode. The resulting
2428 // extra micro-ops can significantly decrease throughput.
2429 unsigned NumVectorInstToHideOverhead = 10;
2430 int MaxMergeDistance = 64;
2431
2432 if (Ty->isVectorTy() && SE &&
2433 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2434 return NumVectorInstToHideOverhead;
2435
2436 // In many cases the address computation is not merged into the instruction
2437 // addressing mode.
2438 return 1;
2439}
2440
2441InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2442 Type *CondTy,
2443 CmpInst::Predicate VecPred,
2444 TTI::TargetCostKind CostKind,
2445 const Instruction *I) {
2446 // TODO: Handle other cost kinds.
2447 if (CostKind != TTI::TCK_RecipThroughput)
2448 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2449 I);
2450
2451 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2452   // Some vector selects that are wider than the register width are not
2453   // lowered well.
2454 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2455 // We would need this many instructions to hide the scalarization happening.
2456 const int AmortizationCost = 20;
2457
2458 // If VecPred is not set, check if we can get a predicate from the context
2459 // instruction, if its type matches the requested ValTy.
2460 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2461 CmpInst::Predicate CurrentPred;
2462 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2463 m_Value())))
2464 VecPred = CurrentPred;
2465 }
2466 // Check if we have a compare/select chain that can be lowered using
2467 // a (F)CMxx & BFI pair.
2468 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2469 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2470 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2471 VecPred == CmpInst::FCMP_UNE) {
2472 static const auto ValidMinMaxTys = {
2473 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2474 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
2475 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2476
2477 auto LT = getTypeLegalizationCost(ValTy);
2478 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2479 (ST->hasFullFP16() &&
2480 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2481 return LT.first;
2482 }
2483
2484 static const TypeConversionCostTblEntry
2485 VectorSelectTbl[] = {
2486 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
2487 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2488 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
2489 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2490 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2491 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2492 };
2493
2494 EVT SelCondTy = TLI->getValueType(DL, CondTy);
2495 EVT SelValTy = TLI->getValueType(DL, ValTy);
2496 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2497 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2498 SelCondTy.getSimpleVT(),
2499 SelValTy.getSimpleVT()))
2500 return Entry->Cost;
2501 }
2502 }
2503 // The base case handles scalable vectors fine for now, since it treats the
2504 // cost as 1 * legalization cost.
2505 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2506}
2507
2508AArch64TTIImpl::TTI::MemCmpExpansionOptions
2509AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2510 TTI::MemCmpExpansionOptions Options;
2511 if (ST->requiresStrictAlign()) {
2512 // TODO: Add cost modeling for strict align. Misaligned loads expand to
2513 // a bunch of instructions when strict align is enabled.
2514 return Options;
2515 }
2516 Options.AllowOverlappingLoads = true;
2517 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2518 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2519   // TODO: Though vector loads usually perform well on AArch64, on some targets
2520 // they may wake up the FP unit, which raises the power consumption. Perhaps
2521 // they could be used with no holds barred (-O3).
2522 Options.LoadSizes = {8, 4, 2, 1};
2523 return Options;
2524}
2525
2526bool AArch64TTIImpl::prefersVectorizedAddressing() const {
2527 return ST->hasSVE();
2528}
2529
2530InstructionCost
2531AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2532 Align Alignment, unsigned AddressSpace,
2533 TTI::TargetCostKind CostKind) {
2534 if (useNeonVector(Src))
2535 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2536 CostKind);
2537 auto LT = getTypeLegalizationCost(Src);
2538 if (!LT.first.isValid())
2539 return InstructionCost::getInvalid();
2540
2541 // The code-generator is currently not able to handle scalable vectors
2542 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2543 // it. This change will be removed when code-generation for these types is
2544 // sufficiently reliable.
2545 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2546 return InstructionCost::getInvalid();
2547
2548 return LT.first;
2549}
2550
2551static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2552 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2553}
2554
2555InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2556 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2557 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2558 if (useNeonVector(DataTy))
2559 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2560 Alignment, CostKind, I);
2561 auto *VT = cast<VectorType>(DataTy);
2562 auto LT = getTypeLegalizationCost(DataTy);
2563 if (!LT.first.isValid())
2564 return InstructionCost::getInvalid();
2565
2566 // The code-generator is currently not able to handle scalable vectors
2567 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2568 // it. This change will be removed when code-generation for these types is
2569 // sufficiently reliable.
2570 if (cast<VectorType>(DataTy)->getElementCount() ==
2571 ElementCount::getScalable(1))
2572 return InstructionCost::getInvalid();
2573
2574 ElementCount LegalVF = LT.second.getVectorElementCount();
2575 InstructionCost MemOpCost =
2576 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2577 {TTI::OK_AnyValue, TTI::OP_None}, I);
2578 // Add on an overhead cost for using gathers/scatters.
2579 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2580 // point we may want a per-CPU overhead.
2581 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2582 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2583}
2584
2585bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2586 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2587}
2588
2589InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2590 MaybeAlign Alignment,
2591 unsigned AddressSpace,
2592 TTI::TargetCostKind CostKind,
2593 TTI::OperandValueInfo OpInfo,
2594 const Instruction *I) {
2595 EVT VT = TLI->getValueType(DL, Ty, true);
2596 // Type legalization can't handle structs
2597 if (VT == MVT::Other)
2598 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2599 CostKind);
2600
2601 auto LT = getTypeLegalizationCost(Ty);
2602 if (!LT.first.isValid())
2603 return InstructionCost::getInvalid();
2604
2605 // The code-generator is currently not able to handle scalable vectors
2606 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2607 // it. This change will be removed when code-generation for these types is
2608 // sufficiently reliable.
2609 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2610 if (VTy->getElementCount() == ElementCount::getScalable(1))
2611 return InstructionCost::getInvalid();
2612
2613 // TODO: consider latency as well for TCK_SizeAndLatency.
2614 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2615 return LT.first;
2616
2617 if (CostKind != TTI::TCK_RecipThroughput)
2618 return 1;
2619
2620 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2621 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2622 // Unaligned stores are extremely inefficient. We don't split all
2623 // unaligned 128-bit stores because the negative impact that has shown in
2624 // practice on inlined block copy code.
2625 // We make such stores expensive so that we will only vectorize if there
2626 // are 6 other instructions getting vectorized.
2627 const int AmortizationCost = 6;
2628
2629 return LT.first * 2 * AmortizationCost;
2630 }
2631
2632 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
2633 if (Ty->isPtrOrPtrVectorTy())
2634 return LT.first;
2635
2636 // Check truncating stores and extending loads.
2637 if (useNeonVector(Ty) &&
2638 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2639     // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2640 if (VT == MVT::v4i8)
2641 return 2;
2642 // Otherwise we need to scalarize.
2643 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2644 }
2645
2646 return LT.first;
2647}
2648
2649InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2650 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2651 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2652 bool UseMaskForCond, bool UseMaskForGaps) {
2653   assert(Factor >= 2 && "Invalid interleave factor");
2654 auto *VecVTy = cast<FixedVectorType>(VecTy);
2655
2656 if (!UseMaskForCond && !UseMaskForGaps &&
2657 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2658 unsigned NumElts = VecVTy->getNumElements();
2659 auto *SubVecTy =
2660 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2661
2662 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2663 // Accesses having vector types that are a multiple of 128 bits can be
2664 // matched to more than one ldN/stN instruction.
2665 bool UseScalable;
2666 if (NumElts % Factor == 0 &&
2667 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2668 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2669 }
2670
2671 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2672 Alignment, AddressSpace, CostKind,
2673 UseMaskForCond, UseMaskForGaps);
2674}
2675
2676InstructionCost
2677AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2678 InstructionCost Cost = 0;
2679 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2680 for (auto *I : Tys) {
2681 if (!I->isVectorTy())
2682 continue;
2683 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2684 128)
2685 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2686 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2687 }
2688 return Cost;
2689}
2690
2691unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
2692 return ST->getMaxInterleaveFactor();
2693}
2694
2695// For Falkor, we want to avoid having too many strided loads in a loop since
2696// that can exhaust the HW prefetcher resources. We adjust the unroller
2697// MaxCount preference below to attempt to ensure unrolling doesn't create too
2698// many strided loads.
2699static void
2700getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2701 TargetTransformInfo::UnrollingPreferences &UP) {
2702 enum { MaxStridedLoads = 7 };
2703 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2704 int StridedLoads = 0;
2705 // FIXME? We could make this more precise by looking at the CFG and
2706 // e.g. not counting loads in each side of an if-then-else diamond.
2707 for (const auto BB : L->blocks()) {
2708 for (auto &I : *BB) {
2709 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2710 if (!LMemI)
2711 continue;
2712
2713 Value *PtrValue = LMemI->getPointerOperand();
2714 if (L->isLoopInvariant(PtrValue))
2715 continue;
2716
2717 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2718 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2719 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2720 continue;
2721
2722 // FIXME? We could take pairing of unrolled load copies into account
2723 // by looking at the AddRec, but we would probably have to limit this
2724 // to loops with no stores or other memory optimization barriers.
2725 ++StridedLoads;
2726 // We've seen enough strided loads that seeing more won't make a
2727 // difference.
2728 if (StridedLoads > MaxStridedLoads / 2)
2729 return StridedLoads;
2730 }
2731 }
2732 return StridedLoads;
2733 };
2734
2735 int StridedLoads = countStridedLoads(L, SE);
2736 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
7
Assuming 'DebugFlag' is false
8
Loop condition is false. Exiting loop
2737 << " strided loads\n");
2738 // Pick the largest power of 2 unroll count that won't result in too many
2739 // strided loads.
2740 if (StridedLoads) {
9
Assuming 'StridedLoads' is not equal to 0
10
Taking true branch
2741 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
11
Calling 'Log2_32'
13
Returning from 'Log2_32'
14
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
2742 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2743 << UP.MaxCount << '\n');
2744 }
2745}
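
Editor's note on the flagged shift at line 2741: when MaxStridedLoads / StridedLoads evaluates to 0 (i.e. StridedLoads > 7), Log2_32 is handed 0 and the shift amount becomes 4294967295 (see the note after Log2_32 below). Whether that state is actually reachable depends on the early-exit bound in countStridedLoads, but a defensive guard removes the undefined shift either way. A minimal standalone sketch of such a guard, not the upstream fix; the helper name is hypothetical and C++20 std::countl_zero stands in for llvm::countl_zero:

#include <bit>
#include <cstdint>
// Hypothetical helper, for illustration only: pick the largest power of two
// <= MaxStridedLoads / StridedLoads, but never feed 0 into the log/shift.
unsigned safeUnrollMaxCount(unsigned MaxStridedLoads, unsigned StridedLoads) {
  if (StridedLoads == 0)
    return 0;                                    // mirrors the if (StridedLoads) guard
  unsigned Ratio = MaxStridedLoads / StridedLoads;
  if (Ratio == 0)
    return 1;                                    // avoids 1 << Log2_32(0)
  return 1u << (31 - std::countl_zero(Ratio));   // == 1 << Log2_32(Ratio)
}
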
2746
2747void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2748 TTI::UnrollingPreferences &UP,
2749 OptimizationRemarkEmitter *ORE) {
2750 // Enable partial unrolling and runtime unrolling.
2751 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2752
2753 UP.UpperBound = true;
2754
2755 // Inner loops are more likely to be hot, and their runtime checks can be
2756 // hoisted out by the LICM pass, so the overhead is lower; try a larger
2757 // threshold to unroll more loops.
2758 if (L->getLoopDepth() > 1)
1
Assuming the condition is false
2
Taking false branch
2759 UP.PartialThreshold *= 2;
2760
2761 // Disable partial & runtime unrolling on -Os.
2762 UP.PartialOptSizeThreshold = 0;
2763
2764 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3
Assuming the condition is true
5
Taking true branch
2765 EnableFalkorHWPFUnrollFix)
4
Assuming the condition is true
2766 getFalkorUnrollingPreferences(L, SE, UP);
6
Calling 'getFalkorUnrollingPreferences'
2767
2768 // Scan the loop: don't unroll loops with calls as this could prevent
2769 // inlining. Don't unroll vector loops either, as they don't benefit much from
2770 // unrolling.
2771 for (auto *BB : L->getBlocks()) {
2772 for (auto &I : *BB) {
2773 // Don't unroll vectorised loop.
2774 if (I.getType()->isVectorTy())
2775 return;
2776
2777 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2778 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2779 if (!isLoweredToCall(F))
2780 continue;
2781 }
2782 return;
2783 }
2784 }
2785 }
2786
2787 // Enable runtime unrolling for in-order models
2788 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2789 // checking for that case, we can ensure that the default behaviour is
2790 // unchanged
2791 if (ST->getProcFamily() != AArch64Subtarget::Others &&
2792 !ST->getSchedModel().isOutOfOrder()) {
2793 UP.Runtime = true;
2794 UP.Partial = true;
2795 UP.UnrollRemainder = true;
2796 UP.DefaultUnrollRuntimeCount = 4;
2797
2798 UP.UnrollAndJam = true;
2799 UP.UnrollAndJamInnerLoopThreshold = 60;
2800 }
2801}
2802
2803void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2804 TTI::PeelingPreferences &PP) {
2805 BaseT::getPeelingPreferences(L, SE, PP);
2806}
2807
2808Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2809 Type *ExpectedType) {
2810 switch (Inst->getIntrinsicID()) {
2811 default:
2812 return nullptr;
2813 case Intrinsic::aarch64_neon_st2:
2814 case Intrinsic::aarch64_neon_st3:
2815 case Intrinsic::aarch64_neon_st4: {
2816 // Create a struct type
2817 StructType *ST = dyn_cast<StructType>(ExpectedType);
2818 if (!ST)
2819 return nullptr;
2820 unsigned NumElts = Inst->arg_size() - 1;
2821 if (ST->getNumElements() != NumElts)
2822 return nullptr;
2823 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2824 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2825 return nullptr;
2826 }
2827 Value *Res = PoisonValue::get(ExpectedType);
2828 IRBuilder<> Builder(Inst);
2829 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2830 Value *L = Inst->getArgOperand(i);
2831 Res = Builder.CreateInsertValue(Res, L, i);
2832 }
2833 return Res;
2834 }
2835 case Intrinsic::aarch64_neon_ld2:
2836 case Intrinsic::aarch64_neon_ld3:
2837 case Intrinsic::aarch64_neon_ld4:
2838 if (Inst->getType() == ExpectedType)
2839 return Inst;
2840 return nullptr;
2841 }
2842}
2843
2844bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2845 MemIntrinsicInfo &Info) {
2846 switch (Inst->getIntrinsicID()) {
2847 default:
2848 break;
2849 case Intrinsic::aarch64_neon_ld2:
2850 case Intrinsic::aarch64_neon_ld3:
2851 case Intrinsic::aarch64_neon_ld4:
2852 Info.ReadMem = true;
2853 Info.WriteMem = false;
2854 Info.PtrVal = Inst->getArgOperand(0);
2855 break;
2856 case Intrinsic::aarch64_neon_st2:
2857 case Intrinsic::aarch64_neon_st3:
2858 case Intrinsic::aarch64_neon_st4:
2859 Info.ReadMem = false;
2860 Info.WriteMem = true;
2861 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2862 break;
2863 }
2864
2865 switch (Inst->getIntrinsicID()) {
2866 default:
2867 return false;
2868 case Intrinsic::aarch64_neon_ld2:
2869 case Intrinsic::aarch64_neon_st2:
2870 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2871 break;
2872 case Intrinsic::aarch64_neon_ld3:
2873 case Intrinsic::aarch64_neon_st3:
2874 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2875 break;
2876 case Intrinsic::aarch64_neon_ld4:
2877 case Intrinsic::aarch64_neon_st4:
2878 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2879 break;
2880 }
2881 return true;
2882}
2883
2884/// See if \p I should be considered for address type promotion. We check if \p
2885 /// I is a sext with the right type and is used in memory accesses. If it is
2886 /// used in a "complex" getelementptr, we allow it to be promoted without
2887 /// finding other sext instructions that sign extended the same initial value.
2888 /// A getelementptr is considered "complex" if it has more than 2 operands.
2889bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2890 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2891 bool Considerable = false;
2892 AllowPromotionWithoutCommonHeader = false;
2893 if (!isa<SExtInst>(&I))
2894 return false;
2895 Type *ConsideredSExtType =
2896 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2897 if (I.getType() != ConsideredSExtType)
2898 return false;
2899 // See if the sext is the one with the right type and used in at least one
2900 // GetElementPtrInst.
2901 for (const User *U : I.users()) {
2902 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2903 Considerable = true;
2904 // A getelementptr is considered as "complex" if it has more than 2
2905 // operands. We will promote a SExt used in such complex GEP as we
2906 // expect some computation to be merged if they are done on 64 bits.
2907 if (GEPInst->getNumOperands() > 2) {
2908 AllowPromotionWithoutCommonHeader = true;
2909 break;
2910 }
2911 }
2912 }
2913 return Considerable;
2914}
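
Editor's note (illustrative example, with hypothetical IR names): an i64 sext used as the index of `getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %sext.idx` sees a GEP with three operands (the pointer plus two indices), so AllowPromotionWithoutCommonHeader is set; the same sext used only by `getelementptr inbounds i32, ptr %base, i64 %sext.idx` (two operands) merely marks the sext as Considerable.
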
2915
2916bool AArch64TTIImpl::isLegalToVectorizeReduction(
2917 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2918 if (!VF.isScalable())
2919 return true;
2920
2921 Type *Ty = RdxDesc.getRecurrenceType();
2922 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2923 return false;
2924
2925 switch (RdxDesc.getRecurrenceKind()) {
2926 case RecurKind::Add:
2927 case RecurKind::FAdd:
2928 case RecurKind::And:
2929 case RecurKind::Or:
2930 case RecurKind::Xor:
2931 case RecurKind::SMin:
2932 case RecurKind::SMax:
2933 case RecurKind::UMin:
2934 case RecurKind::UMax:
2935 case RecurKind::FMin:
2936 case RecurKind::FMax:
2937 case RecurKind::SelectICmp:
2938 case RecurKind::SelectFCmp:
2939 case RecurKind::FMulAdd:
2940 return true;
2941 default:
2942 return false;
2943 }
2944}
2945
2946InstructionCost
2947AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2948 bool IsUnsigned,
2949 TTI::TargetCostKind CostKind) {
2950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2951
2952 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2953 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2954
2955 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2956 "Both vector needs to be equally scalable");
2957
2958 InstructionCost LegalizationCost = 0;
2959 if (LT.first > 1) {
2960 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2961 unsigned MinMaxOpcode =
2962 Ty->isFPOrFPVectorTy()
2963 ? Intrinsic::maxnum
2964 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2965 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2966 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2967 }
2968
2969 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2970}
2971
2972InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2973 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2975 InstructionCost LegalizationCost = 0;
2976 if (LT.first > 1) {
2977 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2978 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2979 LegalizationCost *= LT.first - 1;
2980 }
2981
2982 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2983 assert(ISD && "Invalid opcode");
2984 // Add the final reduction cost for the legal horizontal reduction
2985 switch (ISD) {
2986 case ISD::ADD:
2987 case ISD::AND:
2988 case ISD::OR:
2989 case ISD::XOR:
2990 case ISD::FADD:
2991 return LegalizationCost + 2;
2992 default:
2993 return InstructionCost::getInvalid();
2994 }
2995}
2996
2997InstructionCost
2998AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2999 std::optional<FastMathFlags> FMF,
3000 TTI::TargetCostKind CostKind) {
3001 if (TTI::requiresOrderedReduction(FMF)) {
3002 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3003 InstructionCost BaseCost =
3004 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3005 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3006 // end up vectorizing for more computationally intensive loops.
3007 return BaseCost + FixedVTy->getNumElements();
3008 }
3009
3010 if (Opcode != Instruction::FAdd)
3011 return InstructionCost::getInvalid();
3012
3013 auto *VTy = cast<ScalableVectorType>(ValTy);
3014 InstructionCost Cost =
3015 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3016 Cost *= getMaxNumElements(VTy->getElementCount());
3017 return Cost;
3018 }
3019
3020 if (isa<ScalableVectorType>(ValTy))
3021 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3022
3023 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3024 MVT MTy = LT.second;
3025 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3026 assert(ISD && "Invalid opcode");
3027
3028 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3029 // instructions as twice a normal vector add, plus 1 for each legalization
3030 // step (LT.first). This is the only arithmetic vector reduction operation for
3031 // which we have an instruction.
3032 // OR, XOR and AND costs should match the codegen from:
3033 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3034 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3035 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3036 static const CostTblEntry CostTblNoPairwise[]{
3037 {ISD::ADD, MVT::v8i8, 2},
3038 {ISD::ADD, MVT::v16i8, 2},
3039 {ISD::ADD, MVT::v4i16, 2},
3040 {ISD::ADD, MVT::v8i16, 2},
3041 {ISD::ADD, MVT::v4i32, 2},
3042 {ISD::ADD, MVT::v2i64, 2},
3043 {ISD::OR, MVT::v8i8, 15},
3044 {ISD::OR, MVT::v16i8, 17},
3045 {ISD::OR, MVT::v4i16, 7},
3046 {ISD::OR, MVT::v8i16, 9},
3047 {ISD::OR, MVT::v2i32, 3},
3048 {ISD::OR, MVT::v4i32, 5},
3049 {ISD::OR, MVT::v2i64, 3},
3050 {ISD::XOR, MVT::v8i8, 15},
3051 {ISD::XOR, MVT::v16i8, 17},
3052 {ISD::XOR, MVT::v4i16, 7},
3053 {ISD::XOR, MVT::v8i16, 9},
3054 {ISD::XOR, MVT::v2i32, 3},
3055 {ISD::XOR, MVT::v4i32, 5},
3056 {ISD::XOR, MVT::v2i64, 3},
3057 {ISD::AND, MVT::v8i8, 15},
3058 {ISD::AND, MVT::v16i8, 17},
3059 {ISD::AND, MVT::v4i16, 7},
3060 {ISD::AND, MVT::v8i16, 9},
3061 {ISD::AND, MVT::v2i32, 3},
3062 {ISD::AND, MVT::v4i32, 5},
3063 {ISD::AND, MVT::v2i64, 3},
3064 };
3065 switch (ISD) {
3066 default:
3067 break;
3068 case ISD::ADD:
3069 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3070 return (LT.first - 1) + Entry->Cost;
3071 break;
3072 case ISD::XOR:
3073 case ISD::AND:
3074 case ISD::OR:
3075 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3076 if (!Entry)
3077 break;
3078 auto *ValVTy = cast<FixedVectorType>(ValTy);
3079 if (!ValVTy->getElementType()->isIntegerTy(1) &&
3080 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3081 isPowerOf2_32(ValVTy->getNumElements())) {
3082 InstructionCost ExtraCost = 0;
3083 if (LT.first != 1) {
3084 // Type needs to be split, so there is an extra cost of LT.first - 1
3085 // arithmetic ops.
3086 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3087 MTy.getVectorNumElements());
3088 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3089 ExtraCost *= LT.first - 1;
3090 }
3091 return Entry->Cost + ExtraCost;
3092 }
3093 break;
3094 }
3095 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3096}
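
Editor's note (hedged worked example): for a fixed-width add reduction the fast path above combines the table cost with the legalization count. Assuming a <8 x i32> input legalizes on NEON into two v4i32 halves (so LT.first == 2, an assumption about type legalization), the {ISD::ADD, v4i32} entry costs 2 and the returned cost is (LT.first - 1) + Entry->Cost = (2 - 1) + 2 = 3: one add to combine the halves plus the modeled addv.
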
3097
3098InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3099 static const CostTblEntry ShuffleTbl[] = {
3100 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3101 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3102 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3103 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3104 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3105 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3106 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3107 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3108 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3109 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3110 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3111 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3112 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3113 };
3114
3115 // The code-generator is currently not able to handle scalable vectors
3116 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3117 // it. This change will be removed when code-generation for these types is
3118 // sufficiently reliable.
3119 if (Tp->getElementCount() == ElementCount::getScalable(1))
3120 return InstructionCost::getInvalid();
3121
3122 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3123 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3124 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3125 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3126 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3127 : LT.second;
3128 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3129 InstructionCost LegalizationCost = 0;
3130 if (Index < 0) {
3131 LegalizationCost =
3132 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3133 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3134 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3135 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3136 }
3137
3138 // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
3139 // The cost is computed on the promoted type.
3140 if (LT.second.getScalarType() == MVT::i1) {
3141 LegalizationCost +=
3142 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3143 TTI::CastContextHint::None, CostKind) +
3144 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3145 TTI::CastContextHint::None, CostKind);
3146 }
3147 const auto *Entry =
3148 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3149 assert(Entry && "Illegal Type for Splice");
3150 LegalizationCost += Entry->Cost;
3151 return LegalizationCost * LT.first;
3152}
3153
3154InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3155 VectorType *Tp,
3156 ArrayRef<int> Mask,
3157 TTI::TargetCostKind CostKind,
3158 int Index, VectorType *SubTp,
3159 ArrayRef<const Value *> Args) {
3160 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3161 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3162 // into smaller vectors and sum the cost of each shuffle.
3163 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3164 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3165 cast<FixedVectorType>(Tp)->getNumElements() >
3166 LT.second.getVectorNumElements() &&
3167 !Index && !SubTp) {
3168 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3169 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3170 unsigned LTNumElts = LT.second.getVectorNumElements();
3171 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3172 VectorType *NTp =
3173 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3174 InstructionCost Cost;
3175 for (unsigned N = 0; N < NumVecs; N++) {
3176 SmallVector<int> NMask;
3177 // Split the existing mask into chunks of size LTNumElts. Track the source
3178 // sub-vectors to ensure the result has at most 2 inputs.
3179 unsigned Source1, Source2;
3180 unsigned NumSources = 0;
3181 for (unsigned E = 0; E < LTNumElts; E++) {
3182 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3183 : UndefMaskElem;
3184 if (MaskElt < 0) {
3185 NMask.push_back(UndefMaskElem);
3186 continue;
3187 }
3188
3189 // Calculate which source from the input this comes from and whether it
3190 // is new to us.
3191 unsigned Source = MaskElt / LTNumElts;
3192 if (NumSources == 0) {
3193 Source1 = Source;
3194 NumSources = 1;
3195 } else if (NumSources == 1 && Source != Source1) {
3196 Source2 = Source;
3197 NumSources = 2;
3198 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3199 NumSources++;
3200 }
3201
3202 // Add to the new mask. For the NumSources>2 case these are not correct,
3203 // but are only used for the modular lane number.
3204 if (Source == Source1)
3205 NMask.push_back(MaskElt % LTNumElts);
3206 else if (Source == Source2)
3207 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3208 else
3209 NMask.push_back(MaskElt % LTNumElts);
3210 }
3211 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3212 // getShuffleCost. If not then cost it using the worst case.
3213 if (NumSources <= 2)
3214 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3215 : TTI::SK_PermuteTwoSrc,
3216 NTp, NMask, CostKind, 0, nullptr, Args);
3217 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3218 return ME.value() % LTNumElts == ME.index();
3219 }))
3220 Cost += LTNumElts - 1;
3221 else
3222 Cost += LTNumElts;
3223 }
3224 return Cost;
3225 }
3226
3227 Kind = improveShuffleKindFromMask(Kind, Mask);
3228
3229 // Check for broadcast loads, which are supported by the LD1R instruction.
3230 // In terms of code-size, the shuffle vector is free when a load + dup get
3231 // folded into a LD1R. That's what we check and return here. For performance
3232 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3233 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3234 // that we model the load + dup sequence slightly higher because LD1R is a
3235 // high latency instruction.
3236 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3237 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3238 if (IsLoad && LT.second.isVector() &&
3239 isLegalBroadcastLoad(Tp->getElementType(),
3240 LT.second.getVectorElementCount()))
3241 return 0;
3242 }
3243
3244 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3245 // from the perfect shuffle tables.
3246 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3247 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3248 all_of(Mask, [](int E) { return E < 8; }))
3249 return getPerfectShuffleCost(Mask);
3250
3251 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3252 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3253 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3254 static const CostTblEntry ShuffleTbl[] = {
3255 // Broadcast shuffle kinds can be performed with 'dup'.
3256 {TTI::SK_Broadcast, MVT::v8i8, 1},
3257 {TTI::SK_Broadcast, MVT::v16i8, 1},
3258 {TTI::SK_Broadcast, MVT::v4i16, 1},
3259 {TTI::SK_Broadcast, MVT::v8i16, 1},
3260 {TTI::SK_Broadcast, MVT::v2i32, 1},
3261 {TTI::SK_Broadcast, MVT::v4i32, 1},
3262 {TTI::SK_Broadcast, MVT::v2i64, 1},
3263 {TTI::SK_Broadcast, MVT::v4f16, 1},
3264 {TTI::SK_Broadcast, MVT::v8f16, 1},
3265 {TTI::SK_Broadcast, MVT::v2f32, 1},
3266 {TTI::SK_Broadcast, MVT::v4f32, 1},
3267 {TTI::SK_Broadcast, MVT::v2f64, 1},
3268 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3269 // 'zip1/zip2' instructions.
3270 {TTI::SK_Transpose, MVT::v8i8, 1},
3271 {TTI::SK_Transpose, MVT::v16i8, 1},
3272 {TTI::SK_Transpose, MVT::v4i16, 1},
3273 {TTI::SK_Transpose, MVT::v8i16, 1},
3274 {TTI::SK_Transpose, MVT::v2i32, 1},
3275 {TTI::SK_Transpose, MVT::v4i32, 1},
3276 {TTI::SK_Transpose, MVT::v2i64, 1},
3277 {TTI::SK_Transpose, MVT::v4f16, 1},
3278 {TTI::SK_Transpose, MVT::v8f16, 1},
3279 {TTI::SK_Transpose, MVT::v2f32, 1},
3280 {TTI::SK_Transpose, MVT::v4f32, 1},
3281 {TTI::SK_Transpose, MVT::v2f64, 1},
3282 // Select shuffle kinds.
3283 // TODO: handle vXi8/vXi16.
3284 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3285 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3286 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3287 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3288 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3289 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3290 // PermuteSingleSrc shuffle kinds.
3291 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3292 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3293 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3294 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3295 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3296 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3297 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3298 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3299 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3300 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3301 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3302 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3303 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3304 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3305 // Reverse can be lowered with `rev`.
3306 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3307 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3308 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3309 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3310 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3311 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3312 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3313 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3314 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3315 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3316 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3317 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3318 // Splice can all be lowered as `ext`.
3319 {TTI::SK_Splice, MVT::v2i32, 1},
3320 {TTI::SK_Splice, MVT::v4i32, 1},
3321 {TTI::SK_Splice, MVT::v2i64, 1},
3322 {TTI::SK_Splice, MVT::v2f32, 1},
3323 {TTI::SK_Splice, MVT::v4f32, 1},
3324 {TTI::SK_Splice, MVT::v2f64, 1},
3325 {TTI::SK_Splice, MVT::v8f16, 1},
3326 {TTI::SK_Splice, MVT::v8bf16, 1},
3327 {TTI::SK_Splice, MVT::v8i16, 1},
3328 {TTI::SK_Splice, MVT::v16i8, 1},
3329 {TTI::SK_Splice, MVT::v4bf16, 1},
3330 {TTI::SK_Splice, MVT::v4f16, 1},
3331 {TTI::SK_Splice, MVT::v4i16, 1},
3332 {TTI::SK_Splice, MVT::v8i8, 1},
3333 // Broadcast shuffle kinds for scalable vectors
3334 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3335 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3336 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3337 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3338 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3339 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3340 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3341 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3342 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3343 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3344 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3345 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3346 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3347 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3348 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3349 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3350 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3351 // Handle the cases for vector.reverse with scalable vectors
3352 {TTI::SK_Reverse, MVT::nxv16i8, 1},
3353 {TTI::SK_Reverse, MVT::nxv8i16, 1},
3354 {TTI::SK_Reverse, MVT::nxv4i32, 1},
3355 {TTI::SK_Reverse, MVT::nxv2i64, 1},
3356 {TTI::SK_Reverse, MVT::nxv2f16, 1},
3357 {TTI::SK_Reverse, MVT::nxv4f16, 1},
3358 {TTI::SK_Reverse, MVT::nxv8f16, 1},
3359 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3360 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3361 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3362 {TTI::SK_Reverse, MVT::nxv2f32, 1},
3363 {TTI::SK_Reverse, MVT::nxv4f32, 1},
3364 {TTI::SK_Reverse, MVT::nxv2f64, 1},
3365 {TTI::SK_Reverse, MVT::nxv16i1, 1},
3366 {TTI::SK_Reverse, MVT::nxv8i1, 1},
3367 {TTI::SK_Reverse, MVT::nxv4i1, 1},
3368 {TTI::SK_Reverse, MVT::nxv2i1, 1},
3369 };
3370 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3371 return LT.first * Entry->Cost;
3372 }
3373
3374 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3375 return getSpliceCost(Tp, Index);
3376
3377 // Inserting a subvector can often be done with either a D, S or H register
3378 // move, so long as the inserted vector is "aligned".
3379 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3380 LT.second.getSizeInBits() <= 128 && SubTp) {
3381 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3382 if (SubLT.second.isVector()) {
3383 int NumElts = LT.second.getVectorNumElements();
3384 int NumSubElts = SubLT.second.getVectorNumElements();
3385 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3386 return SubLT.first;
3387 }
3388 }
3389
3390 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3391}
3392
3393static bool containsDecreasingPointers(Loop *TheLoop,
3394 PredicatedScalarEvolution *PSE) {
3395 const ValueToValueMap &Strides = ValueToValueMap();
3396 for (BasicBlock *BB : TheLoop->blocks()) {
3397 // Scan the instructions in the block and look for addresses that are
3398 // consecutive and decreasing.
3399 for (Instruction &I : *BB) {
3400 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
3401 Value *Ptr = getLoadStorePointerOperand(&I);
3402 Type *AccessTy = getLoadStoreType(&I);
3403 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
3404 /*ShouldCheckWrap=*/false)
3405 .value_or(0) < 0)
3406 return true;
3407 }
3408 }
3409 }
3410 return false;
3411}
3412
3413bool AArch64TTIImpl::preferPredicateOverEpilogue(
3414 Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
3415 TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
3416 InterleavedAccessInfo *IAI) {
3417 if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3418 return false;
3419
3420 // We don't currently support vectorisation with interleaving for SVE - with
3421 // such loops we're better off not using tail-folding. This gives us a chance
3422 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3423 if (IAI->hasGroups())
3424 return false;
3425
3426 TailFoldingKind Required; // Defaults to 0.
3427 if (LVL->getReductionVars().size())
3428 Required.add(TailFoldingKind::TFReductions);
3429 if (LVL->getFixedOrderRecurrences().size())
3430 Required.add(TailFoldingKind::TFRecurrences);
3431
3432 // We call this to discover whether any load/store pointers in the loop have
3433 // negative strides. This will require extra work to reverse the loop
3434 // predicate, which may be expensive.
3435 if (containsDecreasingPointers(L, LVL->getPredicatedScalarEvolution()))
3436 Required.add(TailFoldingKind::TFReverse);
3437 if (!Required)
3438 Required.add(TailFoldingKind::TFSimple);
3439
3440 return (TailFoldingKindLoc & Required) == Required;
3441}
3442
3443InstructionCost
3444AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
3445 int64_t BaseOffset, bool HasBaseReg,
3446 int64_t Scale, unsigned AddrSpace) const {
3447 // Scaling factors are not free at all.
3448 // Operands | Rt Latency
3449 // -------------------------------------------
3450 // Rt, [Xn, Xm] | 4
3451 // -------------------------------------------
3452 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3453 // Rt, [Xn, Wm, <extend> #imm] |
3454 TargetLoweringBase::AddrMode AM;
3455 AM.BaseGV = BaseGV;
3456 AM.BaseOffs = BaseOffset;
3457 AM.HasBaseReg = HasBaseReg;
3458 AM.Scale = Scale;
3459 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3460 // Scale represents reg2 * scale, so charge a cost of 1 if
3461 // it is not equal to 0 or 1.
3462 return AM.Scale != 0 && AM.Scale != 1;
3463 return -1;
3464}

/build/source/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/ADT/bit.h"
17#include "llvm/Support/Compiler.h"
18#include <cassert>
19#include <climits>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25namespace llvm {
26
27/// Mathematical constants.
28namespace numbers {
29// TODO: Track C++20 std::numbers.
30// TODO: Favor using the hexadecimal FP constants (requires C++17).
31constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
32 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
33 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
34 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
35 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
36 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
37 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
38 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
39 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
40 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
41 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
42 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
43 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
44 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
45 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
46constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
47 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
48 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
49 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
50 log2ef = 1.44269504F, // (0x1.715476P+0)
51 log10ef = .434294482F, // (0x1.bcb7b2P-2)
52 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
53 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
54 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
55 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
56 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
57 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
58 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
59 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
60 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
61} // namespace numbers
62
63/// Count number of 0's from the least significant bit to the most
64/// stopping at the first 1.
65///
66/// Only unsigned integral types are allowed.
67///
68/// Returns std::numeric_limits<T>::digits on an input of 0.
69template <typename T>
70LLVM_DEPRECATED("Use llvm::countr_zero instead.", "llvm::countr_zero")__attribute__((deprecated("Use llvm::countr_zero instead.", "llvm::countr_zero"
)))
71unsigned countTrailingZeros(T Val) {
72 static_assert(std::is_unsigned_v<T>,
73 "Only unsigned integral types are allowed.");
74 return llvm::countr_zero(Val);
75}
76
77/// Count number of 0's from the most significant bit to the least
78/// stopping at the first 1.
79///
80/// Only unsigned integral types are allowed.
81///
82/// Returns std::numeric_limits<T>::digits on an input of 0.
83template <typename T>
84LLVM_DEPRECATED("Use llvm::countl_zero instead.", "llvm::countl_zero")__attribute__((deprecated("Use llvm::countl_zero instead.", "llvm::countl_zero"
)))
85unsigned countLeadingZeros(T Val) {
86 static_assert(std::is_unsigned_v<T>,
87 "Only unsigned integral types are allowed.");
88 return llvm::countl_zero(Val);
89}
90
91/// Create a bitmask with the N right-most bits set to 1, and all other
92/// bits set to 0. Only unsigned types are allowed.
93template <typename T> T maskTrailingOnes(unsigned N) {
94 static_assert(std::is_unsigned<T>::value, "Invalid type!");
95 const unsigned Bits = CHAR_BIT * sizeof(T);
96 assert(N <= Bits && "Invalid bit index");
97 return N == 0 ? 0 : (T(-1) >> (Bits - N));
98}
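
Editor's note: a small compile-time illustration of the helper above; the constant mirrors maskTrailingOnes<uint32_t>, whose N == 0 branch exists precisely to avoid the undefined full-width shift T(-1) >> 32.

#include <cstdint>
static_assert((uint32_t(-1) >> (32 - 8)) == 0x000000FFu,
              "maskTrailingOnes<uint32_t>(8) sets the low 8 bits");
// maskTrailingOnes<uint32_t>(0) returns 0 via the N == 0 branch rather than
// evaluating uint32_t(-1) >> 32, which would be undefined.
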
99
100/// Create a bitmask with the N left-most bits set to 1, and all other
101/// bits set to 0. Only unsigned types are allowed.
102template <typename T> T maskLeadingOnes(unsigned N) {
103 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
104}
105
106/// Create a bitmask with the N right-most bits set to 0, and all other
107/// bits set to 1. Only unsigned types are allowed.
108template <typename T> T maskTrailingZeros(unsigned N) {
109 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
110}
111
112/// Create a bitmask with the N left-most bits set to 0, and all other
113/// bits set to 1. Only unsigned types are allowed.
114template <typename T> T maskLeadingZeros(unsigned N) {
115 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
116}
117
118/// Macro compressed bit reversal table for 256 bits.
119///
120/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
121static const unsigned char BitReverseTable256[256] = {
122#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
123#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
124#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
125 R6(0), R6(2), R6(1), R6(3)
126#undef R2
127#undef R4
128#undef R6
129};
130
131/// Reverse the bits in \p Val.
132template <typename T> T reverseBits(T Val) {
133#if __has_builtin(__builtin_bitreverse8)
134 if constexpr (std::is_same_v<T, uint8_t>)
135 return __builtin_bitreverse8(Val);
136#endif
137#if __has_builtin(__builtin_bitreverse16)
138 if constexpr (std::is_same_v<T, uint16_t>)
139 return __builtin_bitreverse16(Val);
140#endif
141#if __has_builtin(__builtin_bitreverse32)
142 if constexpr (std::is_same_v<T, uint32_t>)
143 return __builtin_bitreverse32(Val);
144#endif
145#if __has_builtin(__builtin_bitreverse64)
146 if constexpr (std::is_same_v<T, uint64_t>)
147 return __builtin_bitreverse64(Val);
148#endif
149
150 unsigned char in[sizeof(Val)];
151 unsigned char out[sizeof(Val)];
152 std::memcpy(in, &Val, sizeof(Val));
153 for (unsigned i = 0; i < sizeof(Val); ++i)
154 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
155 std::memcpy(&Val, out, sizeof(Val));
156 return Val;
157}
158
159// NOTE: The following support functions use the _32/_64 extensions instead of
160// type overloading so that signed and unsigned integers can be used without
161// ambiguity.
162
163/// Return the high 32 bits of a 64 bit value.
164constexpr inline uint32_t Hi_32(uint64_t Value) {
165 return static_cast<uint32_t>(Value >> 32);
166}
167
168/// Return the low 32 bits of a 64 bit value.
169constexpr inline uint32_t Lo_32(uint64_t Value) {
170 return static_cast<uint32_t>(Value);
171}
172
173/// Make a 64-bit integer from a high / low pair of 32-bit integers.
174constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
175 return ((uint64_t)High << 32) | (uint64_t)Low;
176}
177
178/// Checks if an integer fits into the given bit width.
179template <unsigned N> constexpr inline bool isInt(int64_t x) {
180 if constexpr (N == 8)
181 return static_cast<int8_t>(x) == x;
182 if constexpr (N == 16)
183 return static_cast<int16_t>(x) == x;
184 if constexpr (N == 32)
185 return static_cast<int32_t>(x) == x;
186 if constexpr (N < 64)
187 return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
188 (void)x; // MSVC v19.25 warns that x is unused.
189 return true;
190}
191
192/// Checks if a signed integer is an N bit number shifted left by S.
193template <unsigned N, unsigned S>
194constexpr inline bool isShiftedInt(int64_t x) {
195 static_assert(
196 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
197 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
198 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
199}
200
201/// Checks if an unsigned integer fits into the given bit width.
202template <unsigned N> constexpr inline bool isUInt(uint64_t x) {
203 static_assert(N > 0, "isUInt<0> doesn't make sense");
204 if constexpr (N == 8)
205 return static_cast<uint8_t>(x) == x;
206 if constexpr (N == 16)
207 return static_cast<uint16_t>(x) == x;
208 if constexpr (N == 32)
209 return static_cast<uint32_t>(x) == x;
210 if constexpr (N < 64)
211 return x < (UINT64_C(1) << (N));
212 (void)x; // MSVC v19.25 warns that x is unused.
213 return true;
214}
215
216/// Checks if an unsigned integer is an N bit number shifted left by S.
217template <unsigned N, unsigned S>
218constexpr inline bool isShiftedUInt(uint64_t x) {
219 static_assert(
220 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
221 static_assert(N + S <= 64,
222 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
223 // Per the two static_asserts above, S must be strictly less than 64. So
224 // 1 << S is not undefined behavior.
225 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
226}
227
228/// Gets the maximum value for a N-bit unsigned integer.
229inline uint64_t maxUIntN(uint64_t N) {
230 assert(N > 0 && N <= 64 && "integer width out of range");
231
232 // uint64_t(1) << 64 is undefined behavior, so we can't do
233 // (uint64_t(1) << N) - 1
234 // without checking first that N != 64. But this works and doesn't have a
235 // branch.
236 return UINT64_MAX >> (64 - N);
237}
238
239/// Gets the minimum value for a N-bit signed integer.
240inline int64_t minIntN(int64_t N) {
241 assert(N > 0 && N <= 64 && "integer width out of range");
242
243 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
244}
245
246/// Gets the maximum value for a N-bit signed integer.
247inline int64_t maxIntN(int64_t N) {
248 assert(N > 0 && N <= 64 && "integer width out of range");
249
250 // This relies on two's complement wraparound when N == 64, so we convert to
251 // int64_t only at the very end to avoid UB.
252 return (UINT64_C(1) << (N - 1)) - 1;
253}
254
255/// Checks if an unsigned integer fits into the given (dynamic) bit width.
256inline bool isUIntN(unsigned N, uint64_t x) {
257 return N >= 64 || x <= maxUIntN(N);
258}
259
260/// Checks if a signed integer fits into the given (dynamic) bit width.
261inline bool isIntN(unsigned N, int64_t x) {
262 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
263}
264
265/// Return true if the argument is a non-empty sequence of ones starting at the
266/// least significant bit with the remainder zero (32 bit version).
267/// Ex. isMask_32(0x0000FFFFU) == true.
268constexpr inline bool isMask_32(uint32_t Value) {
269 return Value && ((Value + 1) & Value) == 0;
270}
271
272/// Return true if the argument is a non-empty sequence of ones starting at the
273/// least significant bit with the remainder zero (64 bit version).
274constexpr inline bool isMask_64(uint64_t Value) {
275 return Value && ((Value + 1) & Value) == 0;
276}
277
278/// Return true if the argument contains a non-empty sequence of ones with the
279/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
280constexpr inline bool isShiftedMask_32(uint32_t Value) {
281 return Value && isMask_32((Value - 1) | Value);
282}
283
284/// Return true if the argument contains a non-empty sequence of ones with the
285/// remainder zero (64 bit version.)
286constexpr inline bool isShiftedMask_64(uint64_t Value) {
287 return Value && isMask_64((Value - 1) | Value);
288}
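
Editor's note: a compact compile-time illustration of the isMask_32 / isShiftedMask_32 predicates above; the mirrors below are self-contained copies used only so the expected values can be checked here.

#include <cstdint>
constexpr bool isMask32Sketch(uint32_t V) { return V && ((V + 1) & V) == 0; }
constexpr bool isShiftedMask32Sketch(uint32_t V) {
  return V && isMask32Sketch((V - 1) | V);
}
static_assert(isMask32Sketch(0x0000FFFFu) && !isMask32Sketch(0x0000FF00u), "");
static_assert(isShiftedMask32Sketch(0x0000FF00u) && !isShiftedMask32Sketch(0u), "");
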
289
290/// Return true if the argument is a power of two > 0.
291/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
292constexpr inline bool isPowerOf2_32(uint32_t Value) {
293 return llvm::has_single_bit(Value);
294}
295
296/// Return true if the argument is a power of two > 0 (64 bit edition.)
297constexpr inline bool isPowerOf2_64(uint64_t Value) {
298 return llvm::has_single_bit(Value);
299}
300
301/// Count the number of ones from the most significant bit to the first
302/// zero bit.
303///
304/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
305/// Only unsigned integral types are allowed.
306///
307/// Returns std::numeric_limits<T>::digits on an input of all ones.
308template <typename T>
309LLVM_DEPRECATED("Use llvm::countl_one instead.", "llvm::countl_one")__attribute__((deprecated("Use llvm::countl_one instead.", "llvm::countl_one"
)))
310unsigned countLeadingOnes(T Value) {
311 static_assert(std::is_unsigned_v<T>,
312 "Only unsigned integral types are allowed.");
313 return llvm::countl_one<T>(Value);
314}
315
316/// Count the number of ones from the least significant bit to the first
317/// zero bit.
318///
319/// Ex. countTrailingOnes(0x00FF00FF) == 8.
320/// Only unsigned integral types are allowed.
321///
322/// Returns std::numeric_limits<T>::digits on an input of all ones.
323template <typename T>
324LLVM_DEPRECATED("Use llvm::countr_one instead.", "llvm::countr_one")__attribute__((deprecated("Use llvm::countr_one instead.", "llvm::countr_one"
)))
325unsigned countTrailingOnes(T Value) {
326 static_assert(std::is_unsigned_v<T>,
327 "Only unsigned integral types are allowed.");
328 return llvm::countr_one<T>(Value);
329}
330
331/// Count the number of set bits in a value.
332/// Ex. countPopulation(0xF000F000) = 8
333/// Returns 0 if the word is zero.
334template <typename T>
335LLVM_DEPRECATED("Use llvm::popcount instead.", "llvm::popcount")__attribute__((deprecated("Use llvm::popcount instead.", "llvm::popcount"
)))
336inline unsigned countPopulation(T Value) {
337 static_assert(std::is_unsigned_v<T>,
338 "Only unsigned integral types are allowed.");
339 return (unsigned)llvm::popcount(Value);
340}
341
342/// Return true if the argument contains a non-empty sequence of ones with the
343/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
344/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
345/// MaskLen is updated to specify the length of the mask, else neither are
346/// updated.
347inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
348 unsigned &MaskLen) {
349 if (!isShiftedMask_32(Value))
350 return false;
351 MaskIdx = llvm::countr_zero(Value);
352 MaskLen = llvm::popcount(Value);
353 return true;
354}
355
356/// Return true if the argument contains a non-empty sequence of ones with the
357/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
358/// of the lowest set bit and \p MaskLen is updated to specify the length of the
359/// mask, else neither are updated.
360inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
361 unsigned &MaskLen) {
362 if (!isShiftedMask_64(Value))
363 return false;
364 MaskIdx = llvm::countr_zero(Value);
365 MaskLen = llvm::popcount(Value);
366 return true;
367}
368
369/// Compile time Log2.
370/// Valid only for positive powers of two.
371template <size_t kValue> constexpr inline size_t CTLog2() {
372 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
373 "Value is not a valid power of 2");
374 return 1 + CTLog2<kValue / 2>();
375}
376
377template <> constexpr inline size_t CTLog2<1>() { return 0; }
378
379/// Return the floor log base 2 of the specified value, -1 if the value is zero.
380/// (32 bit edition.)
381/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
382inline unsigned Log2_32(uint32_t Value) {
383 return 31 - llvm::countl_zero(Value);
12
Returning the value 4294967295
384}
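
Editor's note: this is the function on the warning path (step 12 above). For Value == 0, countl_zero returns 32 and 31 - 32 wraps to 4294967295 in the unsigned return type, which is exactly the shift amount reported at line 2741. A standalone illustration, assuming C++20 std::countl_zero as a stand-in for llvm::countl_zero:

#include <bit>
#include <cstdint>
unsigned log2_32_like(uint32_t Value) { return 31 - std::countl_zero(Value); }
// log2_32_like(0) == 4294967295u: the documented "-1" wraps in the unsigned
// return type, so using the result directly as a shift amount on an int
// (1 << log2_32_like(0)) is undefined behaviour.
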
385
386/// Return the floor log base 2 of the specified value, -1 if the value is zero.
387/// (64 bit edition.)
388inline unsigned Log2_64(uint64_t Value) {
389 return 63 - llvm::countl_zero(Value);
390}
391
392/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
393/// (32 bit edition).
394/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
395inline unsigned Log2_32_Ceil(uint32_t Value) {
396 return 32 - llvm::countl_zero(Value - 1);
397}
398
399/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
400/// (64 bit edition.)
401inline unsigned Log2_64_Ceil(uint64_t Value) {
402 return 64 - llvm::countl_zero(Value - 1);
403}
404
405/// This function takes a 64-bit integer and returns the bit equivalent double.
406LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<double>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<double>"
)))
407inline double BitsToDouble(uint64_t Bits) {
408 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
409 return llvm::bit_cast<double>(Bits);
410}
411
412/// This function takes a 32-bit integer and returns the bit equivalent float.
413LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<float>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<float>"
)))
414inline float BitsToFloat(uint32_t Bits) {
415 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
416 return llvm::bit_cast<float>(Bits);
417}
418
419/// This function takes a double and returns the bit equivalent 64-bit integer.
420/// Note that copying doubles around changes the bits of NaNs on some hosts,
421/// notably x86, so this routine cannot be used if these bits are needed.
422LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>"
)))
423inline uint64_t DoubleToBits(double Double) {
424 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
425 return llvm::bit_cast<uint64_t>(Double);
426}
427
428/// This function takes a float and returns the bit equivalent 32-bit integer.
429/// Note that copying floats around changes the bits of NaNs on some hosts,
430/// notably x86, so this routine cannot be used if these bits are needed.
431LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>"
)))
432inline uint32_t FloatToBits(float Float) {
433 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
434 return llvm::bit_cast<uint32_t>(Float);
435}
436
437/// A and B are either alignments or offsets. Return the minimum alignment that
438/// may be assumed after adding the two together.
439constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
440 // The largest power of 2 that divides both A and B.
441 //
442 // Replace "-Value" by "1+~Value" in the following commented code to avoid
443 // MSVC warning C4146
444 // return (A | B) & -(A | B);
445 return (A | B) & (1 + ~(A | B));
446}
447
448/// Returns the next power of two (in 64-bits) that is strictly greater than A.
449/// Returns zero on overflow.
450constexpr inline uint64_t NextPowerOf2(uint64_t A) {
451 A |= (A >> 1);
452 A |= (A >> 2);
453 A |= (A >> 4);
454 A |= (A >> 8);
455 A |= (A >> 16);
456 A |= (A >> 32);
457 return A + 1;
458}
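
Editor's note: NextPowerOf2 returns the power of two strictly greater than its input (so NextPowerOf2(8) == 16), while PowerOf2Ceil below rounds up and leaves exact powers of two unchanged. A self-contained copy of the bit-smearing trick, for illustration only:

#include <cstdint>
constexpr uint64_t nextPow2Sketch(uint64_t A) {
  A |= A >> 1;  A |= A >> 2;  A |= A >> 4;
  A |= A >> 8;  A |= A >> 16; A |= A >> 32;
  return A + 1; // every bit below the top set bit is now 1, so +1 carries up
}
static_assert(nextPow2Sketch(5) == 8 && nextPow2Sketch(8) == 16 &&
              nextPow2Sketch(0) == 1, "strictly greater than the input");
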
459
460/// Returns the power of two which is less than or equal to the given value.
461/// Essentially, it is a floor operation across the domain of powers of two.
462LLVM_DEPRECATED("use llvm::bit_floor instead", "llvm::bit_floor")__attribute__((deprecated("use llvm::bit_floor instead", "llvm::bit_floor"
)))
463inline uint64_t PowerOf2Floor(uint64_t A) {
464 return llvm::bit_floor(A);
465}
466
467/// Returns the power of two which is greater than or equal to the given value.
468/// Essentially, it is a ceil operation across the domain of powers of two.
469inline uint64_t PowerOf2Ceil(uint64_t A) {
470 if (!A)
471 return 0;
472 return NextPowerOf2(A - 1);
473}
474
475/// Returns the next integer (mod 2**64) that is greater than or equal to
476/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
477///
478/// Examples:
479/// \code
480/// alignTo(5, 8) = 8
481/// alignTo(17, 8) = 24
482/// alignTo(~0LL, 8) = 0
483/// alignTo(321, 255) = 510
484/// \endcode
485inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
486 assert(Align != 0u && "Align can't be 0.");
487 return (Value + Align - 1) / Align * Align;
488}
489
490inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) {
491 assert(Align != 0 && (Align & (Align - 1)) == 0 &&
492 "Align must be a power of 2");
493 return (Value + Align - 1) & -Align;
494}
495
496/// If non-zero \p Skew is specified, the return value will be a minimal integer
497/// that is greater than or equal to \p Value and equal to \p Align * N + \p Skew
498/// for some integer N. If \p Skew is larger than \p Align, its value is adjusted
499/// to '\p Skew mod \p Align'. \p Align must be non-zero.
500///
501/// Examples:
502/// \code
503/// alignTo(5, 8, 7) = 7
504/// alignTo(17, 8, 1) = 17
505/// alignTo(~0LL, 8, 3) = 3
506/// alignTo(321, 255, 42) = 552
507/// \endcode
508inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) {
509 assert(Align != 0u && "Align can't be 0.");
510 Skew %= Align;
511 return alignTo(Value - Skew, Align) + Skew;
512}
513
514/// Returns the next integer (mod 2**64) that is greater than or equal to
515/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
516template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
517 static_assert(Align != 0u, "Align must be non-zero");
518 return (Value + Align - 1) / Align * Align;
519}
520
521/// Returns the integer ceil(Numerator / Denominator).
522inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
523 return alignTo(Numerator, Denominator) / Denominator;
524}
525
526/// Returns the integer nearest(Numerator / Denominator).
527inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
528 return (Numerator + (Denominator / 2)) / Denominator;
529}
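For comparison, the two rounding helpers differ only in when a non-zero remainder rounds up; the sketch below writes the ceil form directly as (N + D - 1) / D, which matches alignTo(N, D) / D whenever N + D - 1 does not wrap:

#include <cstdint>

// divideCeil rounds every non-zero remainder up; divideNearest rounds up
// only when the remainder is at least half the denominator.
constexpr uint64_t ceilSketch(uint64_t N, uint64_t D) { return (N + D - 1) / D; }
constexpr uint64_t nearestSketch(uint64_t N, uint64_t D) { return (N + D / 2) / D; }

static_assert(ceilSketch(7, 3) == 3 && nearestSketch(7, 3) == 2, "7/3: ceil 3, nearest 2");
static_assert(ceilSketch(8, 3) == 3 && nearestSketch(8, 3) == 3, "8/3: both give 3");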
530
531/// Returns the largest uint64_t less than or equal to \p Value that is
532/// \p Skew mod \p Align. \p Align must be non-zero.
533inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
534 assert(Align != 0u && "Align can't be 0.");
535 Skew %= Align;
536 return (Value - Skew) / Align * Align + Skew;
537}
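A short usage sketch of alignDown, again assuming the llvm/Support/MathExtras.h include path; with a skew the result is the largest value not above 17 that is congruent to 3 mod 8:

#include <cassert>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  // Without a skew, alignDown is a plain round-down to a multiple of Align.
  assert(llvm::alignDown(17, 8) == 16);
  // With Skew = 3: (17 - 3) / 8 * 8 + 3 = 11, the largest value <= 17 that is 3 mod 8.
  assert(llvm::alignDown(17, 8, 3) == 11);
  return 0;
}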
538
539/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
540/// Requires 0 < B <= 32.
541template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
542 static_assert(B > 0, "Bit width can't be 0.");
543 static_assert(B <= 32, "Bit width out of range.");
544 return int32_t(X << (32 - B)) >> (32 - B);
545}
546
547/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
548/// Requires 0 < B <= 32.
549inline int32_t SignExtend32(uint32_t X, unsigned B) {
550 assert(B > 0 && "Bit width can't be 0.");
551 assert(B <= 32 && "Bit width out of range.");
552 return int32_t(X << (32 - B)) >> (32 - B);
553}
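Both SignExtend32 variants use the same trick: the left shift moves bit B-1 into the sign position, and the arithmetic right shift replicates it across the upper 32 - B bits. A standalone sketch with illustrative values:

#include <cassert>
#include <cstdint>

// The cast to int32_t makes the second shift arithmetic, so bit B-1 of X is
// copied into every bit above it.
static int32_t signExtend32Sketch(uint32_t X, unsigned B) {
  return int32_t(X << (32 - B)) >> (32 - B);
}

int main() {
  assert(signExtend32Sketch(0x0F, 4) == -1);  // 0b1111 as a 4-bit value is -1
  assert(signExtend32Sketch(0x07, 4) == 7);   // 0b0111 stays 7: bit 3 is clear
  assert(signExtend32Sketch(0x1FF, 9) == -1); // all-ones in 9 bits is -1
  return 0;
}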
554
555/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
556/// Requires 0 < B <= 64.
557template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
558 static_assert(B > 0, "Bit width can't be 0.");
559 static_assert(B <= 64, "Bit width out of range.");
560 return int64_t(x << (64 - B)) >> (64 - B);
561}
562
563/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
564/// Requires 0 < B <= 64.
565inline int64_t SignExtend64(uint64_t X, unsigned B) {
566 assert(B > 0 && "Bit width can't be 0.");
567 assert(B <= 64 && "Bit width out of range.");
568 return int64_t(X << (64 - B)) >> (64 - B);
569}
570
571/// Subtract two unsigned integers, X and Y, of type T and return the absolute
572/// value of the result.
573template <typename T>
574std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
575 return X > Y ? (X - Y) : (Y - X);
576}
577
578/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
579/// maximum representable value of T on overflow. ResultOverflowed indicates if
580/// the result is larger than the maximum representable value of type T.
581template <typename T>
582std::enable_if_t<std::is_unsigned<T>::value, T>
583SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
584 bool Dummy;
585 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
586 // Hacker's Delight, p. 29
587 T Z = X + Y;
588 Overflowed = (Z < X || Z < Y);
589 if (Overflowed)
590 return std::numeric_limits<T>::max();
591 else
592 return Z;
593}
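The `Z < X || Z < Y` test works because unsigned addition wraps modulo 2^N: the sum overflowed exactly when it wrapped below an operand. A usage sketch, assuming the llvm/Support/MathExtras.h include path (uint8_t is used only to make the clamp easy to see):

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  bool Overflowed = false;
  // 200 + 100 wraps to 44 in uint8_t, which is below both operands, so the
  // result is clamped to 255 and the flag is set.
  uint8_t R = llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed);
  assert(R == 255 && Overflowed);
  // 100 + 100 fits, so the exact sum comes back and the flag is cleared.
  R = llvm::SaturatingAdd<uint8_t>(100, 100, &Overflowed);
  assert(R == 200 && !Overflowed);
  return 0;
}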
594
595/// Add multiple unsigned integers of type T. Clamp the result to the
596/// maximum representable value of T on overflow.
597template <class T, class... Ts>
598std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(T X, T Y, T Z,
599 Ts... Args) {
600 bool Overflowed = false;
601 T XY = SaturatingAdd(X, Y, &Overflowed);
602 if (Overflowed)
603 return SaturatingAdd(std::numeric_limits<T>::max(), T(1), Args...);
604 return SaturatingAdd(XY, Z, Args...);
605}
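The variadic overload folds left, and once a partial sum saturates it keeps re-adding max() + 1 so the running total stays pinned at the maximum for the remaining addends. A small usage sketch under the same include-path assumption:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  // 100 + 100 fits in uint8_t, but folding in the third 100 saturates.
  assert(llvm::SaturatingAdd<uint8_t>(100, 100, 100) == 255);
  // A sum that never overflows is returned exactly.
  assert(llvm::SaturatingAdd<uint8_t>(10, 20, 30) == 60);
  return 0;
}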
606
607/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
608/// maximum representable value of T on overflow. ResultOverflowed indicates if
609/// the result is larger than the maximum representable value of type T.
610template <typename T>
611std::enable_if_t<std::is_unsigned<T>::value, T>
612SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
613 bool Dummy;
614 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
615
616 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
617 // because it fails for uint16_t (where multiplication can have undefined
618 // behavior due to promotion to int), and requires a division in addition
619 // to the multiplication.
620
621 Overflowed = false;
622
623 // Log2(Z) would be either Log2Z or Log2Z + 1.
624 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
625 // will necessarily be less than Log2Max as desired.
626 int Log2Z = Log2_64(X) + Log2_64(Y);
627 const T Max = std::numeric_limits<T>::max();
628 int Log2Max = Log2_64(Max);
629 if (Log2Z < Log2Max) {
630 return X * Y;
631 }
632 if (Log2Z > Log2Max) {
633 Overflowed = true;
634 return Max;
635 }
636
637 // We're going to use the top bit, and maybe overflow one
638 // bit past it. Multiply all but the bottom bit then add
639 // that on at the end.
640 T Z = (X >> 1) * Y;
641 if (Z & ~(Max >> 1)) {
642 Overflowed = true;
643 return Max;
644 }
645 Z <<= 1;
646 if (X & 1)
647 return SaturatingAdd(Z, Y, ResultOverflowed);
648
649 return Z;
650}
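The Log2 screen above decides every case except the borderline one where Log2(X) + Log2(Y) equals Log2(Max); only then does the halve-multiply-and-fix-up path run. A usage sketch under the same include-path assumption:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  bool Overflowed = false;
  // Log2(16) + Log2(16) = 8 > Log2(255) = 7: the coarse check alone clamps to 255.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed);
  // Log2(12) + Log2(21) = 3 + 4 = 7 == Log2(255): the borderline path runs,
  // computes (12 >> 1) * 21 = 126, doubles it to 252, and 12 * 21 = 252 fits.
  assert(llvm::SaturatingMultiply<uint8_t>(12, 21, &Overflowed) == 252 && !Overflowed);
  return 0;
}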
651
652/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
653/// the product. Clamp the result to the maximum representable value of T on
654/// overflow. ResultOverflowed indicates if the result is larger than the
655/// maximum representable value of type T.
656template <typename T>
657std::enable_if_t<std::is_unsigned<T>::value, T>
658SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
659 bool Dummy;
660 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
661
662 T Product = SaturatingMultiply(X, Y, &Overflowed);
663 if (Overflowed)
664 return Product;
665
666 return SaturatingAdd(A, Product, &Overflowed);
667}
668
669/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
670extern const float huge_valf;
671
672
673/// Add two signed integers, computing the two's complement truncated result,
674/// returning true if overflow occurred.
675template <typename T>
676std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
677#if __has_builtin(__builtin_add_overflow)
678 return __builtin_add_overflow(X, Y, &Result);
679#else
680 // Perform the unsigned addition.
681 using U = std::make_unsigned_t<T>;
682 const U UX = static_cast<U>(X);
683 const U UY = static_cast<U>(Y);
684 const U UResult = UX + UY;
685
686 // Convert to signed.
687 Result = static_cast<T>(UResult);
688
689 // Adding two positive numbers should result in a positive number.
690 if (X > 0 && Y > 0)
691 return Result <= 0;
692 // Adding two negatives should result in a negative number.
693 if (X < 0 && Y < 0)
694 return Result >= 0;
695 return false;
696#endif
697}
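When the builtin is unavailable, the fallback adds in the unsigned domain, where wraparound is well defined, and then infers overflow from signs alone: two operands of the same sign whose result does not keep that sign must have overflowed. A usage sketch under the same include-path assumption; note the return value has type T, holding 1 on overflow and 0 otherwise:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  int32_t R;
  // INT32_MAX + 1 wraps to INT32_MIN: two positives with a non-positive
  // result, which AddOverflow reports as overflow while storing the wrapped value.
  assert(llvm::AddOverflow<int32_t>(INT32_MAX, 1, R) && R == INT32_MIN);
  // 2 + 3 does not overflow, so the call yields 0 and R holds the sum.
  assert(!llvm::AddOverflow<int32_t>(2, 3, R) && R == 5);
  return 0;
}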
698
699/// Subtract two signed integers, computing the two's complement truncated
700/// result, returning true if an overflow occurred.
701template <typename T>
702std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
703#if __has_builtin(__builtin_sub_overflow)
704 return __builtin_sub_overflow(X, Y, &Result);
705#else
706 // Perform the unsigned subtraction.
707 using U = std::make_unsigned_t<T>;
708 const U UX = static_cast<U>(X);
709 const U UY = static_cast<U>(Y);
710 const U UResult = UX - UY;
711
712 // Convert to signed.
713 Result = static_cast<T>(UResult);
714
715 // Subtracting a positive number from a negative results in a negative number.
716 if (X <= 0 && Y > 0)
717 return Result >= 0;
718 // Subtracting a negative number from a positive results in a positive number.
719 if (X >= 0 && Y < 0)
720 return Result <= 0;
721 return false;
722#endif
723}
724
725/// Multiply two signed integers, computing the two's complement truncated
726/// result, returning true if an overflow occurred.
727template <typename T>
728std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
729 // Perform the unsigned multiplication on absolute values.
730 using U = std::make_unsigned_t<T>;
731 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
732 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
733 const U UResult = UX * UY;
734
735 // Convert to signed.
736 const bool IsNegative = (X < 0) ^ (Y < 0);
737 Result = IsNegative ? (0 - UResult) : UResult;
738
739 // If any of the args was 0, result is 0 and no overflow occurs.
740 if (UX == 0 || UY == 0)
741 return false;
742
743 // UX and UY are in [1, 2^n], where n is the number of digits.
744 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
745 // positive) divided by an argument compares to the other.
746 if (IsNegative)
747 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
748 else
749 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
750}
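The final comparison encodes the asymmetric signed range: a negative product may reach a magnitude of max() + 1, while a positive one is capped at max(). A usage sketch with int8_t under the same include-path assumption:

#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h" // assumed include path

int main() {
  int8_t R;
  // |(-16) * 8| = 128: int8_t spans [-128, 127], so the negative product fits
  // exactly while the positive one overflows.
  assert(!llvm::MulOverflow<int8_t>(-16, 8, R) && R == -128);
  assert(llvm::MulOverflow<int8_t>(16, 8, R));
  return 0;
}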
751
752} // End llvm namespace
753
754#endif