Bug Summary

File: build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 2825, column 21
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
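
For context: in C++, shifting a 32-bit int by an amount greater than or equal to 32 is undefined behaviour, and a shift count of 4294967295 is what results when a signed -1 is converted to an unsigned shift amount. The code at line 2825 is not part of the excerpt below, so the following sketch only illustrates the general pattern the analyzer warns about; the function and variable names are hypothetical, not taken from this file.

// Illustrative sketch only (hypothetical names); not the code at line 2825.
unsigned shiftByIndex(int Index) {
  // If Index is -1 (for example, a "not found" result), converting it to
  // unsigned yields 4294967295, and the shift below is undefined behaviour.
  unsigned ShiftAmount = static_cast<unsigned>(Index);
  return 1 << ShiftAmount;
}

// A guarded variant the analyzer would not flag:
unsigned shiftByIndexChecked(int Index) {
  if (Index < 0 || Index >= 32)
    return 0;
  return 1u << static_cast<unsigned>(Index);
}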

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/source/llvm/lib/Target/AArch64 -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
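
The -cc1 line above is the exact invocation the analyzer recorded. For reference, a roughly equivalent driver-level command is sketched below; it assumes an LLVM checkout at /build/source, omits the build-tree include directories and the remaining warning and macro flags, and may need adjusting before it reproduces the report.

clang --analyze -std=c++17 -O2 \
  -I /build/source/llvm/include \
  -I /build/source/llvm/lib/Target/AArch64 \
  /build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp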

/build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "MCTargetDesc/AArch64AddressingModes.h"
13#include "llvm/Analysis/IVDescriptors.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/Analysis/TargetTransformInfo.h"
16#include "llvm/CodeGen/BasicTTIImpl.h"
17#include "llvm/CodeGen/CostTable.h"
18#include "llvm/CodeGen/TargetLowering.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
22#include "llvm/IR/PatternMatch.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/Transforms/InstCombine/InstCombiner.h"
25#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42namespace {
43class TailFoldingKind {
44private:
45 uint8_t Bits = 0; // Currently defaults to disabled.
46
47public:
48 enum TailFoldingOpts {
49 TFDisabled = 0x0,
50 TFReductions = 0x01,
51 TFRecurrences = 0x02,
52 TFReverse = 0x04,
53 TFSimple = 0x80,
54 TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
55 };
56
57 void operator=(const std::string &Val) {
58 if (Val.empty())
59 return;
60 SmallVector<StringRef, 6> TailFoldTypes;
61 StringRef(Val).split(TailFoldTypes, '+', -1, false);
62 for (auto TailFoldType : TailFoldTypes) {
63 if (TailFoldType == "disabled")
64 Bits = 0;
65 else if (TailFoldType == "all")
66 Bits = TFAll;
67 else if (TailFoldType == "default")
68 Bits = 0; // Currently defaults to never tail-folding.
69 else if (TailFoldType == "simple")
70 add(TFSimple);
71 else if (TailFoldType == "reductions")
72 add(TFReductions);
73 else if (TailFoldType == "recurrences")
74 add(TFRecurrences);
75 else if (TailFoldType == "reverse")
76 add(TFReverse);
77 else if (TailFoldType == "noreductions")
78 remove(TFReductions);
79 else if (TailFoldType == "norecurrences")
80 remove(TFRecurrences);
81 else if (TailFoldType == "noreverse")
82 remove(TFReverse);
83 else {
84 errs()
85 << "invalid argument " << TailFoldType.str()
86 << " to -sve-tail-folding=; each element must be one of: disabled, "
87 "all, default, simple, reductions, noreductions, recurrences, "
88 "norecurrences\n";
89 }
90 }
91 }
92
93 operator uint8_t() const { return Bits; }
94
95 void add(uint8_t Flag) { Bits |= Flag; }
96 void remove(uint8_t Flag) { Bits &= ~Flag; }
97};
98} // namespace
99
100TailFoldingKind TailFoldingKindLoc;
101
102cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
103 "sve-tail-folding",
104 cl::desc(
105 "Control the use of vectorisation using tail-folding for SVE:"
106 "\ndisabled No loop types will vectorize using tail-folding"
107 "\ndefault Uses the default tail-folding settings for the target "
108 "CPU"
109 "\nall All legal loop types will vectorize using tail-folding"
110 "\nsimple Use tail-folding for simple loops (not reductions or "
111 "recurrences)"
112 "\nreductions Use tail-folding for loops containing reductions"
113 "\nrecurrences Use tail-folding for loops containing fixed order "
114 "recurrences"
115 "\nreverse Use tail-folding for loops requiring reversed "
116 "predicates"),
117 cl::location(TailFoldingKindLoc));
118
119// Experimental option that will only be fully functional when the
120// code-generator is changed to use SVE instead of NEON for all fixed-width
121// operations.
122static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
123 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
124
125// Experimental option that will only be fully functional when the cost-model
126// and code-generator have been changed to avoid using scalable vector
127// instructions that are not legal in streaming SVE mode.
128static cl::opt<bool> EnableScalableAutovecInStreamingMode(
129 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
130
131bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
132 const Function *Callee) const {
133 SMEAttrs CallerAttrs(*Caller);
134 SMEAttrs CalleeAttrs(*Callee);
135 if (CallerAttrs.requiresSMChange(CalleeAttrs,
136 /*BodyOverridesInterface=*/true) ||
137 CallerAttrs.requiresLazySave(CalleeAttrs) ||
138 CalleeAttrs.hasNewZAInterface())
139 return false;
140
141 const TargetMachine &TM = getTLI()->getTargetMachine();
142
143 const FeatureBitset &CallerBits =
144 TM.getSubtargetImpl(*Caller)->getFeatureBits();
145 const FeatureBitset &CalleeBits =
146 TM.getSubtargetImpl(*Callee)->getFeatureBits();
147
148 // Inline a callee if its target-features are a subset of the callers
149 // target-features.
150 return (CallerBits & CalleeBits) == CalleeBits;
151}
152
153bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
154 TargetTransformInfo::RegisterKind K) const {
155 assert(K != TargetTransformInfo::RGK_Scalar);
156 return K == TargetTransformInfo::RGK_FixedWidthVector;
157}
158
159/// Calculate the cost of materializing a 64-bit value. This helper
160/// method might only calculate a fraction of a larger immediate. Therefore it
161/// is valid to return a cost of ZERO.
162InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
163 // Check if the immediate can be encoded within an instruction.
164 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
165 return 0;
166
167 if (Val < 0)
168 Val = ~Val;
169
170 // Calculate how many moves we will need to materialize this constant.
171 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
172 AArch64_IMM::expandMOVImm(Val, 64, Insn);
173 return Insn.size();
174}
175
176/// Calculate the cost of materializing the given constant.
177InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
178 TTI::TargetCostKind CostKind) {
179 assert(Ty->isIntegerTy());
180
181 unsigned BitSize = Ty->getPrimitiveSizeInBits();
182 if (BitSize == 0)
183 return ~0U;
184
185 // Sign-extend all constants to a multiple of 64-bit.
186 APInt ImmVal = Imm;
187 if (BitSize & 0x3f)
188 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
189
190 // Split the constant into 64-bit chunks and calculate the cost for each
191 // chunk.
192 InstructionCost Cost = 0;
193 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
194 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
195 int64_t Val = Tmp.getSExtValue();
196 Cost += getIntImmCost(Val);
197 }
198 // We need at least one instruction to materialize the constant.
199 return std::max<InstructionCost>(1, Cost);
200}
201
202InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
203 const APInt &Imm, Type *Ty,
204 TTI::TargetCostKind CostKind,
205 Instruction *Inst) {
206 assert(Ty->isIntegerTy());
207
208 unsigned BitSize = Ty->getPrimitiveSizeInBits();
209 // There is no cost model for constants with a bit size of 0. Return TCC_Free
210 // here, so that constant hoisting will ignore this constant.
211 if (BitSize == 0)
212 return TTI::TCC_Free;
213
214 unsigned ImmIdx = ~0U;
215 switch (Opcode) {
216 default:
217 return TTI::TCC_Free;
218 case Instruction::GetElementPtr:
219 // Always hoist the base address of a GetElementPtr.
220 if (Idx == 0)
221 return 2 * TTI::TCC_Basic;
222 return TTI::TCC_Free;
223 case Instruction::Store:
224 ImmIdx = 0;
225 break;
226 case Instruction::Add:
227 case Instruction::Sub:
228 case Instruction::Mul:
229 case Instruction::UDiv:
230 case Instruction::SDiv:
231 case Instruction::URem:
232 case Instruction::SRem:
233 case Instruction::And:
234 case Instruction::Or:
235 case Instruction::Xor:
236 case Instruction::ICmp:
237 ImmIdx = 1;
238 break;
239 // Always return TCC_Free for the shift value of a shift instruction.
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 if (Idx == 1)
244 return TTI::TCC_Free;
245 break;
246 case Instruction::Trunc:
247 case Instruction::ZExt:
248 case Instruction::SExt:
249 case Instruction::IntToPtr:
250 case Instruction::PtrToInt:
251 case Instruction::BitCast:
252 case Instruction::PHI:
253 case Instruction::Call:
254 case Instruction::Select:
255 case Instruction::Ret:
256 case Instruction::Load:
257 break;
258 }
259
260 if (Idx == ImmIdx) {
261 int NumConstants = (BitSize + 63) / 64;
262 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
263 return (Cost <= NumConstants * TTI::TCC_Basic)
264 ? static_cast<int>(TTI::TCC_Free)
265 : Cost;
266 }
267 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
268}
269
270InstructionCost
271AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
272 const APInt &Imm, Type *Ty,
273 TTI::TargetCostKind CostKind) {
274 assert(Ty->isIntegerTy());
275
276 unsigned BitSize = Ty->getPrimitiveSizeInBits();
277 // There is no cost model for constants with a bit size of 0. Return TCC_Free
278 // here, so that constant hoisting will ignore this constant.
279 if (BitSize == 0)
280 return TTI::TCC_Free;
281
282 // Most (all?) AArch64 intrinsics do not support folding immediates into the
283 // selected instruction, so we compute the materialization cost for the
284 // immediate directly.
285 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
286 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
287
288 switch (IID) {
289 default:
290 return TTI::TCC_Free;
291 case Intrinsic::sadd_with_overflow:
292 case Intrinsic::uadd_with_overflow:
293 case Intrinsic::ssub_with_overflow:
294 case Intrinsic::usub_with_overflow:
295 case Intrinsic::smul_with_overflow:
296 case Intrinsic::umul_with_overflow:
297 if (Idx == 1) {
298 int NumConstants = (BitSize + 63) / 64;
299 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
300 return (Cost <= NumConstants * TTI::TCC_Basic)
301 ? static_cast<int>(TTI::TCC_Free)
302 : Cost;
303 }
304 break;
305 case Intrinsic::experimental_stackmap:
306 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
307 return TTI::TCC_Free;
308 break;
309 case Intrinsic::experimental_patchpoint_void:
310 case Intrinsic::experimental_patchpoint_i64:
311 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
312 return TTI::TCC_Free;
313 break;
314 case Intrinsic::experimental_gc_statepoint:
315 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
316 return TTI::TCC_Free;
317 break;
318 }
319 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
320}
321
322TargetTransformInfo::PopcntSupportKind
323AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
324 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
325 if (TyWidth == 32 || TyWidth == 64)
326 return TTI::PSK_FastHardware;
327 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
328 return TTI::PSK_Software;
329}
330
331InstructionCost
332AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
333 TTI::TargetCostKind CostKind) {
334 auto *RetTy = ICA.getReturnType();
335 switch (ICA.getID()) {
336 case Intrinsic::umin:
337 case Intrinsic::umax:
338 case Intrinsic::smin:
339 case Intrinsic::smax: {
340 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
341 MVT::v8i16, MVT::v2i32, MVT::v4i32};
342 auto LT = getTypeLegalizationCost(RetTy);
343 // v2i64 types get converted to cmp+bif hence the cost of 2
344 if (LT.second == MVT::v2i64)
345 return LT.first * 2;
346 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
347 return LT.first;
348 break;
349 }
350 case Intrinsic::sadd_sat:
351 case Intrinsic::ssub_sat:
352 case Intrinsic::uadd_sat:
353 case Intrinsic::usub_sat: {
354 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
355 MVT::v8i16, MVT::v2i32, MVT::v4i32,
356 MVT::v2i64};
357 auto LT = getTypeLegalizationCost(RetTy);
358 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
359 // need to extend the type, as it uses shr(qadd(shl, shl)).
360 unsigned Instrs =
361 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
362 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
363 return LT.first * Instrs;
364 break;
365 }
366 case Intrinsic::abs: {
367 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
368 MVT::v8i16, MVT::v2i32, MVT::v4i32,
369 MVT::v2i64};
370 auto LT = getTypeLegalizationCost(RetTy);
371 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
372 return LT.first;
373 break;
374 }
375 case Intrinsic::experimental_stepvector: {
376 InstructionCost Cost = 1; // Cost of the `index' instruction
377 auto LT = getTypeLegalizationCost(RetTy);
378 // Legalisation of illegal vectors involves an `index' instruction plus
379 // (LT.first - 1) vector adds.
380 if (LT.first > 1) {
381 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
382 InstructionCost AddCost =
383 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
384 Cost += AddCost * (LT.first - 1);
385 }
386 return Cost;
387 }
388 case Intrinsic::bitreverse: {
389 static const CostTblEntry BitreverseTbl[] = {
390 {Intrinsic::bitreverse, MVT::i32, 1},
391 {Intrinsic::bitreverse, MVT::i64, 1},
392 {Intrinsic::bitreverse, MVT::v8i8, 1},
393 {Intrinsic::bitreverse, MVT::v16i8, 1},
394 {Intrinsic::bitreverse, MVT::v4i16, 2},
395 {Intrinsic::bitreverse, MVT::v8i16, 2},
396 {Intrinsic::bitreverse, MVT::v2i32, 2},
397 {Intrinsic::bitreverse, MVT::v4i32, 2},
398 {Intrinsic::bitreverse, MVT::v1i64, 2},
399 {Intrinsic::bitreverse, MVT::v2i64, 2},
400 };
401 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
402 const auto *Entry =
403 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
404 if (Entry) {
405 // The cost model uses the legal type (i32) that i8 and i16 will be
406 // converted to, +1 so that we match the actual lowering cost.
407 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
408 TLI->getValueType(DL, RetTy, true) == MVT::i16)
409 return LegalisationCost.first * Entry->Cost + 1;
410
411 return LegalisationCost.first * Entry->Cost;
412 }
413 break;
414 }
415 case Intrinsic::ctpop: {
416 if (!ST->hasNEON()) {
417 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
418 return getTypeLegalizationCost(RetTy).first * 12;
419 }
420 static const CostTblEntry CtpopCostTbl[] = {
421 {ISD::CTPOP, MVT::v2i64, 4},
422 {ISD::CTPOP, MVT::v4i32, 3},
423 {ISD::CTPOP, MVT::v8i16, 2},
424 {ISD::CTPOP, MVT::v16i8, 1},
425 {ISD::CTPOP, MVT::i64, 4},
426 {ISD::CTPOP, MVT::v2i32, 3},
427 {ISD::CTPOP, MVT::v4i16, 2},
428 {ISD::CTPOP, MVT::v8i8, 1},
429 {ISD::CTPOP, MVT::i32, 5},
430 };
431 auto LT = getTypeLegalizationCost(RetTy);
432 MVT MTy = LT.second;
433 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
434 // Extra cost of +1 when illegal vector types are legalized by promoting
435 // the integer type.
436 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
437 RetTy->getScalarSizeInBits()
438 ? 1
439 : 0;
440 return LT.first * Entry->Cost + ExtraCost;
441 }
442 break;
443 }
444 case Intrinsic::sadd_with_overflow:
445 case Intrinsic::uadd_with_overflow:
446 case Intrinsic::ssub_with_overflow:
447 case Intrinsic::usub_with_overflow:
448 case Intrinsic::smul_with_overflow:
449 case Intrinsic::umul_with_overflow: {
450 static const CostTblEntry WithOverflowCostTbl[] = {
451 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
452 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
453 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
454 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
455 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
456 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
457 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
458 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
459 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
460 {Intrinsic::usub_with_overflow, MVT::i8, 3},
461 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
462 {Intrinsic::usub_with_overflow, MVT::i16, 3},
463 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
464 {Intrinsic::usub_with_overflow, MVT::i32, 1},
465 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
466 {Intrinsic::usub_with_overflow, MVT::i64, 1},
467 {Intrinsic::smul_with_overflow, MVT::i8, 5},
468 {Intrinsic::umul_with_overflow, MVT::i8, 4},
469 {Intrinsic::smul_with_overflow, MVT::i16, 5},
470 {Intrinsic::umul_with_overflow, MVT::i16, 4},
471 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
472 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
473 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
474 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
475 };
476 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
477 if (MTy.isSimple())
478 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
479 MTy.getSimpleVT()))
480 return Entry->Cost;
481 break;
482 }
483 case Intrinsic::fptosi_sat:
484 case Intrinsic::fptoui_sat: {
485 if (ICA.getArgTypes().empty())
486 break;
487 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
488 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
489 EVT MTy = TLI->getValueType(DL, RetTy);
490 // Check for the legal types, which are where the size of the input and the
491 // output are the same, or we are using cvt f64->i32 or f32->i64.
492 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
493 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
494 LT.second == MVT::v2f64) &&
495 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
496 (LT.second == MVT::f64 && MTy == MVT::i32) ||
497 (LT.second == MVT::f32 && MTy == MVT::i64)))
498 return LT.first;
499 // Similarly for fp16 sizes
500 if (ST->hasFullFP16() &&
501 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
502 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
503 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
504 return LT.first;
505
506 // Otherwise we use a legal convert followed by a min+max
507 if ((LT.second.getScalarType() == MVT::f32 ||
508 LT.second.getScalarType() == MVT::f64 ||
509 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
510 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
511 Type *LegalTy =
512 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
513 if (LT.second.isVector())
514 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
515 InstructionCost Cost = 1;
516 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
517 LegalTy, {LegalTy, LegalTy});
518 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
519 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
520 LegalTy, {LegalTy, LegalTy});
521 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
522 return LT.first * Cost;
523 }
524 break;
525 }
526 case Intrinsic::fshl:
527 case Intrinsic::fshr: {
528 if (ICA.getArgs().empty())
529 break;
530
531 // TODO: Add handling for fshl where third argument is not a constant.
532 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
533 if (!OpInfoZ.isConstant())
534 break;
535
536 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
537 if (OpInfoZ.isUniform()) {
538 // FIXME: The costs could be lower if the codegen is better.
539 static const CostTblEntry FshlTbl[] = {
540 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
541 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
542 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
543 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
544 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
545 // to avoid having to duplicate the costs.
546 const auto *Entry =
547 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
548 if (Entry)
549 return LegalisationCost.first * Entry->Cost;
550 }
551
552 auto TyL = getTypeLegalizationCost(RetTy);
553 if (!RetTy->isIntegerTy())
554 break;
555
556 // Estimate cost manually, as types like i8 and i16 will get promoted to
557 // i32 and CostTableLookup will ignore the extra conversion cost.
558 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
559 RetTy->getScalarSizeInBits() < 64) ||
560 (RetTy->getScalarSizeInBits() % 64 != 0);
561 unsigned ExtraCost = HigherCost ? 1 : 0;
562 if (RetTy->getScalarSizeInBits() == 32 ||
563 RetTy->getScalarSizeInBits() == 64)
564 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
565 // extr instruction.
566 else if (HigherCost)
567 ExtraCost = 1;
568 else
569 break;
570 return TyL.first + ExtraCost;
571 }
572 default:
573 break;
574 }
575 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
576}
577
578/// The function removes redundant reinterpret casts in the presence of
579/// control flow.
580static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
581 IntrinsicInst &II) {
582 SmallVector<Instruction *, 32> Worklist;
583 auto RequiredType = II.getType();
584
585 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
586 assert(PN && "Expected Phi Node!");
587
588 // Don't create a new Phi unless we can remove the old one.
589 if (!PN->hasOneUse())
590 return std::nullopt;
591
592 for (Value *IncValPhi : PN->incoming_values()) {
593 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
594 if (!Reinterpret ||
595 Reinterpret->getIntrinsicID() !=
596 Intrinsic::aarch64_sve_convert_to_svbool ||
597 RequiredType != Reinterpret->getArgOperand(0)->getType())
598 return std::nullopt;
599 }
600
601 // Create the new Phi
602 LLVMContext &Ctx = PN->getContext();
603 IRBuilder<> Builder(Ctx);
604 Builder.SetInsertPoint(PN);
605 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
606 Worklist.push_back(PN);
607
608 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
609 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
610 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
611 Worklist.push_back(Reinterpret);
612 }
613
614 // Cleanup Phi Node and reinterprets
615 return IC.replaceInstUsesWith(II, NPN);
616}
617
618// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
619// => (binop (pred) (from_svbool _) (from_svbool _))
620//
621// The above transformation eliminates a `to_svbool` in the predicate
622// operand of bitwise operation `binop` by narrowing the vector width of
623// the operation. For example, it would convert a `<vscale x 16 x i1>
624// and` into a `<vscale x 4 x i1> and`. This is profitable because
625// to_svbool must zero the new lanes during widening, whereas
626// from_svbool is free.
627static std::optional<Instruction *>
628tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
629 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
630 if (!BinOp)
631 return std::nullopt;
632
633 auto IntrinsicID = BinOp->getIntrinsicID();
634 switch (IntrinsicID) {
635 case Intrinsic::aarch64_sve_and_z:
636 case Intrinsic::aarch64_sve_bic_z:
637 case Intrinsic::aarch64_sve_eor_z:
638 case Intrinsic::aarch64_sve_nand_z:
639 case Intrinsic::aarch64_sve_nor_z:
640 case Intrinsic::aarch64_sve_orn_z:
641 case Intrinsic::aarch64_sve_orr_z:
642 break;
643 default:
644 return std::nullopt;
645 }
646
647 auto BinOpPred = BinOp->getOperand(0);
648 auto BinOpOp1 = BinOp->getOperand(1);
649 auto BinOpOp2 = BinOp->getOperand(2);
650
651 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
652 if (!PredIntr ||
653 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
654 return std::nullopt;
655
656 auto PredOp = PredIntr->getOperand(0);
657 auto PredOpTy = cast<VectorType>(PredOp->getType());
658 if (PredOpTy != II.getType())
659 return std::nullopt;
660
661 IRBuilder<> Builder(II.getContext());
662 Builder.SetInsertPoint(&II);
663
664 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
665 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
666 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
667 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
668 if (BinOpOp1 == BinOpOp2)
669 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
670 else
671 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
672 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
673
674 auto NarrowedBinOp =
675 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
676 return IC.replaceInstUsesWith(II, NarrowedBinOp);
677}
678
679static std::optional<Instruction *>
680instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
681 // If the reinterpret instruction operand is a PHI Node
682 if (isa<PHINode>(II.getArgOperand(0)))
683 return processPhiNode(IC, II);
684
685 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
686 return BinOpCombine;
687
688 SmallVector<Instruction *, 32> CandidatesForRemoval;
689 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
690
691 const auto *IVTy = cast<VectorType>(II.getType());
692
693 // Walk the chain of conversions.
694 while (Cursor) {
695 // If the type of the cursor has fewer lanes than the final result, zeroing
696 // must take place, which breaks the equivalence chain.
697 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
698 if (CursorVTy->getElementCount().getKnownMinValue() <
699 IVTy->getElementCount().getKnownMinValue())
700 break;
701
702 // If the cursor has the same type as I, it is a viable replacement.
703 if (Cursor->getType() == IVTy)
704 EarliestReplacement = Cursor;
705
706 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
707
708 // If this is not an SVE conversion intrinsic, this is the end of the chain.
709 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
710 Intrinsic::aarch64_sve_convert_to_svbool ||
711 IntrinsicCursor->getIntrinsicID() ==
712 Intrinsic::aarch64_sve_convert_from_svbool))
713 break;
714
715 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
716 Cursor = IntrinsicCursor->getOperand(0);
717 }
718
719 // If no viable replacement in the conversion chain was found, there is
720 // nothing to do.
721 if (!EarliestReplacement)
722 return std::nullopt;
723
724 return IC.replaceInstUsesWith(II, EarliestReplacement);
725}
726
727static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
728 IntrinsicInst &II) {
729 IRBuilder<> Builder(&II);
730 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
731 II.getOperand(2));
732 return IC.replaceInstUsesWith(II, Select);
733}
734
735static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
736 IntrinsicInst &II) {
737 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
738 if (!Pg)
739 return std::nullopt;
740
741 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
742 return std::nullopt;
743
744 const auto PTruePattern =
745 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
746 if (PTruePattern != AArch64SVEPredPattern::vl1)
747 return std::nullopt;
748
749 // The intrinsic is inserting into lane zero so use an insert instead.
750 auto *IdxTy = Type::getInt64Ty(II.getContext());
751 auto *Insert = InsertElementInst::Create(
752 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
753 Insert->insertBefore(&II);
754 Insert->takeName(&II);
755
756 return IC.replaceInstUsesWith(II, Insert);
757}
758
759static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
760 IntrinsicInst &II) {
761 // Replace DupX with a regular IR splat.
762 IRBuilder<> Builder(II.getContext());
763 Builder.SetInsertPoint(&II);
764 auto *RetTy = cast<ScalableVectorType>(II.getType());
765 Value *Splat =
766 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
767 Splat->takeName(&II);
768 return IC.replaceInstUsesWith(II, Splat);
769}
770
771static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
772 IntrinsicInst &II) {
773 LLVMContext &Ctx = II.getContext();
774 IRBuilder<> Builder(Ctx);
775 Builder.SetInsertPoint(&II);
776
777 // Check that the predicate is all active
778 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
779 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
780 return std::nullopt;
781
782 const auto PTruePattern =
783 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
784 if (PTruePattern != AArch64SVEPredPattern::all)
785 return std::nullopt;
786
787 // Check that we have a compare of zero..
788 auto *SplatValue =
789 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
790 if (!SplatValue || !SplatValue->isZero())
791 return std::nullopt;
792
793 // ..against a dupq
794 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
795 if (!DupQLane ||
796 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
797 return std::nullopt;
798
799 // Where the dupq is a lane 0 replicate of a vector insert
800 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
801 return std::nullopt;
802
803 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
804 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
805 return std::nullopt;
806
807 // Where the vector insert is a fixed constant vector insert into undef at
808 // index zero
809 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
810 return std::nullopt;
811
812 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
813 return std::nullopt;
814
815 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
816 if (!ConstVec)
817 return std::nullopt;
818
819 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
820 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
821 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
822 return std::nullopt;
823
824 unsigned NumElts = VecTy->getNumElements();
825 unsigned PredicateBits = 0;
826
827 // Expand intrinsic operands to a 16-bit byte level predicate
828 for (unsigned I = 0; I < NumElts; ++I) {
829 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
830 if (!Arg)
831 return std::nullopt;
832 if (!Arg->isZero())
833 PredicateBits |= 1 << (I * (16 / NumElts));
834 }
835
836 // If all bits are zero bail early with an empty predicate
837 if (PredicateBits == 0) {
838 auto *PFalse = Constant::getNullValue(II.getType());
839 PFalse->takeName(&II);
840 return IC.replaceInstUsesWith(II, PFalse);
841 }
842
843 // Calculate largest predicate type used (where byte predicate is largest)
844 unsigned Mask = 8;
845 for (unsigned I = 0; I < 16; ++I)
846 if ((PredicateBits & (1 << I)) != 0)
847 Mask |= (I % 8);
848
849 unsigned PredSize = Mask & -Mask;
850 auto *PredType = ScalableVectorType::get(
851 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
852
853 // Ensure all relevant bits are set
854 for (unsigned I = 0; I < 16; I += PredSize)
855 if ((PredicateBits & (1 << I)) == 0)
856 return std::nullopt;
857
858 auto *PTruePat =
859 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
860 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
861 {PredType}, {PTruePat});
862 auto *ConvertToSVBool = Builder.CreateIntrinsic(
863 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
864 auto *ConvertFromSVBool =
865 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
866 {II.getType()}, {ConvertToSVBool});
867
868 ConvertFromSVBool->takeName(&II);
869 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
870}
871
872static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
873 IntrinsicInst &II) {
874 IRBuilder<> Builder(II.getContext());
875 Builder.SetInsertPoint(&II);
876 Value *Pg = II.getArgOperand(0);
877 Value *Vec = II.getArgOperand(1);
878 auto IntrinsicID = II.getIntrinsicID();
879 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
880
881 // lastX(splat(X)) --> X
882 if (auto *SplatVal = getSplatValue(Vec))
883 return IC.replaceInstUsesWith(II, SplatVal);
884
885 // If x and/or y is a splat value then:
886 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
887 Value *LHS, *RHS;
888 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
889 if (isSplatValue(LHS) || isSplatValue(RHS)) {
890 auto *OldBinOp = cast<BinaryOperator>(Vec);
891 auto OpC = OldBinOp->getOpcode();
892 auto *NewLHS =
893 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
894 auto *NewRHS =
895 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
896 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
897 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
898 return IC.replaceInstUsesWith(II, NewBinOp);
899 }
900 }
901
902 auto *C = dyn_cast<Constant>(Pg);
903 if (IsAfter && C && C->isNullValue()) {
904 // The intrinsic is extracting lane 0 so use an extract instead.
905 auto *IdxTy = Type::getInt64Ty(II.getContext());
906 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
907 Extract->insertBefore(&II);
908 Extract->takeName(&II);
909 return IC.replaceInstUsesWith(II, Extract);
910 }
911
912 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
913 if (!IntrPG)
914 return std::nullopt;
915
916 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
917 return std::nullopt;
918
919 const auto PTruePattern =
920 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
921
922 // Can the intrinsic's predicate be converted to a known constant index?
923 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
924 if (!MinNumElts)
925 return std::nullopt;
926
927 unsigned Idx = MinNumElts - 1;
928 // Increment the index if extracting the element after the last active
929 // predicate element.
930 if (IsAfter)
931 ++Idx;
932
933 // Ignore extracts whose index is larger than the known minimum vector
934 // length. NOTE: This is an artificial constraint where we prefer to
935 // maintain what the user asked for until an alternative is proven faster.
936 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
937 if (Idx >= PgVTy->getMinNumElements())
938 return std::nullopt;
939
940 // The intrinsic is extracting a fixed lane so use an extract instead.
941 auto *IdxTy = Type::getInt64Ty(II.getContext());
942 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
943 Extract->insertBefore(&II);
944 Extract->takeName(&II);
945 return IC.replaceInstUsesWith(II, Extract);
946}
947
948static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
949 IntrinsicInst &II) {
950 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
951 // integer variant across a variety of micro-architectures. Replace scalar
952 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
953 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
954 // depending on the micro-architecture, but has been observed as generally
955 // being faster, particularly when the CLAST[AB] op is a loop-carried
956 // dependency.
957 IRBuilder<> Builder(II.getContext());
958 Builder.SetInsertPoint(&II);
959 Value *Pg = II.getArgOperand(0);
960 Value *Fallback = II.getArgOperand(1);
961 Value *Vec = II.getArgOperand(2);
962 Type *Ty = II.getType();
963
964 if (!Ty->isIntegerTy())
965 return std::nullopt;
966
967 Type *FPTy;
968 switch (cast<IntegerType>(Ty)->getBitWidth()) {
969 default:
970 return std::nullopt;
971 case 16:
972 FPTy = Builder.getHalfTy();
973 break;
974 case 32:
975 FPTy = Builder.getFloatTy();
976 break;
977 case 64:
978 FPTy = Builder.getDoubleTy();
979 break;
980 }
981
982 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
983 auto *FPVTy = VectorType::get(
984 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
985 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
986 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
987 {Pg, FPFallBack, FPVec});
988 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
989 return IC.replaceInstUsesWith(II, FPIItoInt);
990}
991
992static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
993 IntrinsicInst &II) {
994 LLVMContext &Ctx = II.getContext();
995 IRBuilder<> Builder(Ctx);
996 Builder.SetInsertPoint(&II);
997 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
998 // can work with RDFFR_PP for ptest elimination.
999 auto *AllPat =
1000 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1001 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1002 {II.getType()}, {AllPat});
1003 auto *RDFFR =
1004 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1005 RDFFR->takeName(&II);
1006 return IC.replaceInstUsesWith(II, RDFFR);
1007}
1008
1009static std::optional<Instruction *>
1010instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1011 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1012
1013 if (Pattern == AArch64SVEPredPattern::all) {
1014 LLVMContext &Ctx = II.getContext();
1015 IRBuilder<> Builder(Ctx);
1016 Builder.SetInsertPoint(&II);
1017
1018 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1019 auto *VScale = Builder.CreateVScale(StepVal);
1020 VScale->takeName(&II);
1021 return IC.replaceInstUsesWith(II, VScale);
1022 }
1023
1024 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1025
1026 return MinNumElts && NumElts >= MinNumElts
1027 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1028 II, ConstantInt::get(II.getType(), MinNumElts)))
1029 : std::nullopt;
1030}
1031
1032static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1033 IntrinsicInst &II) {
1034 Value *PgVal = II.getArgOperand(0);
1035 Value *OpVal = II.getArgOperand(1);
1036
1037 IRBuilder<> Builder(II.getContext());
1038 Builder.SetInsertPoint(&II);
1039
1040 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1041 // Later optimizations prefer this form.
1042 if (PgVal == OpVal &&
1043 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1044 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1045 Value *Ops[] = {PgVal, OpVal};
1046 Type *Tys[] = {PgVal->getType()};
1047
1048 auto *PTest =
1049 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1050 PTest->takeName(&II);
1051
1052 return IC.replaceInstUsesWith(II, PTest);
1053 }
1054
1055 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1056 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1057
1058 if (!Pg || !Op)
1059 return std::nullopt;
1060
1061 Intrinsic::ID OpIID = Op->getIntrinsicID();
1062
1063 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1064 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1065 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1066 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1067 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1068
1069 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1070
1071 PTest->takeName(&II);
1072 return IC.replaceInstUsesWith(II, PTest);
1073 }
1074
1075 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1076 // Later optimizations may rewrite sequence to use the flag-setting variant
1077 // of instruction X to remove PTEST.
1078 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1079 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1080 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1081 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1082 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1083 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1084 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1085 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1086 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1087 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1088 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1089 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1090 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1091 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1092 Type *Tys[] = {Pg->getType()};
1093
1094 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1095 PTest->takeName(&II);
1096
1097 return IC.replaceInstUsesWith(II, PTest);
1098 }
1099
1100 return std::nullopt;
1101}
1102
1103template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1104static std::optional<Instruction *>
1105instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1106 bool MergeIntoAddendOp) {
1107 Value *P = II.getOperand(0);
1108 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1109 if (MergeIntoAddendOp) {
1110 AddendOp = II.getOperand(1);
1111 Mul = II.getOperand(2);
1112 } else {
1113 AddendOp = II.getOperand(2);
1114 Mul = II.getOperand(1);
1115 }
1116
1117 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1118 m_Value(MulOp1))))
1119 return std::nullopt;
1120
1121 if (!Mul->hasOneUse())
1122 return std::nullopt;
1123
1124 Instruction *FMFSource = nullptr;
1125 if (II.getType()->isFPOrFPVectorTy()) {
1126 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1127 // Stop the combine when the flags on the inputs differ in case dropping
1128 // flags would lead to us missing out on more beneficial optimizations.
1129 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1130 return std::nullopt;
1131 if (!FAddFlags.allowContract())
1132 return std::nullopt;
1133 FMFSource = &II;
1134 }
1135
1136 IRBuilder<> Builder(II.getContext());
1137 Builder.SetInsertPoint(&II);
1138
1139 CallInst *Res;
1140 if (MergeIntoAddendOp)
1141 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1142 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1143 else
1144 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1145 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1146
1147 return IC.replaceInstUsesWith(II, Res);
1148}
1149
1150static bool isAllActivePredicate(Value *Pred) {
1151 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1152 Value *UncastedPred;
1153 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1154 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1155 m_Value(UncastedPred)))))
1156 // If the predicate has the same or less lanes than the uncasted
1157 // predicate then we know the casting has no effect.
1158 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1159 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1160 Pred = UncastedPred;
1161
1162 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1163 m_ConstantInt<AArch64SVEPredPattern::all>()));
1164}
1165
1166static std::optional<Instruction *>
1167instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1168 IRBuilder<> Builder(II.getContext());
1169 Builder.SetInsertPoint(&II);
1170
1171 Value *Pred = II.getOperand(0);
1172 Value *PtrOp = II.getOperand(1);
1173 Type *VecTy = II.getType();
1174 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1175
1176 if (isAllActivePredicate(Pred)) {
1177 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1178 Load->copyMetadata(II);
1179 return IC.replaceInstUsesWith(II, Load);
1180 }
1181
1182 CallInst *MaskedLoad =
1183 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1184 Pred, ConstantAggregateZero::get(VecTy));
1185 MaskedLoad->copyMetadata(II);
1186 return IC.replaceInstUsesWith(II, MaskedLoad);
1187}
1188
1189static std::optional<Instruction *>
1190instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1191 IRBuilder<> Builder(II.getContext());
1192 Builder.SetInsertPoint(&II);
1193
1194 Value *VecOp = II.getOperand(0);
1195 Value *Pred = II.getOperand(1);
1196 Value *PtrOp = II.getOperand(2);
1197 Value *VecPtr =
1198 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1199
1200 if (isAllActivePredicate(Pred)) {
1201 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1202 Store->copyMetadata(II);
1203 return IC.eraseInstFromFunction(II);
1204 }
1205
1206 CallInst *MaskedStore = Builder.CreateMaskedStore(
1207 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1208 MaskedStore->copyMetadata(II);
1209 return IC.eraseInstFromFunction(II);
1210}
1211
1212static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1213 switch (Intrinsic) {
1214 case Intrinsic::aarch64_sve_fmul:
1215 return Instruction::BinaryOps::FMul;
1216 case Intrinsic::aarch64_sve_fadd:
1217 return Instruction::BinaryOps::FAdd;
1218 case Intrinsic::aarch64_sve_fsub:
1219 return Instruction::BinaryOps::FSub;
1220 default:
1221 return Instruction::BinaryOpsEnd;
1222 }
1223}
1224
1225static std::optional<Instruction *>
1226instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1227 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1228 if (II.isStrictFP())
1229 return std::nullopt;
1230
1231 auto *OpPredicate = II.getOperand(0);
1232 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1233 if (BinOpCode == Instruction::BinaryOpsEnd ||
1234 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1235 m_ConstantInt<AArch64SVEPredPattern::all>())))
1236 return std::nullopt;
1237 IRBuilder<> Builder(II.getContext());
1238 Builder.SetInsertPoint(&II);
1239 Builder.setFastMathFlags(II.getFastMathFlags());
1240 auto BinOp =
1241 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1242 return IC.replaceInstUsesWith(II, BinOp);
1243}
1244
1245static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1246 IntrinsicInst &II) {
1247 if (auto FMLA =
1248 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1249 Intrinsic::aarch64_sve_fmla>(IC, II,
1250 true))
1251 return FMLA;
1252 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1253 Intrinsic::aarch64_sve_mla>(
1254 IC, II, true))
1255 return MLA;
1256 if (auto FMAD =
1257 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1258 Intrinsic::aarch64_sve_fmad>(IC, II,
1259 false))
1260 return FMAD;
1261 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1262 Intrinsic::aarch64_sve_mad>(
1263 IC, II, false))
1264 return MAD;
1265 return instCombineSVEVectorBinOp(IC, II);
1266}
1267
1268static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1269 IntrinsicInst &II) {
1270 if (auto FMLS =
1271 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1272 Intrinsic::aarch64_sve_fmls>(IC, II,
1273 true))
1274 return FMLS;
1275 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1276 Intrinsic::aarch64_sve_mls>(
1277 IC, II, true))
1278 return MLS;
1279 if (auto FMSB =
1280 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1281 Intrinsic::aarch64_sve_fnmsb>(
1282 IC, II, false))
1283 return FMSB;
1284 return instCombineSVEVectorBinOp(IC, II);
1285}
1286
1287static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1288 IntrinsicInst &II) {
1289 auto *OpPredicate = II.getOperand(0);
1290 auto *OpMultiplicand = II.getOperand(1);
1291 auto *OpMultiplier = II.getOperand(2);
1292
1293 IRBuilder<> Builder(II.getContext());
1294 Builder.SetInsertPoint(&II);
1295
1296 // Return true if a given instruction is a unit splat value, false otherwise.
1297 auto IsUnitSplat = [](auto *I) {
1298 auto *SplatValue = getSplatValue(I);
1299 if (!SplatValue)
1300 return false;
1301 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1302 };
1303
1304 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1305 // with a unit splat value, false otherwise.
1306 auto IsUnitDup = [](auto *I) {
1307 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1308 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1309 return false;
1310
1311 auto *SplatValue = IntrI->getOperand(2);
1312 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1313 };
1314
1315 if (IsUnitSplat(OpMultiplier)) {
1316 // [f]mul pg %n, (dupx 1) => %n
1317 OpMultiplicand->takeName(&II);
1318 return IC.replaceInstUsesWith(II, OpMultiplicand);
1319 } else if (IsUnitDup(OpMultiplier)) {
1320 // [f]mul pg %n, (dup pg 1) => %n
1321 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1322 auto *DupPg = DupInst->getOperand(1);
1323 // TODO: this is naive. The optimization is still valid if DupPg
1324 // 'encompasses' OpPredicate, not only if they're the same predicate.
1325 if (OpPredicate == DupPg) {
1326 OpMultiplicand->takeName(&II);
1327 return IC.replaceInstUsesWith(II, OpMultiplicand);
1328 }
1329 }
1330
1331 return instCombineSVEVectorBinOp(IC, II);
1332}
1333
1334static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1335 IntrinsicInst &II) {
1336 IRBuilder<> Builder(II.getContext());
1337 Builder.SetInsertPoint(&II);
1338 Value *UnpackArg = II.getArgOperand(0);
1339 auto *RetTy = cast<ScalableVectorType>(II.getType());
1340 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1341 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1342
1343 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1344 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1345 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1346 ScalarArg =
1347 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1348 Value *NewVal =
1349 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1350 NewVal->takeName(&II);
1351 return IC.replaceInstUsesWith(II, NewVal);
1352 }
1353
1354 return std::nullopt;
1355}
1356static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1357 IntrinsicInst &II) {
1358 auto *OpVal = II.getOperand(0);
1359 auto *OpIndices = II.getOperand(1);
1360 VectorType *VTy = cast<VectorType>(II.getType());
1361
1362 // Check whether OpIndices is a constant splat value < minimal element count
1363 // of result.
1364 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1365 if (!SplatValue ||
1366 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1367 return std::nullopt;
1368
1369 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1370 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1371 IRBuilder<> Builder(II.getContext());
1372 Builder.SetInsertPoint(&II);
1373 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1374 auto *VectorSplat =
1375 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1376
1377 VectorSplat->takeName(&II);
1378 return IC.replaceInstUsesWith(II, VectorSplat);
1379}
1380
1381static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1382 IntrinsicInst &II) {
1383 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1384 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1385 Value *A, *B;
1386 if (match(II.getArgOperand(0),
1387 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1388 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1389 m_Specific(A), m_Specific(B))))
1390 return IC.replaceInstUsesWith(
1391 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1392
1393 return std::nullopt;
1394}
1395
1396static std::optional<Instruction *>
1397instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1398 Value *Mask = II.getOperand(0);
1399 Value *BasePtr = II.getOperand(1);
1400 Value *Index = II.getOperand(2);
1401 Type *Ty = II.getType();
1402 Value *PassThru = ConstantAggregateZero::get(Ty);
1403
1404 // Contiguous gather => masked load.
1405 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1406 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1407 Value *IndexBase;
1408 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1409 m_Value(IndexBase), m_SpecificInt(1)))) {
1410 IRBuilder<> Builder(II.getContext());
1411 Builder.SetInsertPoint(&II);
1412
1413 Align Alignment =
1414 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1415
1416 Type *VecPtrTy = PointerType::getUnqual(Ty);
1417 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1418 BasePtr, IndexBase);
1419 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1420 CallInst *MaskedLoad =
1421 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1422 MaskedLoad->takeName(&II);
1423 return IC.replaceInstUsesWith(II, MaskedLoad);
1424 }
1425
1426 return std::nullopt;
1427}
1428
1429static std::optional<Instruction *>
1430instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1431 Value *Val = II.getOperand(0);
1432 Value *Mask = II.getOperand(1);
1433 Value *BasePtr = II.getOperand(2);
1434 Value *Index = II.getOperand(3);
1435 Type *Ty = Val->getType();
1436
1437 // Contiguous scatter => masked store.
1438 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1439 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1440 Value *IndexBase;
1441 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1442 m_Value(IndexBase), m_SpecificInt(1)))) {
1443 IRBuilder<> Builder(II.getContext());
1444 Builder.SetInsertPoint(&II);
1445
1446 Align Alignment =
1447 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1448
1449 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1450 BasePtr, IndexBase);
1451 Type *VecPtrTy = PointerType::getUnqual(Ty);
1452 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1453
1454 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1455
1456 return IC.eraseInstFromFunction(II);
1457 }
1458
1459 return std::nullopt;
1460}
1461
1462static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1463 IntrinsicInst &II) {
1464 IRBuilder<> Builder(II.getContext());
1465 Builder.SetInsertPoint(&II);
1466 Type *Int32Ty = Builder.getInt32Ty();
1467 Value *Pred = II.getOperand(0);
1468 Value *Vec = II.getOperand(1);
1469 Value *DivVec = II.getOperand(2);
1470
1471 Value *SplatValue = getSplatValue(DivVec);
1472 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1473 if (!SplatConstantInt)
1474 return std::nullopt;
1475 APInt Divisor = SplatConstantInt->getValue();
1476
1477 if (Divisor.isPowerOf2()) {
1478 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1479 auto ASRD = Builder.CreateIntrinsic(
1480 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1481 return IC.replaceInstUsesWith(II, ASRD);
1482 }
1483 if (Divisor.isNegatedPowerOf2()) {
1484 Divisor.negate();
1485 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1486 auto ASRD = Builder.CreateIntrinsic(
1487 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1488 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1489 {ASRD->getType()}, {ASRD, Pred, ASRD});
1490 return IC.replaceInstUsesWith(II, NEG);
1491 }
1492
1493 return std::nullopt;
1494}
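// Illustrative note (not in the original source): a sketch of the power-of-two
// folds above, assuming a splat divisor of 8 (log2 = 3):
//   sdiv(pg, x, splat(8))  ==>  asrd(pg, x, 3)
//   sdiv(pg, x, splat(-8)) ==>  neg(asrd, pg, asrd)  where  asrd = asrd(pg, x, 3)
// i.e. an arithmetic shift-right-for-divide, plus a predicated negate when the
// divisor is a negated power of two.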
1495
1496bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1497 size_t VecSize = Vec.size();
1498 if (VecSize == 1)
1499 return true;
1500 if (!isPowerOf2_64(VecSize))
1501 return false;
1502 size_t HalfVecSize = VecSize / 2;
1503
1504 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1505 RHS != Vec.end(); LHS++, RHS++) {
1506 if (*LHS != nullptr && *RHS != nullptr) {
1507 if (*LHS == *RHS)
1508 continue;
1509 else
1510 return false;
1511 }
1512 if (!AllowPoison)
1513 return false;
1514 if (*LHS == nullptr && *RHS != nullptr)
1515 *LHS = *RHS;
1516 }
1517
1518 Vec.resize(HalfVecSize);
1519 SimplifyValuePattern(Vec, AllowPoison);
1520 return true;
1521}
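// Illustrative note (not in the original source): a worked example of the
// halving above. For Vec = {A, B, A, B}, the halves {A, B} and {A, B} match
// element-wise, so Vec is resized to {A, B}; the recursive call then compares
// {A} against {B}, fails, and leaves {A, B} as the final pattern. The boolean
// result of the recursion is discarded because the outer call already knows
// the first halving succeeded; the recursion only shrinks Vec further when it
// can. With AllowPoison, a poison (nullptr) lane in the first half is filled
// from the matching lane of the second half so the kept half is fully defined.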
1522
1523// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1524// to dupqlane(f64(C)) where C is A concatenated with B
1525static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1526 IntrinsicInst &II) {
1527 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1528 if (!match(II.getOperand(0),
1529 m_Intrinsic<Intrinsic::vector_insert>(
1530 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1531 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1532 return std::nullopt;
1533 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1534
1535 // Insert the scalars into a container ordered by InsertElement index
1536 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1537 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1538 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1539 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1540 CurrentInsertElt = InsertElt->getOperand(0);
1541 }
1542
1543 bool AllowPoison =
1544 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1545 if (!SimplifyValuePattern(Elts, AllowPoison))
1546 return std::nullopt;
1547
1548 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1549 IRBuilder<> Builder(II.getContext());
1550 Builder.SetInsertPoint(&II);
1551 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1552 for (size_t I = 0; I < Elts.size(); I++) {
1553 if (Elts[I] == nullptr)
1554 continue;
1555 InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
1556 Builder.getInt64(I));
1557 }
1558 if (InsertEltChain == nullptr)
1559 return std::nullopt;
1560
1561 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1562 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1563 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1564 // be narrowed back to the original type.
1565 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1566 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1567 IIScalableTy->getMinNumElements() /
1568 PatternWidth;
1569
1570 IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
1571 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1572 auto *WideShuffleMaskTy =
1573 ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
1574
1575 auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
1576 auto InsertSubvector = Builder.CreateInsertVector(
1577 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1578 auto WideBitcast =
1579 Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1580 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1581 auto WideShuffle = Builder.CreateShuffleVector(
1582 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1583 auto NarrowBitcast =
1584 Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1585
1586 return IC.replaceInstUsesWith(II, NarrowBitcast);
1587}
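// Illustrative note (not in the original source): a worked example of the
// width arithmetic above, assuming II has type nxv8f16 and Elts simplified to
// (a, b). PatternWidth = 16 * 2 = 32 bits and PatternElementCount =
// 16 * 8 / 32 = 4, so the two f16 values are packed into an i32, splatted
// across an nxv4i32 via a zero shuffle mask, and bitcast back to nxv8f16.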
1588
1589static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1590 IntrinsicInst &II) {
1591 Value *A = II.getArgOperand(0);
1592 Value *B = II.getArgOperand(1);
1593 if (A == B)
1594 return IC.replaceInstUsesWith(II, A);
1595
1596 return std::nullopt;
1597}
1598
1599static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1600 IntrinsicInst &II) {
1601 IRBuilder<> Builder(&II);
1602 Value *Pred = II.getOperand(0);
1603 Value *Vec = II.getOperand(1);
1604 Value *Shift = II.getOperand(2);
1605
1606 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1607 Value *AbsPred, *MergedValue;
1608 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1609 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1610 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1611 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1612
1613 return std::nullopt;
1614
1615 // The transform is valid if any of the following are true:
1616 // * The ABS merge value is undef or non-negative
1617 // * The ABS predicate is all active
1618 // * The ABS and SRSHL predicates are the same
1619 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1620 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1621 return std::nullopt;
1622
1623 // Only valid when the shift amount is non-negative, otherwise the rounding
1624 // behaviour of SRSHL cannot be ignored.
1625 if (!match(Shift, m_NonNegative()))
1626 return std::nullopt;
1627
1628 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1629 {Pred, Vec, Shift});
1630
1631 return IC.replaceInstUsesWith(II, LSL);
1632}
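// Illustrative note (not in the original source): a sketch of the fold above.
// With the operand order matched as (merge, pred, op),
//   srshl(pg, abs(undef, pg, x), splat(2))  ==>  lsl(pg, abs(undef, pg, x), splat(2))
// is valid because the ABS result is non-negative and the splatted shift
// amount is non-negative, so SRSHL's rounding behaviour never comes into play.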
1633
1634std::optional<Instruction *>
1635AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1636 IntrinsicInst &II) const {
1637 Intrinsic::ID IID = II.getIntrinsicID();
1638 switch (IID) {
1639 default:
1640 break;
1641 case Intrinsic::aarch64_neon_fmaxnm:
1642 case Intrinsic::aarch64_neon_fminnm:
1643 return instCombineMaxMinNM(IC, II);
1644 case Intrinsic::aarch64_sve_convert_from_svbool:
1645 return instCombineConvertFromSVBool(IC, II);
1646 case Intrinsic::aarch64_sve_dup:
1647 return instCombineSVEDup(IC, II);
1648 case Intrinsic::aarch64_sve_dup_x:
1649 return instCombineSVEDupX(IC, II);
1650 case Intrinsic::aarch64_sve_cmpne:
1651 case Intrinsic::aarch64_sve_cmpne_wide:
1652 return instCombineSVECmpNE(IC, II);
1653 case Intrinsic::aarch64_sve_rdffr:
1654 return instCombineRDFFR(IC, II);
1655 case Intrinsic::aarch64_sve_lasta:
1656 case Intrinsic::aarch64_sve_lastb:
1657 return instCombineSVELast(IC, II);
1658 case Intrinsic::aarch64_sve_clasta_n:
1659 case Intrinsic::aarch64_sve_clastb_n:
1660 return instCombineSVECondLast(IC, II);
1661 case Intrinsic::aarch64_sve_cntd:
1662 return instCombineSVECntElts(IC, II, 2);
1663 case Intrinsic::aarch64_sve_cntw:
1664 return instCombineSVECntElts(IC, II, 4);
1665 case Intrinsic::aarch64_sve_cnth:
1666 return instCombineSVECntElts(IC, II, 8);
1667 case Intrinsic::aarch64_sve_cntb:
1668 return instCombineSVECntElts(IC, II, 16);
1669 case Intrinsic::aarch64_sve_ptest_any:
1670 case Intrinsic::aarch64_sve_ptest_first:
1671 case Intrinsic::aarch64_sve_ptest_last:
1672 return instCombineSVEPTest(IC, II);
1673 case Intrinsic::aarch64_sve_mul:
1674 case Intrinsic::aarch64_sve_fmul:
1675 return instCombineSVEVectorMul(IC, II);
1676 case Intrinsic::aarch64_sve_fadd:
1677 case Intrinsic::aarch64_sve_add:
1678 return instCombineSVEVectorAdd(IC, II);
1679 case Intrinsic::aarch64_sve_fadd_u:
1680 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1681 Intrinsic::aarch64_sve_fmla_u>(
1682 IC, II, true);
1683 case Intrinsic::aarch64_sve_fsub:
1684 case Intrinsic::aarch64_sve_sub:
1685 return instCombineSVEVectorSub(IC, II);
1686 case Intrinsic::aarch64_sve_fsub_u:
1687 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1688 Intrinsic::aarch64_sve_fmls_u>(
1689 IC, II, true);
1690 case Intrinsic::aarch64_sve_tbl:
1691 return instCombineSVETBL(IC, II);
1692 case Intrinsic::aarch64_sve_uunpkhi:
1693 case Intrinsic::aarch64_sve_uunpklo:
1694 case Intrinsic::aarch64_sve_sunpkhi:
1695 case Intrinsic::aarch64_sve_sunpklo:
1696 return instCombineSVEUnpack(IC, II);
1697 case Intrinsic::aarch64_sve_zip1:
1698 case Intrinsic::aarch64_sve_zip2:
1699 return instCombineSVEZip(IC, II);
1700 case Intrinsic::aarch64_sve_ld1_gather_index:
1701 return instCombineLD1GatherIndex(IC, II);
1702 case Intrinsic::aarch64_sve_st1_scatter_index:
1703 return instCombineST1ScatterIndex(IC, II);
1704 case Intrinsic::aarch64_sve_ld1:
1705 return instCombineSVELD1(IC, II, DL);
1706 case Intrinsic::aarch64_sve_st1:
1707 return instCombineSVEST1(IC, II, DL);
1708 case Intrinsic::aarch64_sve_sdiv:
1709 return instCombineSVESDIV(IC, II);
1710 case Intrinsic::aarch64_sve_sel:
1711 return instCombineSVESel(IC, II);
1712 case Intrinsic::aarch64_sve_srshl:
1713 return instCombineSVESrshl(IC, II);
1714 case Intrinsic::aarch64_sve_dupq_lane:
1715 return instCombineSVEDupqLane(IC, II);
1716 }
1717
1718 return std::nullopt;
1719}
1720
1721std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1722 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1723 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1724 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1725 SimplifyAndSetOp) const {
1726 switch (II.getIntrinsicID()) {
1727 default:
1728 break;
1729 case Intrinsic::aarch64_neon_fcvtxn:
1730 case Intrinsic::aarch64_neon_rshrn:
1731 case Intrinsic::aarch64_neon_sqrshrn:
1732 case Intrinsic::aarch64_neon_sqrshrun:
1733 case Intrinsic::aarch64_neon_sqshrn:
1734 case Intrinsic::aarch64_neon_sqshrun:
1735 case Intrinsic::aarch64_neon_sqxtn:
1736 case Intrinsic::aarch64_neon_sqxtun:
1737 case Intrinsic::aarch64_neon_uqrshrn:
1738 case Intrinsic::aarch64_neon_uqshrn:
1739 case Intrinsic::aarch64_neon_uqxtn:
1740 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1741 break;
1742 }
1743
1744 return std::nullopt;
1745}
1746
1747TypeSize
1748AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1749 switch (K) {
1750 case TargetTransformInfo::RGK_Scalar:
1751 return TypeSize::getFixed(64);
1752 case TargetTransformInfo::RGK_FixedWidthVector:
1753 if (!ST->isStreamingSVEModeDisabled() &&
1754 !EnableFixedwidthAutovecInStreamingMode)
1755 return TypeSize::getFixed(0);
1756
1757 if (ST->hasSVE())
1758 return TypeSize::getFixed(
1759 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1760
1761 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1762 case TargetTransformInfo::RGK_ScalableVector:
1763 if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode)
1764 return TypeSize::getScalable(0);
1765
1766 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1767 }
1768 llvm_unreachable("Unsupported register kind");
1769}
1770
1771bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1772 ArrayRef<const Value *> Args) {
1773
1774 // A helper that returns a vector type from the given type. The scalar type
1775 // comes from ArgTy and the element count from DstTy.
1776 auto toVectorTy = [&](Type *ArgTy) {
1777 return VectorType::get(ArgTy->getScalarType(),
1778 cast<VectorType>(DstTy)->getElementCount());
1779 };
1780
1781 // Exit early if DstTy is not a vector type whose elements are at least
1782 // 16 bits wide. SVE doesn't generally have the same set of instructions to
1783 // perform an extend with the add/sub/mul. There are SMULLB style
1784 // instructions, but they operate on top/bottom, requiring some sort of lane
1785 // interleaving to be used with zext/sext.
1786 if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1787 return false;
1788
1789 // Determine if the operation has a widening variant. We consider both the
1790 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1791 // instructions.
1792 //
1793 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1794 // verify that their extending operands are eliminated during code
1795 // generation.
1796 switch (Opcode) {
1797 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1798 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1799 case Instruction::Mul: // SMULL(2), UMULL(2)
1800 break;
1801 default:
1802 return false;
1803 }
1804
1805 // To be a widening instruction (either the "wide" or "long" versions), the
1806 // second operand must be a sign- or zero extend.
1807 if (Args.size() != 2 ||
1808 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1809 return false;
1810 auto *Extend = cast<CastInst>(Args[1]);
1811 auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1812
1813 // A mul only has a mull version (unlike add, which also has addw). Both
1814 // operands need to be extends of the same type.
1815 if (Opcode == Instruction::Mul &&
1816 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1817 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1818 return false;
1819
1820 // Legalize the destination type and ensure it can be used in a widening
1821 // operation.
1822 auto DstTyL = getTypeLegalizationCost(DstTy);
1823 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1824 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1825 return false;
1826
1827 // Legalize the source type and ensure it can be used in a widening
1828 // operation.
1829 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1830 auto SrcTyL = getTypeLegalizationCost(SrcTy);
1831 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1832 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1833 return false;
1834
1835 // Get the total number of vector elements in the legalized types.
1836 InstructionCost NumDstEls =
1837 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1838 InstructionCost NumSrcEls =
1839 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1840
1841 // Return true if the legalized types have the same number of vector elements
1842 // and the destination element type size is twice that of the source type.
1843 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1844}
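// Illustrative note (not in the original source): an example of what this
// recognises, assuming NEON types. `add <8 x i16> %a, (zext <8 x i8> %b)`
// maps to the "wide" form (UADDW), while `add (zext <8 x i8> %a),
// (zext <8 x i8> %b)` maps to the "long" form (UADDL); in both cases the
// extend feeding the second operand can be treated as free by
// getCastInstrCost below.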
1845
1846InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1847 Type *Src,
1848 TTI::CastContextHint CCH,
1849 TTI::TargetCostKind CostKind,
1850 const Instruction *I) {
1851 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1852 assert(ISD && "Invalid opcode");
1853
1854 // If the cast is observable, and it is used by a widening instruction (e.g.,
1855 // uaddl, saddw, etc.), it may be free.
1856 if (I && I->hasOneUser()) {
1857 auto *SingleUser = cast<Instruction>(*I->user_begin());
1858 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1859 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1860 // If the cast is the second operand, it is free. We will generate either
1861 // a "wide" or "long" version of the widening instruction.
1862 if (I == SingleUser->getOperand(1))
1863 return 0;
1864 // If the cast is not the second operand, it will be free if it looks the
1865 // same as the second operand. In this case, we will generate a "long"
1866 // version of the widening instruction.
1867 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1868 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1869 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1870 return 0;
1871 }
1872 }
1873
1874 // TODO: Allow non-throughput costs that aren't binary.
1875 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1876 if (CostKind != TTI::TCK_RecipThroughput)
1877 return Cost == 0 ? 0 : 1;
1878 return Cost;
1879 };
1880
1881 EVT SrcTy = TLI->getValueType(DL, Src);
1882 EVT DstTy = TLI->getValueType(DL, Dst);
1883
1884 if (!SrcTy.isSimple() || !DstTy.isSimple())
1885 return AdjustCost(
1886 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1887
1888 static const TypeConversionCostTblEntry
1889 ConversionTbl[] = {
1890 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
1891 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
1892 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
1893 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
1894 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
1895 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
1896 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
1897 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
1898 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
1899 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
1900 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
1901 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
1902 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
1903 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
1904 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
1905 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
1906 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
1907 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
1908 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
1909 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
1910
1911 // Truncations on nxvmiN
1912 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1913 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1914 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1915 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1916 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1917 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1918 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1919 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1920 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1921 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1922 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1923 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1924 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1925 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1926 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1927 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1928
1929 // The number of shll instructions for the extension.
1930 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1931 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1932 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1933 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1934 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1935 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1936 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1937 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1938 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1939 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1940 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1941 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1942 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1943 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1944 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1945 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1946
1947 // LowerVectorINT_TO_FP:
1948 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1949 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1950 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1951 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1952 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1953 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1954
1955 // Complex: to v2f32
1956 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1957 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1958 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1959 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1960 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1961 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1962
1963 // Complex: to v4f32
1964 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1965 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1966 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1967 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1968
1969 // Complex: to v8f32
1970 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1971 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1972 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1973 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1974
1975 // Complex: to v16f32
1976 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1977 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1978
1979 // Complex: to v2f64
1980 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1981 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1982 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1983 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1984 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1985 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1986
1987 // Complex: to v4f64
1988 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
1989 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
1990
1991 // LowerVectorFP_TO_INT
1992 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1993 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1994 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1995 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1996 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1997 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1998
1999 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2000 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2001 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2002 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2003 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2004 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2005 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2006
2007 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2008 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2009 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2010 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2011 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2012
2013 // Complex, from nxv2f32.
2014 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2015 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2016 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2017 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2018 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2019 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2020 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2021 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2022
2023 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2024 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2025 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2026 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2027 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2028 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2029 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2030
2031 // Complex, from nxv2f64.
2032 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2033 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2034 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2035 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2036 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2037 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2038 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2039 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2040
2041 // Complex, from nxv4f32.
2042 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2043 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2044 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2045 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2046 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2047 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2048 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2049 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2050
2051 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2052 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2053 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2054 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2055 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2056
2057 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2058 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2059 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2060 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2061 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2062 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2063 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2064
2065 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2066 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2067 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2068 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2069 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2070
2071 // Complex, from nxv8f16.
2072 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2073 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2074 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2075 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2076 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2077 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2078 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2079 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2080
2081 // Complex, from nxv4f16.
2082 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2083 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2084 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2085 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2086 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2087 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2088 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2089 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2090
2091 // Complex, from nxv2f16.
2092 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2093 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2094 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2095 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2096 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2097 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2098 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2099 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2100
2101 // Truncate from nxvmf32 to nxvmf16.
2102 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2103 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2104 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2105
2106 // Truncate from nxvmf64 to nxvmf16.
2107 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2108 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2109 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2110
2111 // Truncate from nxvmf64 to nxvmf32.
2112 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2113 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2114 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2115
2116 // Extend from nxvmf16 to nxvmf32.
2117 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2118 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2119 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2120
2121 // Extend from nxvmf16 to nxvmf64.
2122 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2123 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2124 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2125
2126 // Extend from nxvmf32 to nxvmf64.
2127 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2128 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2129 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2130
2131 // Bitcasts from float to integer
2132 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2133 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2134 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2135
2136 // Bitcasts from integer to float
2137 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2138 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2139 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2140
2141 // Add cost for extending to illegal (too-wide) scalable vectors.
2142 // zero/sign extend are implemented by multiple unpack operations,
2143 // where each operation has a cost of 1.
2144 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2145 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2146 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2147 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2148 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2149 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2150
2151 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2152 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2153 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2154 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2155 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2156 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2157 };
2158
2159 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2160 DstTy.getSimpleVT(),
2161 SrcTy.getSimpleVT()))
2162 return AdjustCost(Entry->Cost);
2163
2164 static const TypeConversionCostTblEntry FP16Tbl[] = {
2165 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2166 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2167 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2168 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2169 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2170 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2171 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2172 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2173 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2174 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2175 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2176 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2177 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2178 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2179 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2180 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2181 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2182 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2183 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2184 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2185 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2186 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2187 };
2188
2189 if (ST->hasFullFP16())
2190 if (const auto *Entry = ConvertCostTableLookup(
2191 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2192 return AdjustCost(Entry->Cost);
2193
2194 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2195 // but we also want to include the TTI::CastContextHint::Masked case.
2196 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2197 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2198 TLI->isTypeLegal(DstTy))
2199 CCH = TTI::CastContextHint::Normal;
2200
2201 return AdjustCost(
2202 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2203}
2204
2205InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2206 Type *Dst,
2207 VectorType *VecTy,
2208 unsigned Index) {
2209
2210 // Make sure we were given a valid extend opcode.
2211 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2212 "Invalid opcode");
2213
2214 // We are extending an element we extract from a vector, so the source type
2215 // of the extend is the element type of the vector.
2216 auto *Src = VecTy->getElementType();
2217
2218 // Sign- and zero-extends are for integer types only.
2219 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2220
2221 // Get the cost for the extract. We compute the cost (if any) for the extend
2222 // below.
2223 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2224 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2225 CostKind, Index, nullptr, nullptr);
2226
2227 // Legalize the types.
2228 auto VecLT = getTypeLegalizationCost(VecTy);
2229 auto DstVT = TLI->getValueType(DL, Dst);
2230 auto SrcVT = TLI->getValueType(DL, Src);
2231
2232 // If the resulting type is still a vector and the destination type is legal,
2233 // we may get the extension for free. If not, get the default cost for the
2234 // extend.
2235 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2236 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2237 CostKind);
2238
2239 // The destination type should be larger than the element type. If not, get
2240 // the default cost for the extend.
2241 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2242 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2243 CostKind);
2244
2245 switch (Opcode) {
2246 default:
2247 llvm_unreachable("Opcode should be either SExt or ZExt");
2248
2249 // For sign-extends, we only need a smov, which performs the extension
2250 // automatically.
2251 case Instruction::SExt:
2252 return Cost;
2253
2254 // For zero-extends, the extend is performed automatically by a umov unless
2255 // the destination type is i64 and the element type is i8 or i16.
2256 case Instruction::ZExt:
2257 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2258 return Cost;
2259 }
2260
2261 // If we are unable to perform the extend for free, get the default cost.
2262 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2263 CostKind);
2264}
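// Illustrative note (not in the original source): worked examples of the rules
// above. `zext (extractelement <4 x i32> %v, ...) to i64` is costed as just
// the extract, because the 32-bit source case is treated as free, whereas
// `zext (extractelement <16 x i8> %v, ...) to i64` additionally pays the
// default cast cost since the destination is i64 and the element is narrower
// than 32 bits.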
2265
2266InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2267 TTI::TargetCostKind CostKind,
2268 const Instruction *I) {
2269 if (CostKind != TTI::TCK_RecipThroughput)
2270 return Opcode == Instruction::PHI ? 0 : 1;
2271 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2272 // Branches are assumed to be predicted.
2273 return 0;
2274}
2275
2276InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2277 Type *Val,
2278 unsigned Index,
2279 bool HasRealUse) {
2280 assert(Val->isVectorTy() && "This must be a vector type");
2281
2282 if (Index != -1U) {
2283 // Legalize the type.
2284 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2285
2286 // This type is legalized to a scalar type.
2287 if (!LT.second.isVector())
2288 return 0;
2289
2290 // The type may be split. For fixed-width vectors we can normalize the
2291 // index to the new type.
2292 if (LT.second.isFixedLengthVector()) {
2293 unsigned Width = LT.second.getVectorNumElements();
2294 Index = Index % Width;
2295 }
2296
2297 // The element at index zero is already inside the vector.
2298 // - For a physical (HasRealUse==true) insert-element or extract-element
2299 // instruction that extracts integers, an explicit FPR -> GPR move is
2300 // needed. So it has non-zero cost.
2301 // - For the rest of cases (virtual instruction or element type is float),
2302 // consider the instruction free.
2303 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2304 return 0;
2305
2306 // This is recognising an LD1 (load one single-element structure to one lane
2307 // of one register) instruction. I.e., if this is an `insertelement`
2308 // instruction and its second operand is a load, then we will generate an
2309 // LD1, which is an expensive instruction.
2310 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2311 return ST->getVectorInsertExtractBaseCost() + 1;
2312
2313 // FIXME:
2314 // If the extract-element and insert-element instructions could be
2315 // simplified away (e.g., could be combined into users by looking at use-def
2316 // context), they have no cost. This is not done in the first place for
2317 // compile-time considerations.
2318 }
2319
2320 // All other insert/extracts cost this much.
2321 return ST->getVectorInsertExtractBaseCost();
2322}
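// Illustrative note (not in the original source): an example of the index
// normalization above. A <4 x i64> on NEON legalizes to two v2i64 halves, so
// Width == 2 and an access at Index 3 is costed like an access at index 1 of
// a half; a (normalized) index of 0 is free when the element type is floating
// point or the instruction has no real use.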
2323
2324InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2325 TTI::TargetCostKind CostKind,
2326 unsigned Index, Value *Op0,
2327 Value *Op1) {
2328 bool HasRealUse =
2329 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2330 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2331}
2332
2333InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2334 Type *Val,
2335 TTI::TargetCostKind CostKind,
2336 unsigned Index) {
2337 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2338}
2339
2340InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2341 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2342 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2343 ArrayRef<const Value *> Args,
2344 const Instruction *CxtI) {
2345
2346 // TODO: Handle more cost kinds.
2347 if (CostKind != TTI::TCK_RecipThroughput)
2348 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2349 Op2Info, Args, CxtI);
2350
2351 // Legalize the type.
2352 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2353 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2354
2355 switch (ISD) {
2356 default:
2357 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2358 Op2Info);
2359 case ISD::SDIV:
2360 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2361 // On AArch64, scalar signed division by a power-of-two constant is
2362 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2363 // The OperandValue properties may not be the same as those of the
2364 // previous operation; conservatively assume OP_None.
2365 InstructionCost Cost = getArithmeticInstrCost(
2366 Instruction::Add, Ty, CostKind,
2367 Op1Info.getNoProps(), Op2Info.getNoProps());
2368 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2369 Op1Info.getNoProps(), Op2Info.getNoProps());
2370 Cost += getArithmeticInstrCost(
2371 Instruction::Select, Ty, CostKind,
2372 Op1Info.getNoProps(), Op2Info.getNoProps());
2373 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2374 Op1Info.getNoProps(), Op2Info.getNoProps());
2375 return Cost;
2376 }
2377 [[fallthrough]];
2378 case ISD::UDIV: {
2379 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2380 auto VT = TLI->getValueType(DL, Ty);
2381 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2382 // Vector signed division by a constant is expanded to the
2383 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2384 // to MULHS + SUB + SRL + ADD + SRL.
2385 InstructionCost MulCost = getArithmeticInstrCost(
2386 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2387 InstructionCost AddCost = getArithmeticInstrCost(
2388 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2389 InstructionCost ShrCost = getArithmeticInstrCost(
2390 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2391 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2392 }
2393 }
2394
2395 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2396 Opcode, Ty, CostKind, Op1Info, Op2Info);
2397 if (Ty->isVectorTy()) {
2398 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2399 // If SDIV/UDIV operations are lowered using SVE, then the cost can be
2400 // lower.
2401 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2402 ->getPrimitiveSizeInBits()
2403 .getFixedValue() < 128) {
2404 EVT VT = TLI->getValueType(DL, Ty);
2405 static const CostTblEntry DivTbl[]{
2406 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2407 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2408 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2409 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2410 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2411 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2412
2413 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2414 if (nullptr != Entry)
2415 return Entry->Cost;
2416 }
2417 // For 8/16-bit elements, the cost is higher because the type
2418 // requires promotion and possibly splitting:
2419 if (LT.second.getScalarType() == MVT::i8)
2420 Cost *= 8;
2421 else if (LT.second.getScalarType() == MVT::i16)
2422 Cost *= 4;
2423 return Cost;
2424 } else {
2425 // If one of the operands is a uniform constant then the cost for each
2426 // element is the cost of insertion, extraction and division:
2427 // insertion cost = 2, extraction cost = 2, and division = the cost of
2428 // the operation on the scalar type.
2429 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2430 (Op2Info.isConstant() && Op2Info.isUniform())) {
2431 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2432 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2433 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2434 return (4 + DivCost) * VTy->getNumElements();
2435 }
2436 }
2437 // On AArch64, without SVE, vector divisions are expanded
2438 // into scalar divisions of the corresponding element pairs.
2439 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2440 CostKind, Op1Info, Op2Info);
2441 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2442 Op1Info, Op2Info);
2443 }
2444
2445 // TODO: if one of the arguments is scalar, then it's not necessary to
2446 // double the cost of handling the vector elements.
2447 Cost += Cost;
2448 }
2449 return Cost;
2450 }
2451 case ISD::MUL:
2452 // When SVE is available, then we can lower the v2i64 operation using
2453 // the SVE mul instruction, which has a lower cost.
2454 if (LT.second == MVT::v2i64 && ST->hasSVE())
2455 return LT.first;
2456
2457 // When SVE is not available, there is no MUL.2d instruction,
2458 // which means mul <2 x i64> is expensive as elements are extracted
2459 // from the vectors and the muls scalarized.
2460 // As getScalarizationOverhead is a bit too pessimistic, we
2461 // estimate the cost for a i64 vector directly here, which is:
2462 // - four 2-cost i64 extracts,
2463 // - two 2-cost i64 inserts, and
2464 // - two 1-cost muls.
2465 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2466 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2467 // need to scalarize so the cost can be cheaper (smull or umull).
2469 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2470 return LT.first;
2471 return LT.first * 14;
2472 case ISD::ADD:
2473 case ISD::XOR:
2474 case ISD::OR:
2475 case ISD::AND:
2476 case ISD::SRL:
2477 case ISD::SRA:
2478 case ISD::SHL:
2479 // These nodes are marked as 'custom' for combining purposes only.
2480 // We know that they are legal. See LowerAdd in ISelLowering.
2481 return LT.first;
2482
2483 case ISD::FNEG:
2484 case ISD::FADD:
2485 case ISD::FSUB:
2486 // Increase the cost for half and bfloat types if not architecturally
2487 // supported.
2488 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
2489 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
2490 return 2 * LT.first;
2491 if (!Ty->getScalarType()->isFP128Ty())
2492 return LT.first;
2493 LLVM_FALLTHROUGH;
2494 case ISD::FMUL:
2495 case ISD::FDIV:
2496 // These nodes are marked as 'custom' just to lower them to SVE.
2497 // We know said lowering will incur no additional cost.
2498 if (!Ty->getScalarType()->isFP128Ty())
2499 return 2 * LT.first;
2500
2501 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2502 Op2Info);
2503 }
2504}
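// Illustrative note (not in the original source): making the v2i64 MUL
// estimate above explicit: four extracts at cost 2, two inserts at cost 2 and
// two scalar multiplies at cost 1 give 8 + 4 + 2 = 14 per legalized vector,
// hence LT.first * 14.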
2505
2506InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2507 ScalarEvolution *SE,
2508 const SCEV *Ptr) {
2509 // Address computations in vectorized code with non-consecutive addresses will
2510 // likely result in more instructions compared to scalar code where the
2511 // computation can more often be merged into the index mode. The resulting
2512 // extra micro-ops can significantly decrease throughput.
2513 unsigned NumVectorInstToHideOverhead = 10;
2514 int MaxMergeDistance = 64;
2515
2516 if (Ty->isVectorTy() && SE &&
2517 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2518 return NumVectorInstToHideOverhead;
2519
2520 // In many cases the address computation is not merged into the instruction
2521 // addressing mode.
2522 return 1;
2523}
2524
2525InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2526 Type *CondTy,
2527 CmpInst::Predicate VecPred,
2528 TTI::TargetCostKind CostKind,
2529 const Instruction *I) {
2530 // TODO: Handle other cost kinds.
2531 if (CostKind != TTI::TCK_RecipThroughput)
2532 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2533 I);
2534
2535 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2536 // We don't lower some vector selects well when they are wider than the
2537 // register width.
2538 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2539 // We would need this many instructions to hide the scalarization happening.
2540 const int AmortizationCost = 20;
2541
2542 // If VecPred is not set, check if we can get a predicate from the context
2543 // instruction, if its type matches the requested ValTy.
2544 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2545 CmpInst::Predicate CurrentPred;
2546 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2547 m_Value())))
2548 VecPred = CurrentPred;
2549 }
2550 // Check if we have a compare/select chain that can be lowered using
2551 // a (F)CMxx & BFI pair.
2552 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2553 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2554 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2555 VecPred == CmpInst::FCMP_UNE) {
2556 static const auto ValidMinMaxTys = {
2557 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2558 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
2559 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2560
2561 auto LT = getTypeLegalizationCost(ValTy);
2562 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2563 (ST->hasFullFP16() &&
2564 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2565 return LT.first;
2566 }
2567
2568 static const TypeConversionCostTblEntry
2569 VectorSelectTbl[] = {
2570 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
2571 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2572 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
2573 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2574 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2575 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2576 };
2577
2578 EVT SelCondTy = TLI->getValueType(DL, CondTy);
2579 EVT SelValTy = TLI->getValueType(DL, ValTy);
2580 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2581 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2582 SelCondTy.getSimpleVT(),
2583 SelValTy.getSimpleVT()))
2584 return Entry->Cost;
2585 }
2586 }
2587 // The base case handles scalable vectors fine for now, since it treats the
2588 // cost as 1 * legalization cost.
2589 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2590}
2591
2592AArch64TTIImpl::TTI::MemCmpExpansionOptions
2593AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2594 TTI::MemCmpExpansionOptions Options;
2595 if (ST->requiresStrictAlign()) {
2596 // TODO: Add cost modeling for strict align. Misaligned loads expand to
2597 // a bunch of instructions when strict align is enabled.
2598 return Options;
2599 }
2600 Options.AllowOverlappingLoads = true;
2601 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2602 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2603 // TODO: Though vector loads usually perform well on AArch64, in some targets
2604 // they may wake up the FP unit, which raises the power consumption. Perhaps
2605 // they could be used with no holds barred (-O3).
2606 Options.LoadSizes = {8, 4, 2, 1};
2607 return Options;
2608}
2609
2610bool AArch64TTIImpl::prefersVectorizedAddressing() const {
2611 return ST->hasSVE();
2612}
2613
2614InstructionCost
2615AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2616 Align Alignment, unsigned AddressSpace,
2617 TTI::TargetCostKind CostKind) {
2618 if (useNeonVector(Src))
2619 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2620 CostKind);
2621 auto LT = getTypeLegalizationCost(Src);
2622 if (!LT.first.isValid())
2623 return InstructionCost::getInvalid();
2624
2625 // The code-generator is currently not able to handle scalable vectors
2626 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2627 // it. This change will be removed when code-generation for these types is
2628 // sufficiently reliable.
2629 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2630 return InstructionCost::getInvalid();
2631
2632 return LT.first;
2633}
2634
2635static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2636 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2637}
2638
2639InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2640 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2641 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2642 if (useNeonVector(DataTy))
2643 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2644 Alignment, CostKind, I);
2645 auto *VT = cast<VectorType>(DataTy);
2646 auto LT = getTypeLegalizationCost(DataTy);
2647 if (!LT.first.isValid())
2648 return InstructionCost::getInvalid();
2649
2650 // The code-generator is currently not able to handle scalable vectors
2651 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2652 // it. This change will be removed when code-generation for these types is
2653 // sufficiently reliable.
2654 if (cast<VectorType>(DataTy)->getElementCount() ==
2655 ElementCount::getScalable(1))
2656 return InstructionCost::getInvalid();
2657
2658 ElementCount LegalVF = LT.second.getVectorElementCount();
2659 InstructionCost MemOpCost =
2660 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2661 {TTI::OK_AnyValue, TTI::OP_None}, I);
2662 // Add on an overhead cost for using gathers/scatters.
2663 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2664 // point we may want a per-CPU overhead.
2665 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2666 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2667}
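// Illustrative note (not in the original source): the resulting gather/scatter
// estimate is roughly
//   LT.first * (element memory-op cost * SVEGatherOverhead or SVEScatterOverhead)
//            * getMaxNumElements(LegalVF),
// i.e. one overhead-scaled scalar access per (maximum) vector lane.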
2668
2669bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2670 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2671}
2672
2673InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2674 MaybeAlign Alignment,
2675 unsigned AddressSpace,
2676 TTI::TargetCostKind CostKind,
2677 TTI::OperandValueInfo OpInfo,
2678 const Instruction *I) {
2679 EVT VT = TLI->getValueType(DL, Ty, true);
2680 // Type legalization can't handle structs
2681 if (VT == MVT::Other)
2682 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2683 CostKind);
2684
2685 auto LT = getTypeLegalizationCost(Ty);
2686 if (!LT.first.isValid())
2687 return InstructionCost::getInvalid();
2688
2689 // The code-generator is currently not able to handle scalable vectors
2690 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2691 // it. This change will be removed when code-generation for these types is
2692 // sufficiently reliable.
2693 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2694 if (VTy->getElementCount() == ElementCount::getScalable(1))
2695 return InstructionCost::getInvalid();
2696
2697 // TODO: consider latency as well for TCK_SizeAndLatency.
2698 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2699 return LT.first;
2700
2701 if (CostKind != TTI::TCK_RecipThroughput)
2702 return 1;
2703
2704 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2705 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2706    // Unaligned stores are extremely inefficient. We don't split all
2707    // unaligned 128-bit stores because of the negative impact that has been
2708    // shown in practice on inlined block copy code.
2709    // We make such stores expensive so that we will only vectorize if there
2710    // are 6 other instructions getting vectorized.
2711 const int AmortizationCost = 6;
2712
2713 return LT.first * 2 * AmortizationCost;
2714 }
2715
2716 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
2717 if (Ty->isPtrOrPtrVectorTy())
2718 return LT.first;
2719
2720 // Check truncating stores and extending loads.
2721 if (useNeonVector(Ty) &&
2722 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2723    // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2724 if (VT == MVT::v4i8)
2725 return 2;
2726 // Otherwise we need to scalarize.
2727 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2728 }
2729
2730 return LT.first;
2731}
2732
2733InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2734 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2735 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2736 bool UseMaskForCond, bool UseMaskForGaps) {
2737  assert(Factor >= 2 && "Invalid interleave factor");
2738 auto *VecVTy = cast<FixedVectorType>(VecTy);
2739
2740 if (!UseMaskForCond && !UseMaskForGaps &&
2741 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2742 unsigned NumElts = VecVTy->getNumElements();
2743 auto *SubVecTy =
2744 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2745
2746 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2747 // Accesses having vector types that are a multiple of 128 bits can be
2748 // matched to more than one ldN/stN instruction.
2749 bool UseScalable;
2750 if (NumElts % Factor == 0 &&
2751 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2752 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2753 }
2754
2755 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2756 Alignment, AddressSpace, CostKind,
2757 UseMaskForCond, UseMaskForGaps);
2758}
2759
2760InstructionCost
2761AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2762 InstructionCost Cost = 0;
2763 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2764 for (auto *I : Tys) {
2765 if (!I->isVectorTy())
2766 continue;
2767 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2768 128)
2769 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2770 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2771 }
2772 return Cost;
2773}
2774
2775unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
2776 return ST->getMaxInterleaveFactor();
2777}
2778
2779// For Falkor, we want to avoid having too many strided loads in a loop since
2780// that can exhaust the HW prefetcher resources. We adjust the unroller
2781// MaxCount preference below to attempt to ensure unrolling doesn't create too
2782// many strided loads.
2783static void
2784getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2785 TargetTransformInfo::UnrollingPreferences &UP) {
2786 enum { MaxStridedLoads = 7 };
2787 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2788 int StridedLoads = 0;
2789 // FIXME? We could make this more precise by looking at the CFG and
2790 // e.g. not counting loads in each side of an if-then-else diamond.
2791 for (const auto BB : L->blocks()) {
2792 for (auto &I : *BB) {
2793 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2794 if (!LMemI)
2795 continue;
2796
2797 Value *PtrValue = LMemI->getPointerOperand();
2798 if (L->isLoopInvariant(PtrValue))
2799 continue;
2800
2801 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2802 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2803 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2804 continue;
2805
2806 // FIXME? We could take pairing of unrolled load copies into account
2807 // by looking at the AddRec, but we would probably have to limit this
2808 // to loops with no stores or other memory optimization barriers.
2809 ++StridedLoads;
2810 // We've seen enough strided loads that seeing more won't make a
2811 // difference.
2812 if (StridedLoads > MaxStridedLoads / 2)
2813 return StridedLoads;
2814 }
2815 }
2816 return StridedLoads;
2817 };
2818
2819 int StridedLoads = countStridedLoads(L, SE);
2820  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
7. Assuming 'DebugFlag' is false
8. Loop condition is false. Exiting loop
2821                    << " strided loads\n");
2822 // Pick the largest power of 2 unroll count that won't result in too many
2823 // strided loads.
2824 if (StridedLoads) {
9. Assuming 'StridedLoads' is not equal to 0
10. Taking true branch
2825 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
11. Calling 'Log2_32'
13. Returning from 'Log2_32'
14. The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
2826    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2827                      << UP.MaxCount << '\n');
2828 }
2829}
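
The diagnostic path above turns on Log2_32(MaxStridedLoads / StridedLoads): the analyzer does not model the cap that countStridedLoads() places on its result (at most MaxStridedLoads / 2 + 1, i.e. 4), so it considers a path where the quotient is 0; Log2_32(0) returns -1, which becomes 4294967295 in the unsigned return type, and shifting an int by that amount is undefined. A minimal sketch of one way to make the invariant explicit at the use site (a hypothetical rewrite for illustration, not the upstream change) is:

  // Hypothetical guard (std::max from <algorithm>): clamp the quotient so that
  // Log2_32 never sees 0. With the names from the listing above, the shift
  // amount is then always in [0, 2].
  if (StridedLoads) {
    unsigned Quotient =
        std::max(1u, static_cast<unsigned>(MaxStridedLoads) / StridedLoads);
    UP.MaxCount = 1u << Log2_32(Quotient); // well-defined shift
  }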
2830
2831void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2832 TTI::UnrollingPreferences &UP,
2833 OptimizationRemarkEmitter *ORE) {
2834 // Enable partial unrolling and runtime unrolling.
2835 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2836
2837 UP.UpperBound = true;
2838
2839  // An inner loop is more likely to be hot, and the runtime check can be
2840  // hoisted out of it by the LICM pass, so the overhead is lower; try a
2841  // larger threshold to unroll more loops.
2842 if (L->getLoopDepth() > 1)
1. Assuming the condition is false
2. Taking false branch
2843 UP.PartialThreshold *= 2;
2844
2845 // Disable partial & runtime unrolling on -Os.
2846 UP.PartialOptSizeThreshold = 0;
2847
2848 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3. Assuming the condition is true
5. Taking true branch
2849 EnableFalkorHWPFUnrollFix)
4. Assuming the condition is true
2850 getFalkorUnrollingPreferences(L, SE, UP);
6. Calling 'getFalkorUnrollingPreferences'
2851
2852 // Scan the loop: don't unroll loops with calls as this could prevent
2853 // inlining. Don't unroll vector loops either, as they don't benefit much from
2854 // unrolling.
2855 for (auto *BB : L->getBlocks()) {
2856 for (auto &I : *BB) {
2857 // Don't unroll vectorised loop.
2858 if (I.getType()->isVectorTy())
2859 return;
2860
2861 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2862 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2863 if (!isLoweredToCall(F))
2864 continue;
2865 }
2866 return;
2867 }
2868 }
2869 }
2870
2871 // Enable runtime unrolling for in-order models
2872 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2873 // checking for that case, we can ensure that the default behaviour is
2874 // unchanged
2875 if (ST->getProcFamily() != AArch64Subtarget::Others &&
2876 !ST->getSchedModel().isOutOfOrder()) {
2877 UP.Runtime = true;
2878 UP.Partial = true;
2879 UP.UnrollRemainder = true;
2880 UP.DefaultUnrollRuntimeCount = 4;
2881
2882 UP.UnrollAndJam = true;
2883 UP.UnrollAndJamInnerLoopThreshold = 60;
2884 }
2885}
2886
2887void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2888 TTI::PeelingPreferences &PP) {
2889 BaseT::getPeelingPreferences(L, SE, PP);
2890}
2891
2892Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2893 Type *ExpectedType) {
2894 switch (Inst->getIntrinsicID()) {
2895 default:
2896 return nullptr;
2897 case Intrinsic::aarch64_neon_st2:
2898 case Intrinsic::aarch64_neon_st3:
2899 case Intrinsic::aarch64_neon_st4: {
2900 // Create a struct type
2901 StructType *ST = dyn_cast<StructType>(ExpectedType);
2902 if (!ST)
2903 return nullptr;
2904 unsigned NumElts = Inst->arg_size() - 1;
2905 if (ST->getNumElements() != NumElts)
2906 return nullptr;
2907 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2908 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2909 return nullptr;
2910 }
2911 Value *Res = PoisonValue::get(ExpectedType);
2912 IRBuilder<> Builder(Inst);
2913 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2914 Value *L = Inst->getArgOperand(i);
2915 Res = Builder.CreateInsertValue(Res, L, i);
2916 }
2917 return Res;
2918 }
2919 case Intrinsic::aarch64_neon_ld2:
2920 case Intrinsic::aarch64_neon_ld3:
2921 case Intrinsic::aarch64_neon_ld4:
2922 if (Inst->getType() == ExpectedType)
2923 return Inst;
2924 return nullptr;
2925 }
2926}
2927
2928bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2929 MemIntrinsicInfo &Info) {
2930 switch (Inst->getIntrinsicID()) {
2931 default:
2932 break;
2933 case Intrinsic::aarch64_neon_ld2:
2934 case Intrinsic::aarch64_neon_ld3:
2935 case Intrinsic::aarch64_neon_ld4:
2936 Info.ReadMem = true;
2937 Info.WriteMem = false;
2938 Info.PtrVal = Inst->getArgOperand(0);
2939 break;
2940 case Intrinsic::aarch64_neon_st2:
2941 case Intrinsic::aarch64_neon_st3:
2942 case Intrinsic::aarch64_neon_st4:
2943 Info.ReadMem = false;
2944 Info.WriteMem = true;
2945 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2946 break;
2947 }
2948
2949 switch (Inst->getIntrinsicID()) {
2950 default:
2951 return false;
2952 case Intrinsic::aarch64_neon_ld2:
2953 case Intrinsic::aarch64_neon_st2:
2954 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2955 break;
2956 case Intrinsic::aarch64_neon_ld3:
2957 case Intrinsic::aarch64_neon_st3:
2958 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2959 break;
2960 case Intrinsic::aarch64_neon_ld4:
2961 case Intrinsic::aarch64_neon_st4:
2962 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2963 break;
2964 }
2965 return true;
2966}
2967
2968/// See if \p I should be considered for address type promotion. We check if \p
2969/// I is a sext with the right type that is used in memory accesses. If it is
2970/// used in a "complex" getelementptr, we allow it to be promoted without
2971/// finding other sext instructions that sign extended the same initial value.
2972/// A getelementptr is considered "complex" if it has more than 2 operands.
2973bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2974 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2975 bool Considerable = false;
2976 AllowPromotionWithoutCommonHeader = false;
2977 if (!isa<SExtInst>(&I))
2978 return false;
2979 Type *ConsideredSExtType =
2980 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2981 if (I.getType() != ConsideredSExtType)
2982 return false;
2983 // See if the sext is the one with the right type and used in at least one
2984 // GetElementPtrInst.
2985 for (const User *U : I.users()) {
2986 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2987 Considerable = true;
2988 // A getelementptr is considered as "complex" if it has more than 2
2989 // operands. We will promote a SExt used in such complex GEP as we
2990 // expect some computation to be merged if they are done on 64 bits.
2991 if (GEPInst->getNumOperands() > 2) {
2992 AllowPromotionWithoutCommonHeader = true;
2993 break;
2994 }
2995 }
2996 }
2997 return Considerable;
2998}
2999
3000bool AArch64TTIImpl::isLegalToVectorizeReduction(
3001 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3002 if (!VF.isScalable())
3003 return true;
3004
3005 Type *Ty = RdxDesc.getRecurrenceType();
3006 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3007 return false;
3008
3009 switch (RdxDesc.getRecurrenceKind()) {
3010 case RecurKind::Add:
3011 case RecurKind::FAdd:
3012 case RecurKind::And:
3013 case RecurKind::Or:
3014 case RecurKind::Xor:
3015 case RecurKind::SMin:
3016 case RecurKind::SMax:
3017 case RecurKind::UMin:
3018 case RecurKind::UMax:
3019 case RecurKind::FMin:
3020 case RecurKind::FMax:
3021 case RecurKind::SelectICmp:
3022 case RecurKind::SelectFCmp:
3023 case RecurKind::FMulAdd:
3024 return true;
3025 default:
3026 return false;
3027 }
3028}
3029
3030InstructionCost
3031AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
3032 bool IsUnsigned, FastMathFlags FMF,
3033 TTI::TargetCostKind CostKind) {
3034 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3035
3036 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3037 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, FMF, CostKind);
3038
3039  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
3040         "Both vector needs to be equally scalable");
3041
3042 InstructionCost LegalizationCost = 0;
3043 if (LT.first > 1) {
3044 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3045 Intrinsic::ID MinMaxOpcode =
3046 Ty->isFPOrFPVectorTy()
3047 ? Intrinsic::maxnum
3048 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
3049 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy},
3050 FMF);
3051 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3052 }
3053
3054 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3055}
3056
3057InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3058 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3059 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3060 InstructionCost LegalizationCost = 0;
3061 if (LT.first > 1) {
3062 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3063 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3064 LegalizationCost *= LT.first - 1;
3065 }
3066
3067 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3068  assert(ISD && "Invalid opcode");
3069 // Add the final reduction cost for the legal horizontal reduction
3070 switch (ISD) {
3071 case ISD::ADD:
3072 case ISD::AND:
3073 case ISD::OR:
3074 case ISD::XOR:
3075 case ISD::FADD:
3076 return LegalizationCost + 2;
3077 default:
3078 return InstructionCost::getInvalid();
3079 }
3080}
3081
3082InstructionCost
3083AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3084 std::optional<FastMathFlags> FMF,
3085 TTI::TargetCostKind CostKind) {
3086 if (TTI::requiresOrderedReduction(FMF)) {
3087 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3088 InstructionCost BaseCost =
3089 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3090 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3091 // end up vectorizing for more computationally intensive loops.
3092 return BaseCost + FixedVTy->getNumElements();
3093 }
3094
3095 if (Opcode != Instruction::FAdd)
3096 return InstructionCost::getInvalid();
3097
3098 auto *VTy = cast<ScalableVectorType>(ValTy);
3099 InstructionCost Cost =
3100 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3101 Cost *= getMaxNumElements(VTy->getElementCount());
3102 return Cost;
3103 }
3104
3105 if (isa<ScalableVectorType>(ValTy))
3106 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3107
3108 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3109 MVT MTy = LT.second;
3110 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3111  assert(ISD && "Invalid opcode");
3112
3113 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3114 // instructions as twice a normal vector add, plus 1 for each legalization
3115 // step (LT.first). This is the only arithmetic vector reduction operation for
3116 // which we have an instruction.
3117 // OR, XOR and AND costs should match the codegen from:
3118 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3119 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3120 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3121 static const CostTblEntry CostTblNoPairwise[]{
3122 {ISD::ADD, MVT::v8i8, 2},
3123 {ISD::ADD, MVT::v16i8, 2},
3124 {ISD::ADD, MVT::v4i16, 2},
3125 {ISD::ADD, MVT::v8i16, 2},
3126 {ISD::ADD, MVT::v4i32, 2},
3127 {ISD::ADD, MVT::v2i64, 2},
3128 {ISD::OR, MVT::v8i8, 15},
3129 {ISD::OR, MVT::v16i8, 17},
3130 {ISD::OR, MVT::v4i16, 7},
3131 {ISD::OR, MVT::v8i16, 9},
3132 {ISD::OR, MVT::v2i32, 3},
3133 {ISD::OR, MVT::v4i32, 5},
3134 {ISD::OR, MVT::v2i64, 3},
3135 {ISD::XOR, MVT::v8i8, 15},
3136 {ISD::XOR, MVT::v16i8, 17},
3137 {ISD::XOR, MVT::v4i16, 7},
3138 {ISD::XOR, MVT::v8i16, 9},
3139 {ISD::XOR, MVT::v2i32, 3},
3140 {ISD::XOR, MVT::v4i32, 5},
3141 {ISD::XOR, MVT::v2i64, 3},
3142 {ISD::AND, MVT::v8i8, 15},
3143 {ISD::AND, MVT::v16i8, 17},
3144 {ISD::AND, MVT::v4i16, 7},
3145 {ISD::AND, MVT::v8i16, 9},
3146 {ISD::AND, MVT::v2i32, 3},
3147 {ISD::AND, MVT::v4i32, 5},
3148 {ISD::AND, MVT::v2i64, 3},
3149 };
3150 switch (ISD) {
3151 default:
3152 break;
3153 case ISD::ADD:
3154 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3155 return (LT.first - 1) + Entry->Cost;
3156 break;
3157 case ISD::XOR:
3158 case ISD::AND:
3159 case ISD::OR:
3160 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3161 if (!Entry)
3162 break;
3163 auto *ValVTy = cast<FixedVectorType>(ValTy);
3164 if (!ValVTy->getElementType()->isIntegerTy(1) &&
3165 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3166 isPowerOf2_32(ValVTy->getNumElements())) {
3167 InstructionCost ExtraCost = 0;
3168 if (LT.first != 1) {
3169 // Type needs to be split, so there is an extra cost of LT.first - 1
3170 // arithmetic ops.
3171 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3172 MTy.getVectorNumElements());
3173 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3174 ExtraCost *= LT.first - 1;
3175 }
3176 return Entry->Cost + ExtraCost;
3177 }
3178 break;
3179 }
3180 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3181}
3182
3183InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3184 static const CostTblEntry ShuffleTbl[] = {
3185 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3186 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3187 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3188 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3189 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3190 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3191 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3192 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3193 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3194 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3195 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3196 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3197 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3198 };
3199
3200 // The code-generator is currently not able to handle scalable vectors
3201 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3202 // it. This change will be removed when code-generation for these types is
3203 // sufficiently reliable.
3204 if (Tp->getElementCount() == ElementCount::getScalable(1))
3205 return InstructionCost::getInvalid();
3206
3207 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3208 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3209 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3210 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3211 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3212 : LT.second;
3213 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3214 InstructionCost LegalizationCost = 0;
3215 if (Index < 0) {
3216 LegalizationCost =
3217 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3218 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3219 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3220 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3221 }
3222
3223  // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
3224  // The cost is computed on the promoted type.
3225 if (LT.second.getScalarType() == MVT::i1) {
3226 LegalizationCost +=
3227 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3228 TTI::CastContextHint::None, CostKind) +
3229 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3230 TTI::CastContextHint::None, CostKind);
3231 }
3232 const auto *Entry =
3233 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3234  assert(Entry && "Illegal Type for Splice");
3235 LegalizationCost += Entry->Cost;
3236 return LegalizationCost * LT.first;
3237}
3238
3239InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3240 VectorType *Tp,
3241 ArrayRef<int> Mask,
3242 TTI::TargetCostKind CostKind,
3243 int Index, VectorType *SubTp,
3244 ArrayRef<const Value *> Args) {
3245 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3246 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3247 // into smaller vectors and sum the cost of each shuffle.
3248 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3249 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3250 cast<FixedVectorType>(Tp)->getNumElements() >
3251 LT.second.getVectorNumElements() &&
3252 !Index && !SubTp) {
3253 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3254    assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3255 unsigned LTNumElts = LT.second.getVectorNumElements();
3256 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3257 VectorType *NTp =
3258 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3259 InstructionCost Cost;
3260 for (unsigned N = 0; N < NumVecs; N++) {
3261 SmallVector<int> NMask;
3262 // Split the existing mask into chunks of size LTNumElts. Track the source
3263 // sub-vectors to ensure the result has at most 2 inputs.
3264 unsigned Source1, Source2;
3265 unsigned NumSources = 0;
3266 for (unsigned E = 0; E < LTNumElts; E++) {
3267 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3268 : PoisonMaskElem;
3269 if (MaskElt < 0) {
3270 NMask.push_back(PoisonMaskElem);
3271 continue;
3272 }
3273
3274 // Calculate which source from the input this comes from and whether it
3275 // is new to us.
3276 unsigned Source = MaskElt / LTNumElts;
3277 if (NumSources == 0) {
3278 Source1 = Source;
3279 NumSources = 1;
3280 } else if (NumSources == 1 && Source != Source1) {
3281 Source2 = Source;
3282 NumSources = 2;
3283 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3284 NumSources++;
3285 }
3286
3287 // Add to the new mask. For the NumSources>2 case these are not correct,
3288 // but are only used for the modular lane number.
3289 if (Source == Source1)
3290 NMask.push_back(MaskElt % LTNumElts);
3291 else if (Source == Source2)
3292 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3293 else
3294 NMask.push_back(MaskElt % LTNumElts);
3295 }
3296 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3297 // getShuffleCost. If not then cost it using the worst case.
3298 if (NumSources <= 2)
3299 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3300 : TTI::SK_PermuteTwoSrc,
3301 NTp, NMask, CostKind, 0, nullptr, Args);
3302 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3303 return ME.value() % LTNumElts == ME.index();
3304 }))
3305 Cost += LTNumElts - 1;
3306 else
3307 Cost += LTNumElts;
3308 }
3309 return Cost;
3310 }
3311
3312 Kind = improveShuffleKindFromMask(Kind, Mask);
3313
3314 // Check for broadcast loads, which are supported by the LD1R instruction.
3315 // In terms of code-size, the shuffle vector is free when a load + dup get
3316 // folded into a LD1R. That's what we check and return here. For performance
3317 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3318 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3319 // that we model the load + dup sequence slightly higher because LD1R is a
3320 // high latency instruction.
3321 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3322 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3323 if (IsLoad && LT.second.isVector() &&
3324 isLegalBroadcastLoad(Tp->getElementType(),
3325 LT.second.getVectorElementCount()))
3326 return 0;
3327 }
3328
3329 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3330 // from the perfect shuffle tables.
3331 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3332 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3333 all_of(Mask, [](int E) { return E < 8; }))
3334 return getPerfectShuffleCost(Mask);
3335
3336 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3337 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3338 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3339 static const CostTblEntry ShuffleTbl[] = {
3340 // Broadcast shuffle kinds can be performed with 'dup'.
3341 {TTI::SK_Broadcast, MVT::v8i8, 1},
3342 {TTI::SK_Broadcast, MVT::v16i8, 1},
3343 {TTI::SK_Broadcast, MVT::v4i16, 1},
3344 {TTI::SK_Broadcast, MVT::v8i16, 1},
3345 {TTI::SK_Broadcast, MVT::v2i32, 1},
3346 {TTI::SK_Broadcast, MVT::v4i32, 1},
3347 {TTI::SK_Broadcast, MVT::v2i64, 1},
3348 {TTI::SK_Broadcast, MVT::v4f16, 1},
3349 {TTI::SK_Broadcast, MVT::v8f16, 1},
3350 {TTI::SK_Broadcast, MVT::v2f32, 1},
3351 {TTI::SK_Broadcast, MVT::v4f32, 1},
3352 {TTI::SK_Broadcast, MVT::v2f64, 1},
3353 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3354 // 'zip1/zip2' instructions.
3355 {TTI::SK_Transpose, MVT::v8i8, 1},
3356 {TTI::SK_Transpose, MVT::v16i8, 1},
3357 {TTI::SK_Transpose, MVT::v4i16, 1},
3358 {TTI::SK_Transpose, MVT::v8i16, 1},
3359 {TTI::SK_Transpose, MVT::v2i32, 1},
3360 {TTI::SK_Transpose, MVT::v4i32, 1},
3361 {TTI::SK_Transpose, MVT::v2i64, 1},
3362 {TTI::SK_Transpose, MVT::v4f16, 1},
3363 {TTI::SK_Transpose, MVT::v8f16, 1},
3364 {TTI::SK_Transpose, MVT::v2f32, 1},
3365 {TTI::SK_Transpose, MVT::v4f32, 1},
3366 {TTI::SK_Transpose, MVT::v2f64, 1},
3367 // Select shuffle kinds.
3368 // TODO: handle vXi8/vXi16.
3369 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3370 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3371 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3372 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3373 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3374 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3375 // PermuteSingleSrc shuffle kinds.
3376 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3377 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3378 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3379 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3380 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3381 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3382 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3383 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3384 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3385 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3386 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3387 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3388 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3389 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3390 // Reverse can be lowered with `rev`.
3391 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3392 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3393 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3394 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3395 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3396 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3397 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3398 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3399 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3400 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3401 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3402 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3403 // Splice can all be lowered as `ext`.
3404 {TTI::SK_Splice, MVT::v2i32, 1},
3405 {TTI::SK_Splice, MVT::v4i32, 1},
3406 {TTI::SK_Splice, MVT::v2i64, 1},
3407 {TTI::SK_Splice, MVT::v2f32, 1},
3408 {TTI::SK_Splice, MVT::v4f32, 1},
3409 {TTI::SK_Splice, MVT::v2f64, 1},
3410 {TTI::SK_Splice, MVT::v8f16, 1},
3411 {TTI::SK_Splice, MVT::v8bf16, 1},
3412 {TTI::SK_Splice, MVT::v8i16, 1},
3413 {TTI::SK_Splice, MVT::v16i8, 1},
3414 {TTI::SK_Splice, MVT::v4bf16, 1},
3415 {TTI::SK_Splice, MVT::v4f16, 1},
3416 {TTI::SK_Splice, MVT::v4i16, 1},
3417 {TTI::SK_Splice, MVT::v8i8, 1},
3418 // Broadcast shuffle kinds for scalable vectors
3419 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3420 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3421 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3422 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3423 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3424 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3425 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3426 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3427 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3428 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3429 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3430 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3431 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3432 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3433 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3434 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3435 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3436 // Handle the cases for vector.reverse with scalable vectors
3437 {TTI::SK_Reverse, MVT::nxv16i8, 1},
3438 {TTI::SK_Reverse, MVT::nxv8i16, 1},
3439 {TTI::SK_Reverse, MVT::nxv4i32, 1},
3440 {TTI::SK_Reverse, MVT::nxv2i64, 1},
3441 {TTI::SK_Reverse, MVT::nxv2f16, 1},
3442 {TTI::SK_Reverse, MVT::nxv4f16, 1},
3443 {TTI::SK_Reverse, MVT::nxv8f16, 1},
3444 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3445 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3446 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3447 {TTI::SK_Reverse, MVT::nxv2f32, 1},
3448 {TTI::SK_Reverse, MVT::nxv4f32, 1},
3449 {TTI::SK_Reverse, MVT::nxv2f64, 1},
3450 {TTI::SK_Reverse, MVT::nxv16i1, 1},
3451 {TTI::SK_Reverse, MVT::nxv8i1, 1},
3452 {TTI::SK_Reverse, MVT::nxv4i1, 1},
3453 {TTI::SK_Reverse, MVT::nxv2i1, 1},
3454 };
3455 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3456 return LT.first * Entry->Cost;
3457 }
3458
3459 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3460 return getSpliceCost(Tp, Index);
3461
3462 // Inserting a subvector can often be done with either a D, S or H register
3463 // move, so long as the inserted vector is "aligned".
3464 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3465 LT.second.getSizeInBits() <= 128 && SubTp) {
3466 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3467 if (SubLT.second.isVector()) {
3468 int NumElts = LT.second.getVectorNumElements();
3469 int NumSubElts = SubLT.second.getVectorNumElements();
3470 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3471 return SubLT.first;
3472 }
3473 }
3474
3475 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3476}
3477
3478static bool containsDecreasingPointers(Loop *TheLoop,
3479 PredicatedScalarEvolution *PSE) {
3480 const ValueToValueMap &Strides = ValueToValueMap();
3481 for (BasicBlock *BB : TheLoop->blocks()) {
3482 // Scan the instructions in the block and look for addresses that are
3483 // consecutive and decreasing.
3484 for (Instruction &I : *BB) {
3485 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
3486 Value *Ptr = getLoadStorePointerOperand(&I);
3487 Type *AccessTy = getLoadStoreType(&I);
3488 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
3489 /*ShouldCheckWrap=*/false)
3490 .value_or(0) < 0)
3491 return true;
3492 }
3493 }
3494 }
3495 return false;
3496}
3497
3498bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
3499 if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3500 return false;
3501
3502 // We don't currently support vectorisation with interleaving for SVE - with
3503 // such loops we're better off not using tail-folding. This gives us a chance
3504 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3505 if (TFI->IAI->hasGroups())
3506 return false;
3507
3508 TailFoldingKind Required; // Defaults to 0.
3509 if (TFI->LVL->getReductionVars().size())
3510 Required.add(TailFoldingKind::TFReductions);
3511 if (TFI->LVL->getFixedOrderRecurrences().size())
3512 Required.add(TailFoldingKind::TFRecurrences);
3513
3514 // We call this to discover whether any load/store pointers in the loop have
3515 // negative strides. This will require extra work to reverse the loop
3516 // predicate, which may be expensive.
3517 if (containsDecreasingPointers(TFI->LVL->getLoop(),
3518 TFI->LVL->getPredicatedScalarEvolution()))
3519 Required.add(TailFoldingKind::TFReverse);
3520 if (!Required)
3521 Required.add(TailFoldingKind::TFSimple);
3522
3523 return (TailFoldingKindLoc & Required) == Required;
3524}
3525
3526InstructionCost
3527AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
3528 int64_t BaseOffset, bool HasBaseReg,
3529 int64_t Scale, unsigned AddrSpace) const {
3530 // Scaling factors are not free at all.
3531 // Operands | Rt Latency
3532 // -------------------------------------------
3533 // Rt, [Xn, Xm] | 4
3534 // -------------------------------------------
3535 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3536 // Rt, [Xn, Wm, <extend> #imm] |
3537 TargetLoweringBase::AddrMode AM;
3538 AM.BaseGV = BaseGV;
3539 AM.BaseOffs = BaseOffset;
3540 AM.HasBaseReg = HasBaseReg;
3541 AM.Scale = Scale;
3542 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3543 // Scale represents reg2 * scale, thus account for 1 if
3544 // it is not equal to 0 or 1.
3545 return AM.Scale != 0 && AM.Scale != 1;
3546 return -1;
3547}

/build/source/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/ADT/bit.h"
17#include "llvm/Support/Compiler.h"
18#include <cassert>
19#include <climits>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25namespace llvm {
26
27/// Mathematical constants.
28namespace numbers {
29// TODO: Track C++20 std::numbers.
30// TODO: Favor using the hexadecimal FP constants (requires C++17).
31constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
32 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
33 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
34 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
35 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
36 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
37 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
38 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
39 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
40 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
41 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
42 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
43 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
44 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
45 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
46constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
47 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
48 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
49 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
50 log2ef = 1.44269504F, // (0x1.715476P+0)
51 log10ef = .434294482F, // (0x1.bcb7b2P-2)
52 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
53 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
54 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
55 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
56 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
57 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
58 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
59 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
60 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
61} // namespace numbers
62
63/// Count number of 0's from the least significant bit to the most
64/// stopping at the first 1.
65///
66/// Only unsigned integral types are allowed.
67///
68/// Returns std::numeric_limits<T>::digits on an input of 0.
69template <typename T>
70LLVM_DEPRECATED("Use llvm::countr_zero instead.", "llvm::countr_zero")
71unsigned countTrailingZeros(T Val) {
72 static_assert(std::is_unsigned_v<T>,
73 "Only unsigned integral types are allowed.");
74 return llvm::countr_zero(Val);
75}
76
77/// Count number of 0's from the most significant bit to the least
78/// stopping at the first 1.
79///
80/// Only unsigned integral types are allowed.
81///
82/// Returns std::numeric_limits<T>::digits on an input of 0.
83template <typename T>
84LLVM_DEPRECATED("Use llvm::countl_zero instead.", "llvm::countl_zero")
85unsigned countLeadingZeros(T Val) {
86 static_assert(std::is_unsigned_v<T>,
87 "Only unsigned integral types are allowed.");
88 return llvm::countl_zero(Val);
89}
90
91/// Create a bitmask with the N right-most bits set to 1, and all other
92/// bits set to 0. Only unsigned types are allowed.
93template <typename T> T maskTrailingOnes(unsigned N) {
94 static_assert(std::is_unsigned_v<T>, "Invalid type!");
95  const unsigned Bits = CHAR_BIT * sizeof(T);
96  assert(N <= Bits && "Invalid bit index");
97 return N == 0 ? 0 : (T(-1) >> (Bits - N));
98}
99
100/// Create a bitmask with the N left-most bits set to 1, and all other
101/// bits set to 0. Only unsigned types are allowed.
102template <typename T> T maskLeadingOnes(unsigned N) {
103  return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
104}
105
106/// Create a bitmask with the N right-most bits set to 0, and all other
107/// bits set to 1. Only unsigned types are allowed.
108template <typename T> T maskTrailingZeros(unsigned N) {
109  return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
110}
111
112/// Create a bitmask with the N left-most bits set to 0, and all other
113/// bits set to 1. Only unsigned types are allowed.
114template <typename T> T maskLeadingZeros(unsigned N) {
115  return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
116}
117
118/// Macro compressed bit reversal table for 256 bits.
119///
120/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
121static const unsigned char BitReverseTable256[256] = {
122#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
123#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
124#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
125 R6(0), R6(2), R6(1), R6(3)
126#undef R2
127#undef R4
128#undef R6
129};
130
131/// Reverse the bits in \p Val.
132template <typename T> T reverseBits(T Val) {
133#if __has_builtin(__builtin_bitreverse8)
134 if constexpr (std::is_same_v<T, uint8_t>)
135 return __builtin_bitreverse8(Val);
136#endif
137#if __has_builtin(__builtin_bitreverse16)
138 if constexpr (std::is_same_v<T, uint16_t>)
139 return __builtin_bitreverse16(Val);
140#endif
141#if __has_builtin(__builtin_bitreverse32)
142 if constexpr (std::is_same_v<T, uint32_t>)
143 return __builtin_bitreverse32(Val);
144#endif
145#if __has_builtin(__builtin_bitreverse64)
146 if constexpr (std::is_same_v<T, uint64_t>)
147 return __builtin_bitreverse64(Val);
148#endif
149
150 unsigned char in[sizeof(Val)];
151 unsigned char out[sizeof(Val)];
152 std::memcpy(in, &Val, sizeof(Val));
153 for (unsigned i = 0; i < sizeof(Val); ++i)
154 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
155 std::memcpy(&Val, out, sizeof(Val));
156 return Val;
157}
158
159// NOTE: The following support functions use the _32/_64 extensions instead of
160// type overloading so that signed and unsigned integers can be used without
161// ambiguity.
162
163/// Return the high 32 bits of a 64 bit value.
164constexpr inline uint32_t Hi_32(uint64_t Value) {
165 return static_cast<uint32_t>(Value >> 32);
166}
167
168/// Return the low 32 bits of a 64 bit value.
169constexpr inline uint32_t Lo_32(uint64_t Value) {
170 return static_cast<uint32_t>(Value);
171}
172
173/// Make a 64-bit integer from a high / low pair of 32-bit integers.
174constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
175 return ((uint64_t)High << 32) | (uint64_t)Low;
176}
177
178/// Checks if an integer fits into the given bit width.
179template <unsigned N> constexpr inline bool isInt(int64_t x) {
180 if constexpr (N == 8)
181 return static_cast<int8_t>(x) == x;
182 if constexpr (N == 16)
183 return static_cast<int16_t>(x) == x;
184 if constexpr (N == 32)
185 return static_cast<int32_t>(x) == x;
186 if constexpr (N < 64)
187    return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
188 (void)x; // MSVC v19.25 warns that x is unused.
189 return true;
190}
191
192/// Checks if a signed integer is an N bit number shifted left by S.
193template <unsigned N, unsigned S>
194constexpr inline bool isShiftedInt(int64_t x) {
195 static_assert(
196 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
197 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
198  return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
199}
200
201/// Checks if an unsigned integer fits into the given bit width.
202template <unsigned N> constexpr inline bool isUInt(uint64_t x) {
203 static_assert(N > 0, "isUInt<0> doesn't make sense");
204 if constexpr (N == 8)
205 return static_cast<uint8_t>(x) == x;
206 if constexpr (N == 16)
207 return static_cast<uint16_t>(x) == x;
208 if constexpr (N == 32)
209 return static_cast<uint32_t>(x) == x;
210 if constexpr (N < 64)
211    return x < (UINT64_C(1) << (N));
212 (void)x; // MSVC v19.25 warns that x is unused.
213 return true;
214}
215
216/// Checks if an unsigned integer is an N bit number shifted left by S.
217template <unsigned N, unsigned S>
218constexpr inline bool isShiftedUInt(uint64_t x) {
219 static_assert(
220 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
221 static_assert(N + S <= 64,
222 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
223 // Per the two static_asserts above, S must be strictly less than 64. So
224 // 1 << S is not undefined behavior.
225  return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
226}
227
228/// Gets the maximum value for a N-bit unsigned integer.
229inline uint64_t maxUIntN(uint64_t N) {
230  assert(N > 0 && N <= 64 && "integer width out of range");
231
232 // uint64_t(1) << 64 is undefined behavior, so we can't do
233 // (uint64_t(1) << N) - 1
234 // without checking first that N != 64. But this works and doesn't have a
235 // branch.
236  return UINT64_MAX >> (64 - N);
237}
238
239/// Gets the minimum value for a N-bit signed integer.
240inline int64_t minIntN(int64_t N) {
241  assert(N > 0 && N <= 64 && "integer width out of range");
242
243  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
244}
245
246/// Gets the maximum value for a N-bit signed integer.
247inline int64_t maxIntN(int64_t N) {
248  assert(N > 0 && N <= 64 && "integer width out of range");
249
250 // This relies on two's complement wraparound when N == 64, so we convert to
251 // int64_t only at the very end to avoid UB.
252  return (UINT64_C(1) << (N - 1)) - 1;
253}
254
255/// Checks if an unsigned integer fits into the given (dynamic) bit width.
256inline bool isUIntN(unsigned N, uint64_t x) {
257 return N >= 64 || x <= maxUIntN(N);
258}
259
260/// Checks if a signed integer fits into the given (dynamic) bit width.
261inline bool isIntN(unsigned N, int64_t x) {
262 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
263}
264
265/// Return true if the argument is a non-empty sequence of ones starting at the
266/// least significant bit with the remainder zero (32 bit version).
267/// Ex. isMask_32(0x0000FFFFU) == true.
268constexpr inline bool isMask_32(uint32_t Value) {
269 return Value && ((Value + 1) & Value) == 0;
270}
271
272/// Return true if the argument is a non-empty sequence of ones starting at the
273/// least significant bit with the remainder zero (64 bit version).
274constexpr inline bool isMask_64(uint64_t Value) {
275 return Value && ((Value + 1) & Value) == 0;
276}
277
278/// Return true if the argument contains a non-empty sequence of ones with the
279/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
280constexpr inline bool isShiftedMask_32(uint32_t Value) {
281 return Value && isMask_32((Value - 1) | Value);
282}
283
284/// Return true if the argument contains a non-empty sequence of ones with the
285/// remainder zero (64 bit version.)
286constexpr inline bool isShiftedMask_64(uint64_t Value) {
287 return Value && isMask_64((Value - 1) | Value);
288}
289
290/// Return true if the argument is a power of two > 0.
291/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
292constexpr inline bool isPowerOf2_32(uint32_t Value) {
293 return llvm::has_single_bit(Value);
294}
295
296/// Return true if the argument is a power of two > 0 (64 bit edition.)
297constexpr inline bool isPowerOf2_64(uint64_t Value) {
298 return llvm::has_single_bit(Value);
299}
300
301/// Count the number of ones from the most significant bit to the first
302/// zero bit.
303///
304/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
305/// Only unsigned integral types are allowed.
306///
307/// Returns std::numeric_limits<T>::digits on an input of all ones.
308template <typename T>
309LLVM_DEPRECATED("Use llvm::countl_one instead.", "llvm::countl_one")
310unsigned countLeadingOnes(T Value) {
311 static_assert(std::is_unsigned_v<T>,
312 "Only unsigned integral types are allowed.");
313 return llvm::countl_one<T>(Value);
314}
315
316/// Count the number of ones from the least significant bit to the first
317/// zero bit.
318///
319/// Ex. countTrailingOnes(0x00FF00FF) == 8.
320/// Only unsigned integral types are allowed.
321///
322/// Returns std::numeric_limits<T>::digits on an input of all ones.
323template <typename T>
324LLVM_DEPRECATED("Use llvm::countr_one instead.", "llvm::countr_one")
325unsigned countTrailingOnes(T Value) {
326 static_assert(std::is_unsigned_v<T>,
327 "Only unsigned integral types are allowed.");
328 return llvm::countr_one<T>(Value);
329}
330
331/// Count the number of set bits in a value.
332/// Ex. countPopulation(0xF000F000) = 8
333/// Returns 0 if the word is zero.
334template <typename T>
335LLVM_DEPRECATED("Use llvm::popcount instead.", "llvm::popcount")
336inline unsigned countPopulation(T Value) {
337 static_assert(std::is_unsigned_v<T>,
338 "Only unsigned integral types are allowed.");
339 return (unsigned)llvm::popcount(Value);
340}
341
342/// Return true if the argument contains a non-empty sequence of ones with the
343/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
344/// If true, \p MaskIdx will specify the index of the lowest set bit and \p
345/// MaskLen is updated to specify the length of the mask, else neither are
346/// updated.
347inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx,
348 unsigned &MaskLen) {
349 if (!isShiftedMask_32(Value))
350 return false;
351 MaskIdx = llvm::countr_zero(Value);
352 MaskLen = llvm::popcount(Value);
353 return true;
354}
355
356/// Return true if the argument contains a non-empty sequence of ones with the
357/// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index
358/// of the lowest set bit and \p MaskLen is updated to specify the length of the
359/// mask, else neither are updated.
360inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx,
361 unsigned &MaskLen) {
362 if (!isShiftedMask_64(Value))
363 return false;
364 MaskIdx = llvm::countr_zero(Value);
365 MaskLen = llvm::popcount(Value);
366 return true;
367}
368
369/// Compile time Log2.
370/// Valid only for positive powers of two.
371template <size_t kValue> constexpr inline size_t CTLog2() {
372 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
373 "Value is not a valid power of 2");
374 return 1 + CTLog2<kValue / 2>();
375}
376
377template <> constexpr inline size_t CTLog2<1>() { return 0; }
378
379/// Return the floor log base 2 of the specified value, -1 if the value is zero.
380/// (32 bit edition.)
381/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
382inline unsigned Log2_32(uint32_t Value) {
383 return 31 - llvm::countl_zero(Value);
12. Returning the value 4294967295
384}
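
Step 12 above is exactly this underflow: for Value == 0, countl_zero returns 32, so 31 - 32 yields -1, which converts to 4294967295 in the unsigned return type. A self-contained sketch that reproduces the value (hypothetical test code using the C++20 <bit> equivalent, not part of this report):

#include <bit>      // std::countl_zero (C++20), analogous to llvm::countl_zero
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Value = 0;
  // Same arithmetic as Log2_32: 31 - countl_zero(Value), stored as unsigned.
  unsigned Result = 31 - std::countl_zero(Value); // 31 - 32 wraps to 4294967295
  std::printf("%u\n", Result);
  // Using Result as the shift amount for an 'int', as the caller does, is UB.
  return 0;
}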
385
386/// Return the floor log base 2 of the specified value, -1 if the value is zero.
387/// (64 bit edition.)
388inline unsigned Log2_64(uint64_t Value) {
389 return 63 - llvm::countl_zero(Value);
390}
391
392/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
393/// (32 bit edition).
394/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
395inline unsigned Log2_32_Ceil(uint32_t Value) {
396 return 32 - llvm::countl_zero(Value - 1);
397}
398
399/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
400/// (64 bit edition.)
401inline unsigned Log2_64_Ceil(uint64_t Value) {
402 return 64 - llvm::countl_zero(Value - 1);
403}
404
405/// This function takes a 64-bit integer and returns the bit equivalent double.
406LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<double>")
407inline double BitsToDouble(uint64_t Bits) {
408 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
409 return llvm::bit_cast<double>(Bits);
410}
411
412/// This function takes a 32-bit integer and returns the bit equivalent float.
413LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<float>")
414inline float BitsToFloat(uint32_t Bits) {
415 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
416 return llvm::bit_cast<float>(Bits);
417}
418
419/// This function takes a double and returns the bit equivalent 64-bit integer.
420/// Note that copying doubles around changes the bits of NaNs on some hosts,
421/// notably x86, so this routine cannot be used if these bits are needed.
422LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>")
423inline uint64_t DoubleToBits(double Double) {
424 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
425 return llvm::bit_cast<uint64_t>(Double);
426}
427
428/// This function takes a float and returns the bit equivalent 32-bit integer.
429/// Note that copying floats around changes the bits of NaNs on some hosts,
430/// notably x86, so this routine cannot be used if these bits are needed.
431LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>")
432inline uint32_t FloatToBits(float Float) {
433 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
434 return llvm::bit_cast<uint32_t>(Float);
435}
436
437/// A and B are either alignments or offsets. Return the minimum alignment that
438/// may be assumed after adding the two together.
439constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
440 // The largest power of 2 that divides both A and B.
441 //
442 // Replace "-Value" by "1+~Value" in the following commented code to avoid
443 // MSVC warning C4146
444 // return (A | B) & -(A | B);
445 return (A | B) & (1 + ~(A | B));
446}
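// Editor's note: an illustrative compile-time usage sketch, not part of
// MathExtras.h. MinAlign isolates the largest power of two dividing both inputs.
static_assert(MinAlign(8, 12) == 4, "largest power of two dividing 8 and 12");
static_assert(MinAlign(16, 32) == 16, "the smaller alignment wins");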
447
448/// Returns the next power of two (in 64-bits) that is strictly greater than A.
449/// Returns zero on overflow.
450constexpr inline uint64_t NextPowerOf2(uint64_t A) {
451 A |= (A >> 1);
452 A |= (A >> 2);
453 A |= (A >> 4);
454 A |= (A >> 8);
455 A |= (A >> 16);
456 A |= (A >> 32);
457 return A + 1;
458}
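// Editor's note: an illustrative compile-time usage sketch, not part of
// MathExtras.h. Note the "strictly greater" contract: powers of two are bumped.
static_assert(NextPowerOf2(5) == 8, "rounds up");
static_assert(NextPowerOf2(8) == 16, "strictly greater than a power of two");
static_assert(NextPowerOf2(0) == 1, "zero yields one");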
459
460/// Returns the power of two which is less than or equal to the given value.
461/// Essentially, it is a floor operation across the domain of powers of two.
462LLVM_DEPRECATED("use llvm::bit_floor instead", "llvm::bit_floor")__attribute__((deprecated("use llvm::bit_floor instead", "llvm::bit_floor"
)))
463inline uint64_t PowerOf2Floor(uint64_t A) {
464 return llvm::bit_floor(A);
465}
466
467/// Returns the power of two which is greater than or equal to the given value.
468/// Essentially, it is a ceil operation across the domain of powers of two.
469inline uint64_t PowerOf2Ceil(uint64_t A) {
470 if (!A)
471 return 0;
472 return NextPowerOf2(A - 1);
473}
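// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical. Unlike NextPowerOf2, this is a true ceiling.
inline void power_of_2_ceil_usage_sketch() {
  assert(PowerOf2Ceil(5) == 8);  // rounded up to the next power of two
  assert(PowerOf2Ceil(8) == 8);  // powers of two are returned unchanged
  assert(PowerOf2Ceil(0) == 0);  // zero is passed through
}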
474
475/// Returns the next integer (mod 2**64) that is greater than or equal to
476/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
477///
478/// Examples:
479/// \code
480/// alignTo(5, 8) = 8
481/// alignTo(17, 8) = 24
482/// alignTo(~0LL, 8) = 0
483/// alignTo(321, 255) = 510
484/// \endcode
485inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
486  assert(Align != 0u && "Align can't be 0.");
487 return (Value + Align - 1) / Align * Align;
488}
489
490inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) {
491  assert(Align != 0 && (Align & (Align - 1)) == 0 &&
492         "Align must be a power of 2");
493 return (Value + Align - 1) & -Align;
494}
495
496/// If non-zero \p Skew is specified, the return value will be a minimal integer
497/// that is greater than or equal to \p Value and equal to \p Align * N +
498/// \p Skew for some integer N. If \p Skew is larger than \p Align, its value
499/// is adjusted to '\p Skew mod \p Align'. \p Align must be non-zero.
500///
501/// Examples:
502/// \code
503/// alignTo(5, 8, 7) = 7
504/// alignTo(17, 8, 1) = 17
505/// alignTo(~0LL, 8, 3) = 3
506/// alignTo(321, 255, 42) = 552
507/// \endcode
508inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) {
509  assert(Align != 0u && "Align can't be 0.");
510 Skew %= Align;
511 return alignTo(Value - Skew, Align) + Skew;
512}
513
514/// Returns the next integer (mod 2**64) that is greater than or equal to
515/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
516template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
517 static_assert(Align != 0u, "Align must be non-zero");
518 return (Value + Align - 1) / Align * Align;
519}
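// Editor's note: an illustrative compile-time usage sketch, not part of
// MathExtras.h. The explicit template argument selects this overload.
static_assert(alignTo<8>(5) == 8, "rounds up to the next multiple of 8");
static_assert(alignTo<8>(16) == 16, "multiples of the alignment are unchanged");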
520
521/// Returns the integer ceil(Numerator / Denominator).
522inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
523 return alignTo(Numerator, Denominator) / Denominator;
524}
525
526/// Returns the integer nearest(Numerator / Denominator).
527inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
528 return (Numerator + (Denominator / 2)) / Denominator;
529}
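// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void integer_division_usage_sketch() {
  assert(divideCeil(7, 3) == 3 && divideCeil(6, 3) == 2);
  assert(divideNearest(7, 3) == 2 && divideNearest(8, 3) == 3); // 7/3 rounds down, 8/3 up
}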
530
531/// Returns the largest uint64_t that is less than or equal to \p Value and is
532/// \p Skew mod \p Align. \p Align must be non-zero.
533inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
534  assert(Align != 0u && "Align can't be 0.");
535 Skew %= Align;
536 return (Value - Skew) / Align * Align + Skew;
537}
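// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void align_down_usage_sketch() {
  assert(alignDown(17, 8) == 16);    // largest multiple of 8 not exceeding 17
  assert(alignDown(17, 8, 3) == 11); // largest value <= 17 that is 3 (mod 8)
}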
538
539/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
540/// Requires 0 < B <= 32.
541template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
542 static_assert(B > 0, "Bit width can't be 0.");
543 static_assert(B <= 32, "Bit width out of range.");
544 return int32_t(X << (32 - B)) >> (32 - B);
545}
546
547/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
548/// Requires 0 < B <= 32.
549inline int32_t SignExtend32(uint32_t X, unsigned B) {
550  assert(B > 0 && "Bit width can't be 0.");
551  assert(B <= 32 && "Bit width out of range.");
552 return int32_t(X << (32 - B)) >> (32 - B);
553}
554
555/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
556/// Requires 0 < B <= 64.
557template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
558 static_assert(B > 0, "Bit width can't be 0.");
559 static_assert(B <= 64, "Bit width out of range.");
560 return int64_t(x << (64 - B)) >> (64 - B);
561}
562
563/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
564/// Requires 0 < B <= 64.
565inline int64_t SignExtend64(uint64_t X, unsigned B) {
566  assert(B > 0 && "Bit width can't be 0.");
567  assert(B <= 64 && "Bit width out of range.");
568 return int64_t(X << (64 - B)) >> (64 - B);
569}
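// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void sign_extend_usage_sketch() {
  assert(SignExtend32<8>(0xFF) == -1);        // 0xFF is -1 as an 8-bit value
  assert(SignExtend32(0x7F, 8) == 127);       // sign bit clear, value preserved
  assert(SignExtend64(0x8000, 16) == -32768); // 64-bit variant, runtime width
}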
570
571/// Subtract two unsigned integers, X and Y, of type T and return the absolute
572/// value of the result.
573template <typename T>
574std::enable_if_t<std::is_unsigned_v<T>, T> AbsoluteDifference(T X, T Y) {
575 return X > Y ? (X - Y) : (Y - X);
576}
577
578/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
579/// maximum representable value of T on overflow. ResultOverflowed indicates if
580/// the result is larger than the maximum representable value of type T.
581template <typename T>
582std::enable_if_t<std::is_unsigned_v<T>, T>
583SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
584 bool Dummy;
585 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
586 // Hacker's Delight, p. 29
587 T Z = X + Y;
588 Overflowed = (Z < X || Z < Y);
589 if (Overflowed)
590 return std::numeric_limits<T>::max();
591 else
592 return Z;
593}
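// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void saturating_add_usage_sketch() {
  bool Overflowed = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  assert(SaturatingAdd<uint8_t>(100, 100, &Overflowed) == 200 && !Overflowed);
}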
594
595/// Add multiple unsigned integers of type T. Clamp the result to the
596/// maximum representable value of T on overflow.
597template <class T, class... Ts>
598std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(T X, T Y, T Z,
599 Ts... Args) {
600 bool Overflowed = false;
601 T XY = SaturatingAdd(X, Y, &Overflowed);
602 if (Overflowed)
603 return SaturatingAdd(std::numeric_limits<T>::max(), T(1), Args...);
604 return SaturatingAdd(XY, Z, Args...);
605}
606
607/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
608/// maximum representable value of T on overflow. ResultOverflowed indicates if
609/// the result is larger than the maximum representable value of type T.
610template <typename T>
611std::enable_if_t<std::is_unsigned_v<T>, T>
612SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
613 bool Dummy;
614 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
615
616 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
617 // because it fails for uint16_t (where multiplication can have undefined
618 // behavior due to promotion to int), and requires a division in addition
619 // to the multiplication.
620
621 Overflowed = false;
622
623 // Log2(Z) would be either Log2Z or Log2Z + 1.
624 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
625 // will necessarily be less than Log2Max as desired.
626 int Log2Z = Log2_64(X) + Log2_64(Y);
627 const T Max = std::numeric_limits<T>::max();
628 int Log2Max = Log2_64(Max);
629 if (Log2Z < Log2Max) {
630 return X * Y;
631 }
632 if (Log2Z > Log2Max) {
633 Overflowed = true;
634 return Max;
635 }
636
637 // We're going to use the top bit, and maybe overflow one
638 // bit past it. Multiply all but the bottom bit then add
639 // that on at the end.
640 T Z = (X >> 1) * Y;
641 if (Z & ~(Max >> 1)) {
642 Overflowed = true;
643 return Max;
644 }
645 Z <<= 1;
646 if (X & 1)
647 return SaturatingAdd(Z, Y, ResultOverflowed);
648
649 return Z;
650}
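// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void saturating_multiply_usage_sketch() {
  bool Overflowed = false;
  assert(SaturatingMultiply<uint16_t>(200, 300, &Overflowed) == 60000 && !Overflowed);
  assert(SaturatingMultiply<uint16_t>(300, 300, &Overflowed) == 65535 && Overflowed);
}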
651
652/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
653/// the product. Clamp the result to the maximum representable value of T on
654/// overflow. ResultOverflowed indicates if the result is larger than the
655/// maximum representable value of type T.
656template <typename T>
657std::enable_if_t<std::is_unsigned_v<T>, T>
658SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
659 bool Dummy;
660 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
661
662 T Product = SaturatingMultiply(X, Y, &Overflowed);
663 if (Overflowed)
664 return Product;
665
666 return SaturatingAdd(A, Product, &Overflowed);
667}
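// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void saturating_multiply_add_usage_sketch() {
  bool Overflowed = false;
  assert(SaturatingMultiplyAdd<uint32_t>(10, 20, 5, &Overflowed) == 205 && !Overflowed);
  assert(SaturatingMultiplyAdd<uint32_t>(65536, 65536, 1, &Overflowed) == UINT32_MAX &&
         Overflowed); // the product already saturates, so the addend is dropped
}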
668
669/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
670extern const float huge_valf;
671
672
673/// Add two signed integers, computing the two's complement truncated result,
674/// returning true if overflow occurred.
675template <typename T>
676std::enable_if_t<std::is_signed_v<T>, T> AddOverflow(T X, T Y, T &Result) {
677#if __has_builtin(__builtin_add_overflow)
678 return __builtin_add_overflow(X, Y, &Result);
679#else
680 // Perform the unsigned addition.
681 using U = std::make_unsigned_t<T>;
682 const U UX = static_cast<U>(X);
683 const U UY = static_cast<U>(Y);
684 const U UResult = UX + UY;
685
686 // Convert to signed.
687 Result = static_cast<T>(UResult);
688
689 // Adding two positive numbers should result in a positive number.
690 if (X > 0 && Y > 0)
691 return Result <= 0;
692 // Adding two negatives should result in a negative number.
693 if (X < 0 && Y < 0)
694 return Result >= 0;
695 return false;
696#endif
697}
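// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void add_overflow_usage_sketch() {
  int32_t Result;
  assert(!AddOverflow<int32_t>(INT32_MAX - 1, 1, Result) && Result == INT32_MAX);
  assert(AddOverflow<int32_t>(INT32_MAX, 1, Result)); // wraps and reports overflow
}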
698
699/// Subtract two signed integers, computing the two's complement truncated
700/// result, returning true if an overflow occurred.
701template <typename T>
702std::enable_if_t<std::is_signed_v<T>, T> SubOverflow(T X, T Y, T &Result) {
703#if __has_builtin(__builtin_sub_overflow)
704 return __builtin_sub_overflow(X, Y, &Result);
705#else
706  // Perform the unsigned subtraction.
707 using U = std::make_unsigned_t<T>;
708 const U UX = static_cast<U>(X);
709 const U UY = static_cast<U>(Y);
710 const U UResult = UX - UY;
711
712 // Convert to signed.
713 Result = static_cast<T>(UResult);
714
715 // Subtracting a positive number from a negative results in a negative number.
716 if (X <= 0 && Y > 0)
717 return Result >= 0;
718 // Subtracting a negative number from a positive results in a positive number.
719 if (X >= 0 && Y < 0)
720 return Result <= 0;
721 return false;
722#endif
723}
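// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void sub_overflow_usage_sketch() {
  int32_t Result;
  assert(!SubOverflow<int32_t>(INT32_MIN + 1, 1, Result) && Result == INT32_MIN);
  assert(SubOverflow<int32_t>(INT32_MIN, 1, Result)); // wraps and reports overflow
}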
724
725/// Multiply two signed integers, computing the two's complement truncated
726/// result, returning true if an overflow occurred.
727template <typename T>
728std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
729 // Perform the unsigned multiplication on absolute values.
730 using U = std::make_unsigned_t<T>;
731 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
732 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
733 const U UResult = UX * UY;
734
735 // Convert to signed.
736 const bool IsNegative = (X < 0) ^ (Y < 0);
737 Result = IsNegative ? (0 - UResult) : UResult;
738
739 // If any of the args was 0, result is 0 and no overflow occurs.
740 if (UX == 0 || UY == 0)
741 return false;
742
743 // UX and UY are in [1, 2^n], where n is the number of digits.
744 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
745 // positive) divided by an argument compares to the other.
746 if (IsNegative)
747 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
748 else
749 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
750}
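// Editor's note: an illustrative usage sketch, not part of MathExtras.h; the
// helper name is hypothetical.
inline void mul_overflow_usage_sketch() {
  int32_t Result;
  assert(!MulOverflow<int32_t>(-4, 5, Result) && Result == -20);
  assert(MulOverflow<int32_t>(65536, 65536, Result)); // 2^32 does not fit in int32_t
}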
751
752} // End llvm namespace
753
754#endif