Bug Summary

File: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 1754, column 21
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
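
For context, shifting a 32-bit int by 32 or more bits is undefined behaviour in C++, and 4294967295 is ~0U, i.e. an "invalid"/wrapped unsigned value used as the shift count. The flagged expression at line 1754 is not part of the excerpt below; the following is only a minimal, hypothetical sketch of the pattern this checker reports and of a typical guard, not the actual code from this file (the names unguarded/guarded/ShiftAmt are made up for illustration):

// Hypothetical illustration only -- not the code at line 1754.
unsigned unguarded(unsigned ShiftAmt) {
  // If ShiftAmt ends up as ~0U (4294967295), e.g. because a lookup failed and
  // returned a sentinel, the shift count is >= the 32-bit width of 'int' and
  // the shift is undefined behaviour -- exactly what the analyzer reports.
  return 1 << ShiftAmt;
}

unsigned guarded(unsigned ShiftAmt) {
  // Typical remedy: prove the count is in range (assert or early return)
  // before the shift executes.
  if (ShiftAmt >= 32)
    return 0;
  return 1u << ShiftAmt;
}

In real code the fix usually takes the same shape: an assertion or an early bail-out that establishes the shift amount is smaller than the width of the shifted type before the shift is evaluated.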

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/lib/Target/AArch64 -I include -I /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-command-line-argument -Wno-unknown-warning-option -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/build-llvm -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-26-234817-15343-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "MCTargetDesc/AArch64AddressingModes.h"
12#include "llvm/Analysis/IVDescriptors.h"
13#include "llvm/Analysis/LoopInfo.h"
14#include "llvm/Analysis/TargetTransformInfo.h"
15#include "llvm/CodeGen/BasicTTIImpl.h"
16#include "llvm/CodeGen/CostTable.h"
17#include "llvm/CodeGen/TargetLowering.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/IntrinsicsAArch64.h"
21#include "llvm/IR/PatternMatch.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <algorithm>
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "aarch64tti"
29
30static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31 cl::init(true), cl::Hidden);
32
33bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
34 const Function *Callee) const {
35 const TargetMachine &TM = getTLI()->getTargetMachine();
36
37 const FeatureBitset &CallerBits =
38 TM.getSubtargetImpl(*Caller)->getFeatureBits();
39 const FeatureBitset &CalleeBits =
40 TM.getSubtargetImpl(*Callee)->getFeatureBits();
41
42 // Inline a callee if its target-features are a subset of the caller's
43 // target-features.
44 return (CallerBits & CalleeBits) == CalleeBits;
45}
46
47/// Calculate the cost of materializing a 64-bit value. This helper
48/// method might only calculate a fraction of a larger immediate. Therefore it
49/// is valid to return a cost of ZERO.
50InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
51 // Check if the immediate can be encoded within an instruction.
52 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
53 return 0;
54
55 if (Val < 0)
56 Val = ~Val;
57
58 // Calculate how many moves we will need to materialize this constant.
59 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
60 AArch64_IMM::expandMOVImm(Val, 64, Insn);
61 return Insn.size();
62}
63
64/// Calculate the cost of materializing the given constant.
65InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
66 TTI::TargetCostKind CostKind) {
67 assert(Ty->isIntegerTy());
68
69 unsigned BitSize = Ty->getPrimitiveSizeInBits();
70 if (BitSize == 0)
71 return ~0U;
72
73 // Sign-extend all constants to a multiple of 64-bit.
74 APInt ImmVal = Imm;
75 if (BitSize & 0x3f)
76 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
77
78 // Split the constant into 64-bit chunks and calculate the cost for each
79 // chunk.
80 InstructionCost Cost = 0;
81 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
82 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
83 int64_t Val = Tmp.getSExtValue();
84 Cost += getIntImmCost(Val);
85 }
86 // We need at least one instruction to materialize the constant.
87 return std::max<InstructionCost>(1, Cost);
88}
89
90InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
91 const APInt &Imm, Type *Ty,
92 TTI::TargetCostKind CostKind,
93 Instruction *Inst) {
94 assert(Ty->isIntegerTy());
95
96 unsigned BitSize = Ty->getPrimitiveSizeInBits();
97 // There is no cost model for constants with a bit size of 0. Return TCC_Free
98 // here, so that constant hoisting will ignore this constant.
99 if (BitSize == 0)
100 return TTI::TCC_Free;
101
102 unsigned ImmIdx = ~0U;
103 switch (Opcode) {
104 default:
105 return TTI::TCC_Free;
106 case Instruction::GetElementPtr:
107 // Always hoist the base address of a GetElementPtr.
108 if (Idx == 0)
109 return 2 * TTI::TCC_Basic;
110 return TTI::TCC_Free;
111 case Instruction::Store:
112 ImmIdx = 0;
113 break;
114 case Instruction::Add:
115 case Instruction::Sub:
116 case Instruction::Mul:
117 case Instruction::UDiv:
118 case Instruction::SDiv:
119 case Instruction::URem:
120 case Instruction::SRem:
121 case Instruction::And:
122 case Instruction::Or:
123 case Instruction::Xor:
124 case Instruction::ICmp:
125 ImmIdx = 1;
126 break;
127 // Always return TCC_Free for the shift value of a shift instruction.
128 case Instruction::Shl:
129 case Instruction::LShr:
130 case Instruction::AShr:
131 if (Idx == 1)
132 return TTI::TCC_Free;
133 break;
134 case Instruction::Trunc:
135 case Instruction::ZExt:
136 case Instruction::SExt:
137 case Instruction::IntToPtr:
138 case Instruction::PtrToInt:
139 case Instruction::BitCast:
140 case Instruction::PHI:
141 case Instruction::Call:
142 case Instruction::Select:
143 case Instruction::Ret:
144 case Instruction::Load:
145 break;
146 }
147
148 if (Idx == ImmIdx) {
149 int NumConstants = (BitSize + 63) / 64;
150 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
151 return (Cost <= NumConstants * TTI::TCC_Basic)
152 ? static_cast<int>(TTI::TCC_Free)
153 : Cost;
154 }
155 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
156}
157
158InstructionCost
159AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
160 const APInt &Imm, Type *Ty,
161 TTI::TargetCostKind CostKind) {
162 assert(Ty->isIntegerTy());
163
164 unsigned BitSize = Ty->getPrimitiveSizeInBits();
165 // There is no cost model for constants with a bit size of 0. Return TCC_Free
166 // here, so that constant hoisting will ignore this constant.
167 if (BitSize == 0)
168 return TTI::TCC_Free;
169
170 // Most (all?) AArch64 intrinsics do not support folding immediates into the
171 // selected instruction, so we compute the materialization cost for the
172 // immediate directly.
173 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
174 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
175
176 switch (IID) {
177 default:
178 return TTI::TCC_Free;
179 case Intrinsic::sadd_with_overflow:
180 case Intrinsic::uadd_with_overflow:
181 case Intrinsic::ssub_with_overflow:
182 case Intrinsic::usub_with_overflow:
183 case Intrinsic::smul_with_overflow:
184 case Intrinsic::umul_with_overflow:
185 if (Idx == 1) {
186 int NumConstants = (BitSize + 63) / 64;
187 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
188 return (Cost <= NumConstants * TTI::TCC_Basic)
189 ? static_cast<int>(TTI::TCC_Free)
190 : Cost;
191 }
192 break;
193 case Intrinsic::experimental_stackmap:
194 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
195 return TTI::TCC_Free;
196 break;
197 case Intrinsic::experimental_patchpoint_void:
198 case Intrinsic::experimental_patchpoint_i64:
199 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
200 return TTI::TCC_Free;
201 break;
202 case Intrinsic::experimental_gc_statepoint:
203 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
204 return TTI::TCC_Free;
205 break;
206 }
207 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
208}
209
210TargetTransformInfo::PopcntSupportKind
211AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
212 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
213 if (TyWidth == 32 || TyWidth == 64)
214 return TTI::PSK_FastHardware;
215 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
216 return TTI::PSK_Software;
217}
218
219InstructionCost
220AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
221 TTI::TargetCostKind CostKind) {
222 auto *RetTy = ICA.getReturnType();
223 switch (ICA.getID()) {
224 case Intrinsic::umin:
225 case Intrinsic::umax:
226 case Intrinsic::smin:
227 case Intrinsic::smax: {
228 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
229 MVT::v8i16, MVT::v2i32, MVT::v4i32};
230 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
231 // v2i64 types get converted to cmp+bif hence the cost of 2
232 if (LT.second == MVT::v2i64)
233 return LT.first * 2;
234 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
235 return LT.first;
236 break;
237 }
238 case Intrinsic::sadd_sat:
239 case Intrinsic::ssub_sat:
240 case Intrinsic::uadd_sat:
241 case Intrinsic::usub_sat: {
242 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
243 MVT::v8i16, MVT::v2i32, MVT::v4i32,
244 MVT::v2i64};
245 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
246 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
247 // need to extend the type, as it uses shr(qadd(shl, shl)).
248 unsigned Instrs =
249 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
250 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
251 return LT.first * Instrs;
252 break;
253 }
254 case Intrinsic::abs: {
255 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
256 MVT::v8i16, MVT::v2i32, MVT::v4i32,
257 MVT::v2i64};
258 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
259 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
260 return LT.first;
261 break;
262 }
263 case Intrinsic::experimental_stepvector: {
264 InstructionCost Cost = 1; // Cost of the `index' instruction
265 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
266 // Legalisation of illegal vectors involves an `index' instruction plus
267 // (LT.first - 1) vector adds.
268 if (LT.first > 1) {
269 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
270 InstructionCost AddCost =
271 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
272 Cost += AddCost * (LT.first - 1);
273 }
274 return Cost;
275 }
276 case Intrinsic::bitreverse: {
277 static const CostTblEntry BitreverseTbl[] = {
278 {Intrinsic::bitreverse, MVT::i32, 1},
279 {Intrinsic::bitreverse, MVT::i64, 1},
280 {Intrinsic::bitreverse, MVT::v8i8, 1},
281 {Intrinsic::bitreverse, MVT::v16i8, 1},
282 {Intrinsic::bitreverse, MVT::v4i16, 2},
283 {Intrinsic::bitreverse, MVT::v8i16, 2},
284 {Intrinsic::bitreverse, MVT::v2i32, 2},
285 {Intrinsic::bitreverse, MVT::v4i32, 2},
286 {Intrinsic::bitreverse, MVT::v1i64, 2},
287 {Intrinsic::bitreverse, MVT::v2i64, 2},
288 };
289 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
290 const auto *Entry =
291 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
292 // The cost model uses the legal type (i32) that i8 and i16 are converted
293 // to, plus 1, so that we match the actual lowering cost.
294 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
295 TLI->getValueType(DL, RetTy, true) == MVT::i16)
296 return LegalisationCost.first * Entry->Cost + 1;
297 if (Entry)
298 return LegalisationCost.first * Entry->Cost;
299 break;
300 }
301 case Intrinsic::ctpop: {
302 static const CostTblEntry CtpopCostTbl[] = {
303 {ISD::CTPOP, MVT::v2i64, 4},
304 {ISD::CTPOP, MVT::v4i32, 3},
305 {ISD::CTPOP, MVT::v8i16, 2},
306 {ISD::CTPOP, MVT::v16i8, 1},
307 {ISD::CTPOP, MVT::i64, 4},
308 {ISD::CTPOP, MVT::v2i32, 3},
309 {ISD::CTPOP, MVT::v4i16, 2},
310 {ISD::CTPOP, MVT::v8i8, 1},
311 {ISD::CTPOP, MVT::i32, 5},
312 };
313 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
314 MVT MTy = LT.second;
315 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
316 // Extra cost of +1 when illegal vector types are legalized by promoting
317 // the integer type.
318 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
319 RetTy->getScalarSizeInBits()
320 ? 1
321 : 0;
322 return LT.first * Entry->Cost + ExtraCost;
323 }
324 break;
325 }
326 default:
327 break;
328 }
329 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
330}
331
332/// The function will remove redundant reinterprets casting in the presence
333/// of the control flow
334static Optional<Instruction *> processPhiNode(InstCombiner &IC,
335 IntrinsicInst &II) {
336 SmallVector<Instruction *, 32> Worklist;
337 auto RequiredType = II.getType();
338
339 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
340 assert(PN && "Expected Phi Node!");
341
342 // Don't create a new Phi unless we can remove the old one.
343 if (!PN->hasOneUse())
344 return None;
345
346 for (Value *IncValPhi : PN->incoming_values()) {
347 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
348 if (!Reinterpret ||
349 Reinterpret->getIntrinsicID() !=
350 Intrinsic::aarch64_sve_convert_to_svbool ||
351 RequiredType != Reinterpret->getArgOperand(0)->getType())
352 return None;
353 }
354
355 // Create the new Phi
356 LLVMContext &Ctx = PN->getContext();
357 IRBuilder<> Builder(Ctx);
358 Builder.SetInsertPoint(PN);
359 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
360 Worklist.push_back(PN);
361
362 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
363 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
364 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
365 Worklist.push_back(Reinterpret);
366 }
367
368 // Cleanup Phi Node and reinterprets
369 return IC.replaceInstUsesWith(II, NPN);
370}
371
372static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
373 IntrinsicInst &II) {
374 // If the reinterpret instruction operand is a PHI Node
375 if (isa<PHINode>(II.getArgOperand(0)))
376 return processPhiNode(IC, II);
377
378 SmallVector<Instruction *, 32> CandidatesForRemoval;
379 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
380
381 const auto *IVTy = cast<VectorType>(II.getType());
382
383 // Walk the chain of conversions.
384 while (Cursor) {
385 // If the type of the cursor has fewer lanes than the final result, zeroing
386 // must take place, which breaks the equivalence chain.
387 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
388 if (CursorVTy->getElementCount().getKnownMinValue() <
389 IVTy->getElementCount().getKnownMinValue())
390 break;
391
392 // If the cursor has the same type as I, it is a viable replacement.
393 if (Cursor->getType() == IVTy)
394 EarliestReplacement = Cursor;
395
396 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
397
398 // If this is not an SVE conversion intrinsic, this is the end of the chain.
399 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
400 Intrinsic::aarch64_sve_convert_to_svbool ||
401 IntrinsicCursor->getIntrinsicID() ==
402 Intrinsic::aarch64_sve_convert_from_svbool))
403 break;
404
405 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
406 Cursor = IntrinsicCursor->getOperand(0);
407 }
408
409 // If no viable replacement in the conversion chain was found, there is
410 // nothing to do.
411 if (!EarliestReplacement)
412 return None;
413
414 return IC.replaceInstUsesWith(II, EarliestReplacement);
415}
416
417static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
418 IntrinsicInst &II) {
419 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
420 if (!Pg)
421 return None;
422
423 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
424 return None;
425
426 const auto PTruePattern =
427 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
428 if (PTruePattern != AArch64SVEPredPattern::vl1)
429 return None;
430
431 // The intrinsic is inserting into lane zero so use an insert instead.
432 auto *IdxTy = Type::getInt64Ty(II.getContext());
433 auto *Insert = InsertElementInst::Create(
434 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
435 Insert->insertBefore(&II);
436 Insert->takeName(&II);
437
438 return IC.replaceInstUsesWith(II, Insert);
439}
440
441static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
442 IntrinsicInst &II) {
443 // Replace DupX with a regular IR splat.
444 IRBuilder<> Builder(II.getContext());
445 Builder.SetInsertPoint(&II);
446 auto *RetTy = cast<ScalableVectorType>(II.getType());
447 Value *Splat =
448 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
449 Splat->takeName(&II);
450 return IC.replaceInstUsesWith(II, Splat);
451}
452
453static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
454 IntrinsicInst &II) {
455 LLVMContext &Ctx = II.getContext();
456 IRBuilder<> Builder(Ctx);
457 Builder.SetInsertPoint(&II);
458
459 // Check that the predicate is all active
460 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
461 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
462 return None;
463
464 const auto PTruePattern =
465 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
466 if (PTruePattern != AArch64SVEPredPattern::all)
467 return None;
468
469 // Check that we have a compare of zero..
470 auto *SplatValue =
471 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
472 if (!SplatValue || !SplatValue->isZero())
473 return None;
474
475 // ..against a dupq
476 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
477 if (!DupQLane ||
478 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
479 return None;
480
481 // Where the dupq is a lane 0 replicate of a vector insert
482 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
483 return None;
484
485 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
486 if (!VecIns ||
487 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
488 return None;
489
490 // Where the vector insert is a fixed constant vector insert into undef at
491 // index zero
492 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
493 return None;
494
495 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
496 return None;
497
498 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
499 if (!ConstVec)
500 return None;
501
502 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
503 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
504 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
505 return None;
506
507 unsigned NumElts = VecTy->getNumElements();
508 unsigned PredicateBits = 0;
509
510 // Expand intrinsic operands to a 16-bit byte level predicate
511 for (unsigned I = 0; I < NumElts; ++I) {
512 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
513 if (!Arg)
514 return None;
515 if (!Arg->isZero())
516 PredicateBits |= 1 << (I * (16 / NumElts));
517 }
518
519 // If all bits are zero bail early with an empty predicate
520 if (PredicateBits == 0) {
521 auto *PFalse = Constant::getNullValue(II.getType());
522 PFalse->takeName(&II);
523 return IC.replaceInstUsesWith(II, PFalse);
524 }
525
526 // Calculate largest predicate type used (where byte predicate is largest)
527 unsigned Mask = 8;
528 for (unsigned I = 0; I < 16; ++I)
529 if ((PredicateBits & (1 << I)) != 0)
530 Mask |= (I % 8);
531
532 unsigned PredSize = Mask & -Mask;
533 auto *PredType = ScalableVectorType::get(
534 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
535
536 // Ensure all relevant bits are set
537 for (unsigned I = 0; I < 16; I += PredSize)
538 if ((PredicateBits & (1 << I)) == 0)
539 return None;
540
541 auto *PTruePat =
542 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
543 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
544 {PredType}, {PTruePat});
545 auto *ConvertToSVBool = Builder.CreateIntrinsic(
546 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
547 auto *ConvertFromSVBool =
548 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
549 {II.getType()}, {ConvertToSVBool});
550
551 ConvertFromSVBool->takeName(&II);
552 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
553}
554
555static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
556 IntrinsicInst &II) {
557 IRBuilder<> Builder(II.getContext());
558 Builder.SetInsertPoint(&II);
559 Value *Pg = II.getArgOperand(0);
560 Value *Vec = II.getArgOperand(1);
561 auto IntrinsicID = II.getIntrinsicID();
562 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
563
564 // lastX(splat(X)) --> X
565 if (auto *SplatVal = getSplatValue(Vec))
566 return IC.replaceInstUsesWith(II, SplatVal);
567
568 // If x and/or y is a splat value then:
569 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
570 Value *LHS, *RHS;
571 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
572 if (isSplatValue(LHS) || isSplatValue(RHS)) {
573 auto *OldBinOp = cast<BinaryOperator>(Vec);
574 auto OpC = OldBinOp->getOpcode();
575 auto *NewLHS =
576 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
577 auto *NewRHS =
578 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
579 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
580 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
581 return IC.replaceInstUsesWith(II, NewBinOp);
582 }
583 }
584
585 auto *C = dyn_cast<Constant>(Pg);
586 if (IsAfter && C && C->isNullValue()) {
587 // The intrinsic is extracting lane 0 so use an extract instead.
588 auto *IdxTy = Type::getInt64Ty(II.getContext());
589 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
590 Extract->insertBefore(&II);
591 Extract->takeName(&II);
592 return IC.replaceInstUsesWith(II, Extract);
593 }
594
595 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
596 if (!IntrPG)
597 return None;
598
599 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
600 return None;
601
602 const auto PTruePattern =
603 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
604
605 // Can the intrinsic's predicate be converted to a known constant index?
606 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
607 if (!MinNumElts)
608 return None;
609
610 unsigned Idx = MinNumElts - 1;
611 // Increment the index if extracting the element after the last active
612 // predicate element.
613 if (IsAfter)
614 ++Idx;
615
616 // Ignore extracts whose index is larger than the known minimum vector
617 // length. NOTE: This is an artificial constraint where we prefer to
618 // maintain what the user asked for until an alternative is proven faster.
619 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
620 if (Idx >= PgVTy->getMinNumElements())
621 return None;
622
623 // The intrinsic is extracting a fixed lane so use an extract instead.
624 auto *IdxTy = Type::getInt64Ty(II.getContext());
625 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
626 Extract->insertBefore(&II);
627 Extract->takeName(&II);
628 return IC.replaceInstUsesWith(II, Extract);
629}
630
631static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
632 IntrinsicInst &II) {
633 LLVMContext &Ctx = II.getContext();
634 IRBuilder<> Builder(Ctx);
635 Builder.SetInsertPoint(&II);
636 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
637 // can work with RDFFR_PP for ptest elimination.
638 auto *AllPat =
639 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
640 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
641 {II.getType()}, {AllPat});
642 auto *RDFFR =
643 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
644 RDFFR->takeName(&II);
645 return IC.replaceInstUsesWith(II, RDFFR);
646}
647
648static Optional<Instruction *>
649instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
650 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
651
652 if (Pattern == AArch64SVEPredPattern::all) {
653 LLVMContext &Ctx = II.getContext();
654 IRBuilder<> Builder(Ctx);
655 Builder.SetInsertPoint(&II);
656
657 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
658 auto *VScale = Builder.CreateVScale(StepVal);
659 VScale->takeName(&II);
660 return IC.replaceInstUsesWith(II, VScale);
661 }
662
663 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
664
665 return MinNumElts && NumElts >= MinNumElts
666 ? Optional<Instruction *>(IC.replaceInstUsesWith(
667 II, ConstantInt::get(II.getType(), MinNumElts)))
668 : None;
669}
670
671static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
672 IntrinsicInst &II) {
673 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
674 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
675
676 if (Op1 && Op2 &&
677 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
678 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
679 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
680
681 IRBuilder<> Builder(II.getContext());
682 Builder.SetInsertPoint(&II);
683
684 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
685 Type *Tys[] = {Op1->getArgOperand(0)->getType()};
686
687 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
688
689 PTest->takeName(&II);
690 return IC.replaceInstUsesWith(II, PTest);
691 }
692
693 return None;
694}
695
696static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
697 IntrinsicInst &II) {
698 auto *OpPredicate = II.getOperand(0);
699 auto *OpMultiplicand = II.getOperand(1);
700 auto *OpMultiplier = II.getOperand(2);
701
702 IRBuilder<> Builder(II.getContext());
703 Builder.SetInsertPoint(&II);
704
705 // Return true if a given instruction is a unit splat value, false otherwise.
706 auto IsUnitSplat = [](auto *I) {
707 auto *SplatValue = getSplatValue(I);
708 if (!SplatValue)
709 return false;
710 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
711 };
712
713 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
714 // with a unit splat value, false otherwise.
715 auto IsUnitDup = [](auto *I) {
716 auto *IntrI = dyn_cast<IntrinsicInst>(I);
717 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
718 return false;
719
720 auto *SplatValue = IntrI->getOperand(2);
721 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
722 };
723
724 // The OpMultiplier variable should always point to the dup (if any), so
725 // swap if necessary.
726 if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand))
727 std::swap(OpMultiplier, OpMultiplicand);
728
729 if (IsUnitSplat(OpMultiplier)) {
730 // [f]mul pg (dupx 1) %n => %n
731 OpMultiplicand->takeName(&II);
732 return IC.replaceInstUsesWith(II, OpMultiplicand);
733 } else if (IsUnitDup(OpMultiplier)) {
734 // [f]mul pg (dup pg 1) %n => %n
735 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
736 auto *DupPg = DupInst->getOperand(1);
737 // TODO: this is naive. The optimization is still valid if DupPg
738 // 'encompasses' OpPredicate, not only if they're the same predicate.
739 if (OpPredicate == DupPg) {
740 OpMultiplicand->takeName(&II);
741 return IC.replaceInstUsesWith(II, OpMultiplicand);
742 }
743 }
744
745 return None;
746}
747
748static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
749 IntrinsicInst &II) {
750 IRBuilder<> Builder(II.getContext());
751 Builder.SetInsertPoint(&II);
752 Value *UnpackArg = II.getArgOperand(0);
753 auto *RetTy = cast<ScalableVectorType>(II.getType());
754 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
755 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
756
757 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
758 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
759 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
760 ScalarArg =
761 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
762 Value *NewVal =
763 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
764 NewVal->takeName(&II);
765 return IC.replaceInstUsesWith(II, NewVal);
766 }
767
768 return None;
769}
770static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
771 IntrinsicInst &II) {
772 auto *OpVal = II.getOperand(0);
773 auto *OpIndices = II.getOperand(1);
774 VectorType *VTy = cast<VectorType>(II.getType());
775
776 // Check whether OpIndices is a constant splat value < minimal element count
777 // of result.
778 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
779 if (!SplatValue ||
780 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
781 return None;
782
783 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
784 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
785 IRBuilder<> Builder(II.getContext());
786 Builder.SetInsertPoint(&II);
787 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
788 auto *VectorSplat =
789 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
790
791 VectorSplat->takeName(&II);
792 return IC.replaceInstUsesWith(II, VectorSplat);
793}
794
795static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
796 IntrinsicInst &II) {
797 // Try to remove sequences of tuple get/set.
798 Value *SetTuple, *SetIndex, *SetValue;
799 auto *GetTuple = II.getArgOperand(0);
800 auto *GetIndex = II.getArgOperand(1);
801 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
802 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
803 // Make sure that the types of the current intrinsic and SetValue match
804 // in order to safely remove the sequence.
805 if (!match(GetTuple,
806 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
807 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
808 SetValue->getType() != II.getType())
809 return None;
810 // Case where we get the same index right after setting it.
811 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
812 if (GetIndex == SetIndex)
813 return IC.replaceInstUsesWith(II, SetValue);
814 // If we are getting a different index than what was set in the tuple_set
815 // intrinsic. We can just set the input tuple to the one up in the chain.
816 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
817 // --> tuple_get(SetTuple, GetIndex)
818 return IC.replaceOperand(II, 0, SetTuple);
819}
820
821static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
822 IntrinsicInst &II) {
823 // zip1(uzp1(A, B), uzp2(A, B)) --> A
824 // zip2(uzp1(A, B), uzp2(A, B)) --> B
825 Value *A, *B;
826 if (match(II.getArgOperand(0),
827 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
828 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
829 m_Specific(A), m_Specific(B))))
830 return IC.replaceInstUsesWith(
831 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
832
833 return None;
834}
835
836Optional<Instruction *>
837AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
838 IntrinsicInst &II) const {
839 Intrinsic::ID IID = II.getIntrinsicID();
840 switch (IID) {
841 default:
842 break;
843 case Intrinsic::aarch64_sve_convert_from_svbool:
844 return instCombineConvertFromSVBool(IC, II);
845 case Intrinsic::aarch64_sve_dup:
846 return instCombineSVEDup(IC, II);
847 case Intrinsic::aarch64_sve_dup_x:
848 return instCombineSVEDupX(IC, II);
849 case Intrinsic::aarch64_sve_cmpne:
850 case Intrinsic::aarch64_sve_cmpne_wide:
851 return instCombineSVECmpNE(IC, II);
852 case Intrinsic::aarch64_sve_rdffr:
853 return instCombineRDFFR(IC, II);
854 case Intrinsic::aarch64_sve_lasta:
855 case Intrinsic::aarch64_sve_lastb:
856 return instCombineSVELast(IC, II);
857 case Intrinsic::aarch64_sve_cntd:
858 return instCombineSVECntElts(IC, II, 2);
859 case Intrinsic::aarch64_sve_cntw:
860 return instCombineSVECntElts(IC, II, 4);
861 case Intrinsic::aarch64_sve_cnth:
862 return instCombineSVECntElts(IC, II, 8);
863 case Intrinsic::aarch64_sve_cntb:
864 return instCombineSVECntElts(IC, II, 16);
865 case Intrinsic::aarch64_sve_ptest_any:
866 case Intrinsic::aarch64_sve_ptest_first:
867 case Intrinsic::aarch64_sve_ptest_last:
868 return instCombineSVEPTest(IC, II);
869 case Intrinsic::aarch64_sve_mul:
870 case Intrinsic::aarch64_sve_fmul:
871 return instCombineSVEVectorMul(IC, II);
872 case Intrinsic::aarch64_sve_tbl:
873 return instCombineSVETBL(IC, II);
874 case Intrinsic::aarch64_sve_uunpkhi:
875 case Intrinsic::aarch64_sve_uunpklo:
876 case Intrinsic::aarch64_sve_sunpkhi:
877 case Intrinsic::aarch64_sve_sunpklo:
878 return instCombineSVEUnpack(IC, II);
879 case Intrinsic::aarch64_sve_tuple_get:
880 return instCombineSVETupleGet(IC, II);
881 case Intrinsic::aarch64_sve_zip1:
882 case Intrinsic::aarch64_sve_zip2:
883 return instCombineSVEZip(IC, II);
884 }
885
886 return None;
887}
888
889bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
890 ArrayRef<const Value *> Args) {
891
892 // A helper that returns a vector type from the given type. The number of
893 // elements in type Ty determine the vector width.
894 auto toVectorTy = [&](Type *ArgTy) {
895 return VectorType::get(ArgTy->getScalarType(),
896 cast<VectorType>(DstTy)->getElementCount());
897 };
898
899 // Exit early if DstTy is not a vector type whose elements are at least
900 // 16-bits wide.
901 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
902 return false;
903
904 // Determine if the operation has a widening variant. We consider both the
905 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
906 // instructions.
907 //
908 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
909 // verify that their extending operands are eliminated during code
910 // generation.
911 switch (Opcode) {
912 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
913 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
914 break;
915 default:
916 return false;
917 }
918
919 // To be a widening instruction (either the "wide" or "long" versions), the
920 // second operand must be a sign- or zero extend having a single user. We
921 // only consider extends having a single user because they may otherwise not
922 // be eliminated.
923 if (Args.size() != 2 ||
924 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
925 !Args[1]->hasOneUse())
926 return false;
927 auto *Extend = cast<CastInst>(Args[1]);
928
929 // Legalize the destination type and ensure it can be used in a widening
930 // operation.
931 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
932 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
933 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
934 return false;
935
936 // Legalize the source type and ensure it can be used in a widening
937 // operation.
938 auto *SrcTy = toVectorTy(Extend->getSrcTy());
939 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
940 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
941 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
942 return false;
943
944 // Get the total number of vector elements in the legalized types.
945 InstructionCost NumDstEls =
946 DstTyL.first * DstTyL.second.getVectorMinNumElements();
947 InstructionCost NumSrcEls =
948 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
949
950 // Return true if the legalized types have the same number of vector elements
951 // and the destination element type size is twice that of the source type.
952 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
953}
954
955InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
956 Type *Src,
957 TTI::CastContextHint CCH,
958 TTI::TargetCostKind CostKind,
959 const Instruction *I) {
960 int ISD = TLI->InstructionOpcodeToISD(Opcode);
961 assert(ISD && "Invalid opcode");
962
963 // If the cast is observable, and it is used by a widening instruction (e.g.,
964 // uaddl, saddw, etc.), it may be free.
965 if (I && I->hasOneUse()) {
966 auto *SingleUser = cast<Instruction>(*I->user_begin());
967 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
968 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
969 // If the cast is the second operand, it is free. We will generate either
970 // a "wide" or "long" version of the widening instruction.
971 if (I == SingleUser->getOperand(1))
972 return 0;
973 // If the cast is not the second operand, it will be free if it looks the
974 // same as the second operand. In this case, we will generate a "long"
975 // version of the widening instruction.
976 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
977 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
978 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
979 return 0;
980 }
981 }
982
983 // TODO: Allow non-throughput costs that aren't binary.
984 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
985 if (CostKind != TTI::TCK_RecipThroughput)
986 return Cost == 0 ? 0 : 1;
987 return Cost;
988 };
989
990 EVT SrcTy = TLI->getValueType(DL, Src);
991 EVT DstTy = TLI->getValueType(DL, Dst);
992
993 if (!SrcTy.isSimple() || !DstTy.isSimple())
994 return AdjustCost(
995 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
996
997 static const TypeConversionCostTblEntry
998 ConversionTbl[] = {
999 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1000 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
1001 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1002 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
1003
1004 // Truncations on nxvmiN
1005 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1006 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1007 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1008 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1009 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1010 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1011 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1012 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1013 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1014 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1015 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1016 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1017 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1018 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1019 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1020 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1021
1022 // The number of shll instructions for the extension.
1023 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1024 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1025 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1026 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1027 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1028 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1029 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1030 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1031 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1032 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1033 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1034 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1035 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1036 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1037 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1038 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1039
1040 // LowerVectorINT_TO_FP:
1041 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1042 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1043 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1044 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1045 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1046 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1047
1048 // Complex: to v2f32
1049 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1050 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1051 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1052 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1053 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1054 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1055
1056 // Complex: to v4f32
1057 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1058 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1060 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1061
1062 // Complex: to v8f32
1063 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1064 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1065 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1066 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1067
1068 // Complex: to v16f32
1069 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1070 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1071
1072 // Complex: to v2f64
1073 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1074 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1075 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1076 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1077 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1078 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1079
1080
1081 // LowerVectorFP_TO_INT
1082 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1083 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1084 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1085 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1086 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1087 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1088
1089 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1090 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1091 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1092 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1093 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1094 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1095 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1096
1097 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1098 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1099 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1100 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1101 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1102
1103 // Complex, from nxv2f32.
1104 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1105 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1106 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1107 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1108 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1109 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1110 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1111 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1112
1113 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1114 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1115 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1116 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1117 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1118 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1119 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1120
1121 // Complex, from nxv2f64.
1122 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1123 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1124 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1125 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1126 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1127 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1128 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1129 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1130
1131 // Complex, from nxv4f32.
1132 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1133 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1134 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1135 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1136 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1137 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1138 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1139 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1140
1141 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1142 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1143 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1144 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1145 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1146
1147 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1148 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1149 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1150 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1151 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1152 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1153 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1154
1155 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1156 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1157 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1158 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1159 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1160
1161 // Complex, from nxv8f16.
1162 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1163 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1164 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1165 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1166 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1167 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1168 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1169 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1170
1171 // Complex, from nxv4f16.
1172 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1173 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1174 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1175 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1176 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1177 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1178 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1179 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1180
1181 // Complex, from nxv2f16.
1182 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1183 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1184 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1185 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1186 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1187 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1188 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1189 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1190
1191 // Truncate from nxvmf32 to nxvmf16.
1192 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
1193 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
1194 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
1195
1196 // Truncate from nxvmf64 to nxvmf16.
1197 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
1198 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
1199 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
1200
1201 // Truncate from nxvmf64 to nxvmf32.
1202 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
1203 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
1204 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
1205
1206 // Extend from nxvmf16 to nxvmf32.
1207 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
1208 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
1209 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
1210
1211 // Extend from nxvmf16 to nxvmf64.
1212 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
1213 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
1214 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
1215
1216 // Extend from nxvmf32 to nxvmf64.
1217 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
1218 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
1219 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
1220
1221 };
1222
1223 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1224 DstTy.getSimpleVT(),
1225 SrcTy.getSimpleVT()))
1226 return AdjustCost(Entry->Cost);
1227
1228 return AdjustCost(
1229 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1230}
1231
1232InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1233 Type *Dst,
1234 VectorType *VecTy,
1235 unsigned Index) {
1236
1237 // Make sure we were given a valid extend opcode.
1238 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1239 "Invalid opcode");
1240
1241 // We are extending an element we extract from a vector, so the source type
1242 // of the extend is the element type of the vector.
1243 auto *Src = VecTy->getElementType();
1244
1245 // Sign- and zero-extends are for integer types only.
1246 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1247
1248 // Get the cost for the extract. We compute the cost (if any) for the extend
1249 // below.
1250 InstructionCost Cost =
1251 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1252
1253 // Legalize the types.
1254 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1255 auto DstVT = TLI->getValueType(DL, Dst);
1256 auto SrcVT = TLI->getValueType(DL, Src);
1257 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1258
1259 // If the resulting type is still a vector and the destination type is legal,
1260 // we may get the extension for free. If not, get the default cost for the
1261 // extend.
1262 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1263 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1264 CostKind);
1265
1266 // The destination type should be larger than the element type. If not, get
1267 // the default cost for the extend.
1268 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1269 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1270 CostKind);
1271
1272 switch (Opcode) {
1273 default:
1274 llvm_unreachable("Opcode should be either SExt or ZExt");
1275
1276 // For sign-extends, we only need a smov, which performs the extension
1277 // automatically.
1278 case Instruction::SExt:
1279 return Cost;
1280
1281 // For zero-extends, the extend is performed automatically by a umov unless
1282 // the destination type is i64 and the element type is i8 or i16.
1283 case Instruction::ZExt:
1284 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1285 return Cost;
1286 }
1287
1288 // If we are unable to perform the extend for free, get the default cost.
1289 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1290 CostKind);
1291}
1292
1293InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1294 TTI::TargetCostKind CostKind,
1295 const Instruction *I) {
1296 if (CostKind != TTI::TCK_RecipThroughput)
1297 return Opcode == Instruction::PHI ? 0 : 1;
1298 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1299 // Branches are assumed to be predicted.
1300 return 0;
1301}
1302
1303InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1304 unsigned Index) {
1305 assert(Val->isVectorTy() && "This must be a vector type");
1306
1307 if (Index != -1U) {
1308 // Legalize the type.
1309 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1310
1311 // This type is legalized to a scalar type.
1312 if (!LT.second.isVector())
1313 return 0;
1314
1315 // The type may be split. Normalize the index to the new type.
1316 unsigned Width = LT.second.getVectorNumElements();
1317 Index = Index % Width;
1318
1319 // The element at index zero is already inside the vector.
1320 if (Index == 0)
1321 return 0;
1322 }
1323
1324 // All other insert/extracts cost this much.
1325 return ST->getVectorInsertExtractBaseCost();
1326}
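A brief illustration of the index normalization above (my example, not from the report): on a NEON target a <8 x i32> typically legalizes to two v4i32 halves, so Width is 4. Extracting lane 5 normalizes to 5 % 4 == 1 and is charged the subtarget's base insert/extract cost, while lane 4 normalizes to 0 and is treated as free, presumably because lane 0 of a legalized vector already overlaps the corresponding scalar register, matching the comment above.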
1327
1328InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1329 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1330 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1331 TTI::OperandValueProperties Opd1PropInfo,
1332 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1333 const Instruction *CxtI) {
1334 // TODO: Handle more cost kinds.
1335 if (CostKind != TTI::TCK_RecipThroughput)
1336 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1337 Opd2Info, Opd1PropInfo,
1338 Opd2PropInfo, Args, CxtI);
1339
1340 // Legalize the type.
1341 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1342
1343 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1344 // add in the widening overhead specified by the sub-target. Since the
1345 // extends feeding widening instructions are performed automatically, they
1346 // aren't present in the generated code and have a zero cost. By adding a
1347 // widening overhead here, we attach the total cost of the combined operation
1348 // to the widening instruction.
1349 InstructionCost Cost = 0;
1350 if (isWideningInstruction(Ty, Opcode, Args))
1351 Cost += ST->getWideningBaseCost();
1352
1353 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1354
1355 switch (ISD) {
1356 default:
1357 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1358 Opd2Info,
1359 Opd1PropInfo, Opd2PropInfo);
1360 case ISD::SDIV:
1361 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1362 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1363 // On AArch64, scalar signed division by a power-of-two constant is
1364 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1365 // The OperandValue properties may not be the same as those of the
1366 // previous operation; conservatively assume OP_None.
1367 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1368 Opd1Info, Opd2Info,
1369 TargetTransformInfo::OP_None,
1370 TargetTransformInfo::OP_None);
1371 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1372 Opd1Info, Opd2Info,
1373 TargetTransformInfo::OP_None,
1374 TargetTransformInfo::OP_None);
1375 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1376 Opd1Info, Opd2Info,
1377 TargetTransformInfo::OP_None,
1378 TargetTransformInfo::OP_None);
1379 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1380 Opd1Info, Opd2Info,
1381 TargetTransformInfo::OP_None,
1382 TargetTransformInfo::OP_None);
1383 return Cost;
1384 }
1385 LLVM_FALLTHROUGH;
1386 case ISD::UDIV:
1387 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1388 auto VT = TLI->getValueType(DL, Ty);
1389 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1390 // Vector signed division by a constant is expanded to the
1391 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1392 // to MULHS + SUB + SRL + ADD + SRL.
1393 InstructionCost MulCost = getArithmeticInstrCost(
1394 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1395 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1396 InstructionCost AddCost = getArithmeticInstrCost(
1397 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1398 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1399 InstructionCost ShrCost = getArithmeticInstrCost(
1400 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1401 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1402 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1403 }
1404 }
1405
1406 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1407 Opd2Info,
1408 Opd1PropInfo, Opd2PropInfo);
1409 if (Ty->isVectorTy()) {
1410 // On AArch64, vector divisions are not supported natively and are
1411 // expanded into scalar divisions of each pair of elements.
1412 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1413 Opd1Info, Opd2Info, Opd1PropInfo,
1414 Opd2PropInfo);
1415 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1416 Opd1Info, Opd2Info, Opd1PropInfo,
1417 Opd2PropInfo);
1418 // TODO: if one of the arguments is scalar, then it's not necessary to
1419 // double the cost of handling the vector elements.
1420 Cost += Cost;
1421 }
1422 return Cost;
1423
1424 case ISD::MUL:
1425 if (LT.second != MVT::v2i64)
1426 return (Cost + 1) * LT.first;
1427 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1428 // as elements are extracted from the vectors and the muls scalarized.
1429 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1430 // cost for an i64 vector directly here, which is:
1431 // - four i64 extracts,
1432 // - two i64 inserts, and
1433 // - two muls.
1434 // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
1435 // LT.first = 2 the cost is 16.
1436 return LT.first * 8;
1437 case ISD::ADD:
1438 case ISD::XOR:
1439 case ISD::OR:
1440 case ISD::AND:
1441 // These nodes are marked as 'custom' for combining purposes only.
1442 // We know that they are legal. See LowerAdd in ISelLowering.
1443 return (Cost + 1) * LT.first;
1444
1445 case ISD::FADD:
1446 case ISD::FSUB:
1447 case ISD::FMUL:
1448 case ISD::FDIV:
1449 case ISD::FNEG:
1450 // These nodes are marked as 'custom' just to lower them to SVE.
1451 // We know said lowering will incur no additional cost.
1452 if (!Ty->getScalarType()->isFP128Ty())
1453 return (Cost + 2) * LT.first;
1454
1455 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1456 Opd2Info,
1457 Opd1PropInfo, Opd2PropInfo);
1458 }
1459}
1460
1461InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1462 ScalarEvolution *SE,
1463 const SCEV *Ptr) {
1464 // Address computations in vectorized code with non-consecutive addresses will
1465 // likely result in more instructions compared to scalar code where the
1466 // computation can more often be merged into the index mode. The resulting
1467 // extra micro-ops can significantly decrease throughput.
1468 unsigned NumVectorInstToHideOverhead = 10;
1469 int MaxMergeDistance = 64;
1470
1471 if (Ty->isVectorTy() && SE &&
1472 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1473 return NumVectorInstToHideOverhead;
1474
1475 // In many cases the address computation is not merged into the instruction
1476 // addressing mode.
1477 return 1;
1478}
1479
1480InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1481 Type *CondTy,
1482 CmpInst::Predicate VecPred,
1483 TTI::TargetCostKind CostKind,
1484 const Instruction *I) {
1485 // TODO: Handle other cost kinds.
1486 if (CostKind != TTI::TCK_RecipThroughput)
1487 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1488 I);
1489
1490 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1491 // We don't lower vector selects that are wider than the register width
1492 // very well.
1493 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1494 // We would need this many instructions to hide the scalarization happening.
1495 const int AmortizationCost = 20;
1496
1497 // If VecPred is not set, check if we can get a predicate from the context
1498 // instruction, if its type matches the requested ValTy.
1499 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1500 CmpInst::Predicate CurrentPred;
1501 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1502 m_Value())))
1503 VecPred = CurrentPred;
1504 }
1505 // Check if we have a compare/select chain that can be lowered using CMxx &
1506 // BFI pair.
1507 if (CmpInst::isIntPredicate(VecPred)) {
1508 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
1509 MVT::v8i16, MVT::v2i32, MVT::v4i32,
1510 MVT::v2i64};
1511 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1512 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
1513 return LT.first;
1514 }
1515
1516 static const TypeConversionCostTblEntry
1517 VectorSelectTbl[] = {
1518 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1519 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1520 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1521 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1522 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1523 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1524 };
1525
1526 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1527 EVT SelValTy = TLI->getValueType(DL, ValTy);
1528 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1529 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1530 SelCondTy.getSimpleVT(),
1531 SelValTy.getSimpleVT()))
1532 return Entry->Cost;
1533 }
1534 }
1535 // The base case handles scalable vectors fine for now, since it treats the
1536 // cost as 1 * legalization cost.
1537 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1538}
1539
1540AArch64TTIImpl::TTI::MemCmpExpansionOptions
1541AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1542 TTI::MemCmpExpansionOptions Options;
1543 if (ST->requiresStrictAlign()) {
1544 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1545 // a bunch of instructions when strict align is enabled.
1546 return Options;
1547 }
1548 Options.AllowOverlappingLoads = true;
1549 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1550 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1551 // TODO: Though vector loads usually perform well on AArch64, in some targets
1552 // they may wake up the FP unit, which raises the power consumption. Perhaps
1553 // they could be used with no holds barred (-O3).
1554 Options.LoadSizes = {8, 4, 2, 1};
1555 return Options;
1556}
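For orientation (an illustrative scenario, not something stated in the report): with LoadSizes of {8, 4, 2, 1} and AllowOverlappingLoads set, the generic MemCmpExpansion pass can expand a 15-byte memcmp as two 8-byte loads per buffer, the second pair overlapping the first by one byte, rather than as an 8 + 4 + 2 + 1 chain of loads.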
1557
1558InstructionCost
1559AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1560 Align Alignment, unsigned AddressSpace,
1561 TTI::TargetCostKind CostKind) {
1562 if (!isa<ScalableVectorType>(Src))
1563 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1564 CostKind);
1565 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1566 if (!LT.first.isValid())
1567 return InstructionCost::getInvalid();
1568
1569 // The code-generator is currently not able to handle scalable vectors
1570 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1571 // it. This change will be removed when code-generation for these types is
1572 // sufficiently reliable.
1573 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1574 return InstructionCost::getInvalid();
1575
1576 return LT.first * 2;
1577}
1578
1579InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1580 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1581 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1582 if (useNeonVector(DataTy))
1583 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1584 Alignment, CostKind, I);
1585 auto *VT = cast<VectorType>(DataTy);
1586 auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1587 if (!LT.first.isValid())
1588 return InstructionCost::getInvalid();
1589
1590 // The code-generator is currently not able to handle scalable vectors
1591 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1592 // it. This change will be removed when code-generation for these types is
1593 // sufficiently reliable.
1594 if (cast<VectorType>(DataTy)->getElementCount() ==
1595 ElementCount::getScalable(1))
1596 return InstructionCost::getInvalid();
1597
1598 ElementCount LegalVF = LT.second.getVectorElementCount();
1599 InstructionCost MemOpCost =
1600 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1601 return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction());
1602}
1603
1604bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1605 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1606}
1607
1608InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
1609 MaybeAlign Alignment,
1610 unsigned AddressSpace,
1611 TTI::TargetCostKind CostKind,
1612 const Instruction *I) {
1613 EVT VT = TLI->getValueType(DL, Ty, true);
1614 // Type legalization can't handle structs
1615 if (VT == MVT::Other)
1616 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
1617 CostKind);
1618
1619 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
1620 if (!LT.first.isValid())
1621 return InstructionCost::getInvalid();
1622
1623 // The code-generator is currently not able to handle scalable vectors
1624 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1625 // it. This change will be removed when code-generation for these types is
1626 // sufficiently reliable.
1627 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
1628 if (VTy->getElementCount() == ElementCount::getScalable(1))
1629 return InstructionCost::getInvalid();
1630
1631 // TODO: consider latency as well for TCK_SizeAndLatency.
1632 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1633 return LT.first;
1634
1635 if (CostKind != TTI::TCK_RecipThroughput)
1636 return 1;
1637
1638 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
1639 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
1640 // Unaligned stores are extremely inefficient. We don't split all
1641 // unaligned 128-bit stores because of the negative impact that has been
1642 // shown in practice on inlined block copy code.
1643 // We make such stores expensive so that we will only vectorize if there
1644 // are 6 other instructions getting vectorized.
1645 const int AmortizationCost = 6;
1646
1647 return LT.first * 2 * AmortizationCost;
1648 }
1649
1650 // Check truncating stores and extending loads.
1651 if (useNeonVector(Ty) &&
1652 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1653 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
1654 if (VT == MVT::v4i8)
1655 return 2;
1656 // Otherwise we need to scalarize.
1657 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
1658 }
1659
1660 return LT.first;
1661}
1662
1663InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1664 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1665 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1666 bool UseMaskForCond, bool UseMaskForGaps) {
1667 assert(Factor >= 2 && "Invalid interleave factor");
1668 auto *VecVTy = cast<FixedVectorType>(VecTy);
1669
1670 if (!UseMaskForCond && !UseMaskForGaps &&
1671 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1672 unsigned NumElts = VecVTy->getNumElements();
1673 auto *SubVecTy =
1674 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1675
1676 // ldN/stN only support legal vector types of size 64 or 128 in bits.
1677 // Accesses having vector types that are a multiple of 128 bits can be
1678 // matched to more than one ldN/stN instruction.
1679 if (NumElts % Factor == 0 &&
1680 TLI->isLegalInterleavedAccessType(SubVecTy, DL))
1681 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1682 }
1683
1684 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1685 Alignment, AddressSpace, CostKind,
1686 UseMaskForCond, UseMaskForGaps);
1687}
1688
1689InstructionCost
1690AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
1691 InstructionCost Cost = 0;
1692 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1693 for (auto *I : Tys) {
1694 if (!I->isVectorTy())
1695 continue;
1696 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
1697 128)
1698 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
1699 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
1700 }
1701 return Cost;
1702}
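A short note on the intent here (my reading, not stated in the report): for every 128-bit vector type live across a call, the model charges one store plus one load, roughly the cost of spilling a Q register before the call and refilling it afterwards; two live v4f32 values would therefore add 2 * (store cost + load cost).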
1703
1704unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1705 return ST->getMaxInterleaveFactor();
1706}
1707
1708// For Falkor, we want to avoid having too many strided loads in a loop since
1709// that can exhaust the HW prefetcher resources. We adjust the unroller
1710// MaxCount preference below to attempt to ensure unrolling doesn't create too
1711// many strided loads.
1712static void
1713getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1714 TargetTransformInfo::UnrollingPreferences &UP) {
1715 enum { MaxStridedLoads = 7 };
1716 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
1717 int StridedLoads = 0;
1718 // FIXME? We could make this more precise by looking at the CFG and
1719 // e.g. not counting loads in each side of an if-then-else diamond.
1720 for (const auto BB : L->blocks()) {
1721 for (auto &I : *BB) {
1722 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
1723 if (!LMemI)
1724 continue;
1725
1726 Value *PtrValue = LMemI->getPointerOperand();
1727 if (L->isLoopInvariant(PtrValue))
1728 continue;
1729
1730 const SCEV *LSCEV = SE.getSCEV(PtrValue);
1731 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
1732 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
1733 continue;
1734
1735 // FIXME? We could take pairing of unrolled load copies into account
1736 // by looking at the AddRec, but we would probably have to limit this
1737 // to loops with no stores or other memory optimization barriers.
1738 ++StridedLoads;
1739 // We've seen enough strided loads that seeing more won't make a
1740 // difference.
1741 if (StridedLoads > MaxStridedLoads / 2)
1742 return StridedLoads;
1743 }
1744 }
1745 return StridedLoads;
1746 };
1747
1748 int StridedLoads = countStridedLoads(L, SE);
1749 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
7
Assuming 'DebugFlag' is false
8
Loop condition is false.  Exiting loop
1750                   << " strided loads\n");
1751 // Pick the largest power of 2 unroll count that won't result in too many
1752 // strided loads.
1753 if (StridedLoads) {
9
Assuming 'StridedLoads' is not equal to 0
10
Taking true branch
1754 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
11
Calling 'Log2_32'
13
Returning from 'Log2_32'
14
The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
1755 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
1756                   << UP.MaxCount << '\n');
1757 }
1758}
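To make the defect flagged at line 1754 concrete: Log2_32(Value) is implemented as 31 - countLeadingZeros(Value), and countLeadingZeros(0) returns 32, so Log2_32(0) wraps to 4294967295; shifting the int literal 1 by that amount is the undefined behaviour reported above. The snippet below is a purely illustrative, hypothetical hardening, not a fix taken from the LLVM sources; it assumes <algorithm> is included for std::max.

    // Hypothetical defensive form: clamp the quotient so Log2_32 never sees 0.
    // MaxStridedLoads / StridedLoads == 0 only when StridedLoads > MaxStridedLoads.
    UP.MaxCount = 1 << Log2_32(std::max(1, MaxStridedLoads / StridedLoads));

In practice countStridedLoads appears to return at most MaxStridedLoads / 2 + 1 == 4, so the quotient never reaches zero on a real path; the analyzer simply cannot see that invariant from within getFalkorUnrollingPreferences.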
1759
1760void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1761 TTI::UnrollingPreferences &UP,
1762 OptimizationRemarkEmitter *ORE) {
1763 // Enable partial unrolling and runtime unrolling.
1764 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
1765
1766 UP.UpperBound = true;
1767
1768 // An inner loop is more likely to be hot, and the runtime check can be
1769 // promoted out by the LICM pass, so the overhead is lower; try a larger
1770 // threshold to unroll more loops.
1771 if (L->getLoopDepth() > 1)
1
Assuming the condition is false
2
Taking false branch
1772 UP.PartialThreshold *= 2;
1773
1774 // Disable partial & runtime unrolling on -Os.
1775 UP.PartialOptSizeThreshold = 0;
1776
1777 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3
Assuming the condition is true
5
Taking true branch
1778 EnableFalkorHWPFUnrollFix)
4
Assuming the condition is true
1779 getFalkorUnrollingPreferences(L, SE, UP);
6
Calling 'getFalkorUnrollingPreferences'
1780
1781 // Scan the loop: don't unroll loops with calls as this could prevent
1782 // inlining. Don't unroll vector loops either, as they don't benefit much from
1783 // unrolling.
1784 for (auto *BB : L->getBlocks()) {
1785 for (auto &I : *BB) {
1786 // Don't unroll vectorised loop.
1787 if (I.getType()->isVectorTy())
1788 return;
1789
1790 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1791 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1792 if (!isLoweredToCall(F))
1793 continue;
1794 }
1795 return;
1796 }
1797 }
1798 }
1799
1800 // Enable runtime unrolling for in-order models
1801 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1802 // checking for that case, we can ensure that the default behaviour is
1803 // unchanged
1804 if (ST->getProcFamily() != AArch64Subtarget::Others &&
1805 !ST->getSchedModel().isOutOfOrder()) {
1806 UP.Runtime = true;
1807 UP.Partial = true;
1808 UP.UnrollRemainder = true;
1809 UP.DefaultUnrollRuntimeCount = 4;
1810
1811 UP.UnrollAndJam = true;
1812 UP.UnrollAndJamInnerLoopThreshold = 60;
1813 }
1814}
1815
1816void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1817 TTI::PeelingPreferences &PP) {
1818 BaseT::getPeelingPreferences(L, SE, PP);
1819}
1820
1821Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1822 Type *ExpectedType) {
1823 switch (Inst->getIntrinsicID()) {
1824 default:
1825 return nullptr;
1826 case Intrinsic::aarch64_neon_st2:
1827 case Intrinsic::aarch64_neon_st3:
1828 case Intrinsic::aarch64_neon_st4: {
1829 // Create a struct type
1830 StructType *ST = dyn_cast<StructType>(ExpectedType);
1831 if (!ST)
1832 return nullptr;
1833 unsigned NumElts = Inst->getNumArgOperands() - 1;
1834 if (ST->getNumElements() != NumElts)
1835 return nullptr;
1836 for (unsigned i = 0, e = NumElts; i != e; ++i) {
1837 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
1838 return nullptr;
1839 }
1840 Value *Res = UndefValue::get(ExpectedType);
1841 IRBuilder<> Builder(Inst);
1842 for (unsigned i = 0, e = NumElts; i != e; ++i) {
1843 Value *L = Inst->getArgOperand(i);
1844 Res = Builder.CreateInsertValue(Res, L, i);
1845 }
1846 return Res;
1847 }
1848 case Intrinsic::aarch64_neon_ld2:
1849 case Intrinsic::aarch64_neon_ld3:
1850 case Intrinsic::aarch64_neon_ld4:
1851 if (Inst->getType() == ExpectedType)
1852 return Inst;
1853 return nullptr;
1854 }
1855}
1856
1857bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1858 MemIntrinsicInfo &Info) {
1859 switch (Inst->getIntrinsicID()) {
1860 default:
1861 break;
1862 case Intrinsic::aarch64_neon_ld2:
1863 case Intrinsic::aarch64_neon_ld3:
1864 case Intrinsic::aarch64_neon_ld4:
1865 Info.ReadMem = true;
1866 Info.WriteMem = false;
1867 Info.PtrVal = Inst->getArgOperand(0);
1868 break;
1869 case Intrinsic::aarch64_neon_st2:
1870 case Intrinsic::aarch64_neon_st3:
1871 case Intrinsic::aarch64_neon_st4:
1872 Info.ReadMem = false;
1873 Info.WriteMem = true;
1874 Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
1875 break;
1876 }
1877
1878 switch (Inst->getIntrinsicID()) {
1879 default:
1880 return false;
1881 case Intrinsic::aarch64_neon_ld2:
1882 case Intrinsic::aarch64_neon_st2:
1883 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
1884 break;
1885 case Intrinsic::aarch64_neon_ld3:
1886 case Intrinsic::aarch64_neon_st3:
1887 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
1888 break;
1889 case Intrinsic::aarch64_neon_ld4:
1890 case Intrinsic::aarch64_neon_st4:
1891 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
1892 break;
1893 }
1894 return true;
1895}
1896
1897/// See if \p I should be considered for address type promotion. We check if \p
1898 /// I is a sext with the right type and used in memory accesses. If it is used in a
1899/// "complex" getelementptr, we allow it to be promoted without finding other
1900/// sext instructions that sign extended the same initial value. A getelementptr
1901/// is considered as "complex" if it has more than 2 operands.
1902bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
1903 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
1904 bool Considerable = false;
1905 AllowPromotionWithoutCommonHeader = false;
1906 if (!isa<SExtInst>(&I))
1907 return false;
1908 Type *ConsideredSExtType =
1909 Type::getInt64Ty(I.getParent()->getParent()->getContext());
1910 if (I.getType() != ConsideredSExtType)
1911 return false;
1912 // See if the sext is the one with the right type and used in at least one
1913 // GetElementPtrInst.
1914 for (const User *U : I.users()) {
1915 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
1916 Considerable = true;
1917 // A getelementptr is considered "complex" if it has more than 2
1918 // operands. We will promote a SExt used in such a complex GEP, as we
1919 // expect some of the computation to be merged if it is done on 64 bits.
1920 if (GEPInst->getNumOperands() > 2) {
1921 AllowPromotionWithoutCommonHeader = true;
1922 break;
1923 }
1924 }
1925 }
1926 return Considerable;
1927}
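For illustration (my example, not from the report): given %e = sext i32 %i to i64, a user such as getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %e has three operands (the pointer plus two indices), so it counts as "complex" and AllowPromotionWithoutCommonHeader is set. A flat getelementptr i32, ptr %p, i64 %e has only two operands, so the sext is still considered, but only together with other sexts of the same initial value.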
1928
1929bool AArch64TTIImpl::isLegalToVectorizeReduction(
1930 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
1931 if (!VF.isScalable())
1932 return true;
1933
1934 Type *Ty = RdxDesc.getRecurrenceType();
1935 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
1936 return false;
1937
1938 switch (RdxDesc.getRecurrenceKind()) {
1939 case RecurKind::Add:
1940 case RecurKind::FAdd:
1941 case RecurKind::And:
1942 case RecurKind::Or:
1943 case RecurKind::Xor:
1944 case RecurKind::SMin:
1945 case RecurKind::SMax:
1946 case RecurKind::UMin:
1947 case RecurKind::UMax:
1948 case RecurKind::FMin:
1949 case RecurKind::FMax:
1950 return true;
1951 default:
1952 return false;
1953 }
1954}
1955
1956InstructionCost
1957AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1958 bool IsUnsigned,
1959 TTI::TargetCostKind CostKind) {
1960 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1961
1962 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
1963 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1964
1965 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
1966 "Both vector needs to be equally scalable");
1967
1968 InstructionCost LegalizationCost = 0;
1969 if (LT.first > 1) {
1970 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
1971 unsigned MinMaxOpcode =
1972 Ty->isFPOrFPVectorTy()
1973 ? Intrinsic::maxnum
1974 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
1975 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
1976 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
1977 }
1978
1979 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
1980}
1981
1982InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
1983 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
1984 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1985 InstructionCost LegalizationCost = 0;
1986 if (LT.first > 1) {
1987 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
1988 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
1989 LegalizationCost *= LT.first - 1;
1990 }
1991
1992 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1993 assert(ISD && "Invalid opcode");
1994 // Add the final reduction cost for the legal horizontal reduction
1995 switch (ISD) {
1996 case ISD::ADD:
1997 case ISD::AND:
1998 case ISD::OR:
1999 case ISD::XOR:
2000 case ISD::FADD:
2001 return LegalizationCost + 2;
2002 default:
2003 return InstructionCost::getInvalid();
2004 }
2005}
2006
2007InstructionCost
2008AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2009 Optional<FastMathFlags> FMF,
2010 TTI::TargetCostKind CostKind) {
2011 if (TTI::requiresOrderedReduction(FMF)) {
2012 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2013 InstructionCost BaseCost =
2014 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2015 // Add on extra cost to reflect the extra overhead on some CPUs. We still
2016 // end up vectorizing for more computationally intensive loops.
2017 return BaseCost + FixedVTy->getNumElements();
2018 }
2019
2020 if (Opcode != Instruction::FAdd)
2021 return InstructionCost::getInvalid();
2022
2023 auto *VTy = cast<ScalableVectorType>(ValTy);
2024 InstructionCost Cost =
2025 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2026 Cost *= getMaxNumElements(VTy->getElementCount());
2027 return Cost;
2028 }
2029
2030 if (isa<ScalableVectorType>(ValTy))
2031 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2032
2033 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2034 MVT MTy = LT.second;
2035 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2036 assert(ISD && "Invalid opcode");
2037
2038 // Horizontal adds can use the 'addv' instruction. We model the cost of these
2039 // instructions as twice a normal vector add, plus 1 for each legalization
2040 // step (LT.first). This is the only arithmetic vector reduction operation for
2041 // which we have an instruction.
2042 // OR, XOR and AND costs should match the codegen from:
2043 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2044 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2045 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
2046 static const CostTblEntry CostTblNoPairwise[]{
2047 {ISD::ADD, MVT::v8i8, 2},
2048 {ISD::ADD, MVT::v16i8, 2},
2049 {ISD::ADD, MVT::v4i16, 2},
2050 {ISD::ADD, MVT::v8i16, 2},
2051 {ISD::ADD, MVT::v4i32, 2},
2052 {ISD::OR, MVT::v8i8, 15},
2053 {ISD::OR, MVT::v16i8, 17},
2054 {ISD::OR, MVT::v4i16, 7},
2055 {ISD::OR, MVT::v8i16, 9},
2056 {ISD::OR, MVT::v2i32, 3},
2057 {ISD::OR, MVT::v4i32, 5},
2058 {ISD::OR, MVT::v2i64, 3},
2059 {ISD::XOR, MVT::v8i8, 15},
2060 {ISD::XOR, MVT::v16i8, 17},
2061 {ISD::XOR, MVT::v4i16, 7},
2062 {ISD::XOR, MVT::v8i16, 9},
2063 {ISD::XOR, MVT::v2i32, 3},
2064 {ISD::XOR, MVT::v4i32, 5},
2065 {ISD::XOR, MVT::v2i64, 3},
2066 {ISD::AND, MVT::v8i8, 15},
2067 {ISD::AND, MVT::v16i8, 17},
2068 {ISD::AND, MVT::v4i16, 7},
2069 {ISD::AND, MVT::v8i16, 9},
2070 {ISD::AND, MVT::v2i32, 3},
2071 {ISD::AND, MVT::v4i32, 5},
2072 {ISD::AND, MVT::v2i64, 3},
2073 };
2074 switch (ISD) {
2075 default:
2076 break;
2077 case ISD::ADD:
2078 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2079 return (LT.first - 1) + Entry->Cost;
2080 break;
2081 case ISD::XOR:
2082 case ISD::AND:
2083 case ISD::OR:
2084 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2085 if (!Entry)
2086 break;
2087 auto *ValVTy = cast<FixedVectorType>(ValTy);
2088 if (!ValVTy->getElementType()->isIntegerTy(1) &&
2089 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2090 isPowerOf2_32(ValVTy->getNumElements())) {
2091 InstructionCost ExtraCost = 0;
2092 if (LT.first != 1) {
2093 // Type needs to be split, so there is an extra cost of LT.first - 1
2094 // arithmetic ops.
2095 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2096 MTy.getVectorNumElements());
2097 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2098 ExtraCost *= LT.first - 1;
2099 }
2100 return Entry->Cost + ExtraCost;
2101 }
2102 break;
2103 }
2104 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2105}
2106
2107InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2108 static const CostTblEntry ShuffleTbl[] = {
2109 { TTI::SK_Splice, MVT::nxv16i8, 1 },
2110 { TTI::SK_Splice, MVT::nxv8i16, 1 },
2111 { TTI::SK_Splice, MVT::nxv4i32, 1 },
2112 { TTI::SK_Splice, MVT::nxv2i64, 1 },
2113 { TTI::SK_Splice, MVT::nxv2f16, 1 },
2114 { TTI::SK_Splice, MVT::nxv4f16, 1 },
2115 { TTI::SK_Splice, MVT::nxv8f16, 1 },
2116 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2117 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2118 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2119 { TTI::SK_Splice, MVT::nxv2f32, 1 },
2120 { TTI::SK_Splice, MVT::nxv4f32, 1 },
2121 { TTI::SK_Splice, MVT::nxv2f64, 1 },
2122 };
2123
2124 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2125 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2126 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2127 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2128 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2129 : LT.second;
2130 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2131 InstructionCost LegalizationCost = 0;
2132 if (Index < 0) {
2133 LegalizationCost =
2134 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2135 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2136 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2137 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2138 }
2139
2140 // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2141 // The cost is computed on the promoted type.
2142 if (LT.second.getScalarType() == MVT::i1) {
2143 LegalizationCost +=
2144 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2145 TTI::CastContextHint::None, CostKind) +
2146 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2147 TTI::CastContextHint::None, CostKind);
2148 }
2149 const auto *Entry =
2150 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2151 assert(Entry && "Illegal Type for Splice");
2152 LegalizationCost += Entry->Cost;
2153 return LegalizationCost * LT.first;
2154}
2155
2156InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2157 VectorType *Tp,
2158 ArrayRef<int> Mask, int Index,
2159 VectorType *SubTp) {
2160 Kind = improveShuffleKindFromMask(Kind, Mask);
2161 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2162 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2163 Kind == TTI::SK_Reverse) {
2164 static const CostTblEntry ShuffleTbl[] = {
2165 // Broadcast shuffle kinds can be performed with 'dup'.
2166 { TTI::SK_Broadcast, MVT::v8i8, 1 },
2167 { TTI::SK_Broadcast, MVT::v16i8, 1 },
2168 { TTI::SK_Broadcast, MVT::v4i16, 1 },
2169 { TTI::SK_Broadcast, MVT::v8i16, 1 },
2170 { TTI::SK_Broadcast, MVT::v2i32, 1 },
2171 { TTI::SK_Broadcast, MVT::v4i32, 1 },
2172 { TTI::SK_Broadcast, MVT::v2i64, 1 },
2173 { TTI::SK_Broadcast, MVT::v2f32, 1 },
2174 { TTI::SK_Broadcast, MVT::v4f32, 1 },
2175 { TTI::SK_Broadcast, MVT::v2f64, 1 },
2176 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2177 // 'zip1/zip2' instructions.
2178 { TTI::SK_Transpose, MVT::v8i8, 1 },
2179 { TTI::SK_Transpose, MVT::v16i8, 1 },
2180 { TTI::SK_Transpose, MVT::v4i16, 1 },
2181 { TTI::SK_Transpose, MVT::v8i16, 1 },
2182 { TTI::SK_Transpose, MVT::v2i32, 1 },
2183 { TTI::SK_Transpose, MVT::v4i32, 1 },
2184 { TTI::SK_Transpose, MVT::v2i64, 1 },
2185 { TTI::SK_Transpose, MVT::v2f32, 1 },
2186 { TTI::SK_Transpose, MVT::v4f32, 1 },
2187 { TTI::SK_Transpose, MVT::v2f64, 1 },
2188 // Select shuffle kinds.
2189 // TODO: handle vXi8/vXi16.
2190 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2191 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2192 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2193 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2194 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2195 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2196 // PermuteSingleSrc shuffle kinds.
2197 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2198 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2199 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2200 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2201 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2202 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2203 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2204 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2205 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2206 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2207 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2208 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2209 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2210 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2211 // Reverse can be lowered with `rev`.
2212 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2213 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2214 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2215 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2216 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2217 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2218 // Broadcast shuffle kinds for scalable vectors
2219 { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2220 { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2221 { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2222 { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2223 { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2224 { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2225 { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2226 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2227 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2228 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2229 { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2230 { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2231 { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2232 { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2233 { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2234 { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2235 { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2236 // Handle the cases for vector.reverse with scalable vectors
2237 { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2238 { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2239 { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2240 { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2241 { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2242 { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2243 { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2244 { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2245 { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2246 { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2247 { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2248 { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2249 { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2250 { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2251 { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2252 { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2253 { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2254 };
2255 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2256 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2257 return LT.first * Entry->Cost;
2258 }
2259 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2260 return getSpliceCost(Tp, Index);
2261 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2262}

/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count the number of 0's from the least significant bit to the most,
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
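A quick trace of the portable bisection fallback above (the GCC/MSVC specializations use the builtins instead), taking Val = 0x28 (binary 101000) as uint32_t: the low 16 bits and low 8 bits are non-zero, the low 4 bits are non-zero (0x8), then the low 2 bits are zero so Val becomes 0xA and ZeroBits becomes 2, and finally the low bit is zero so Val becomes 0x5 and ZeroBits becomes 3; the result is 3, the number of trailing zeros in 0x28.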
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count the number of 0's from the most significant bit to the least,
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
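Worked values for the four mask helpers, with T = uint8_t (my examples): maskTrailingOnes<uint8_t>(3) == 0x07, maskLeadingOnes<uint8_t>(3) == 0xE0, maskTrailingZeros<uint8_t>(3) == 0xF8, and maskLeadingZeros<uint8_t>(3) == 0x1F. The explicit N == 0 case in maskTrailingOnes matters for 64-bit T, where T(-1) >> 64 would shift by the full width of the type, the same class of undefined behaviour this report flags at AArch64TargetTransformInfo.cpp:1754.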
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
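A worked example of the XOR trick above (mine, not from the header's comments): for a non-zero uint32_t, countLeadingZeros is at most 31, and since digits - 1 == 31 is all ones in the low five bits, clz ^ 31 equals 31 - clz. For Val = 0x28, countLeadingZeros is 26 and 26 ^ 31 == 5, the index of the highest set bit.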
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
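// Illustrative sketch (not part of this header): isUInt<N> is a strict
// upper-bound check, and the separate N >= 64 overload makes isUInt<64>
// trivially true without evaluating an out-of-range shift.
static_assert(isUInt<12>(4095) && !isUInt<12>(4096), "12-bit unsigned bound");
static_assert(isUInt<64>(UINT64_MAX), "the N >= 64 overload is always true");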
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
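// Illustrative sketch (not part of this header): the shift-free formulations
// above keep the N == 64 edge case well defined. intBoundsSketch is a
// hypothetical spot check, assuming the <cstdint> limit macros this header
// already relies on.
inline void intBoundsSketch() {
  assert(maxUIntN(64) == UINT64_MAX && maxUIntN(8) == 255);
  assert(minIntN(64) == INT64_MIN && maxIntN(64) == INT64_MAX);
  assert(minIntN(8) == -128 && maxIntN(8) == 127);
}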
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
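// Illustrative sketch (not part of this header): the SWAR fallback and the
// __builtin_popcount paths agree on the number of set bits. popCountSketch is
// a hypothetical helper name.
inline void popCountSketch() {
  assert(countPopulation(0xF000F000u) == 8);
  assert(countPopulation(UINT64_MAX) == 64);
}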
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
    12: Returning the value 4294967295
598}
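// Sketch of the behavior the analyzer is tracking here (not part of this
// header): with the default ZB_Width behavior, countLeadingZeros(0u) returns
// 32, so Log2_32(0) computes 31 - 32 in unsigned arithmetic and wraps to
// 4294967295, the value reported as being returned at this step. Any caller
// that feeds this result into a shift count must exclude zero first.
// log2ZeroSketch is a hypothetical helper name.
inline void log2ZeroSketch() {
  assert(Log2_32(0) == 4294967295u && "the documented -1 is all-ones");
  assert(Log2_32(1) == 0 && Log2_32(32) == 5 && Log2_32(6) == 2);
}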
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
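// Illustrative sketch (not part of this header): (A | B) & -(A | B) isolates
// the lowest set bit of the combined value, i.e. the largest power of two
// dividing both arguments; a zero argument contributes no constraint.
static_assert(MinAlign(8, 4) == 4 && MinAlign(48, 32) == 16,
              "greatest common power-of-two factor");
static_assert(MinAlign(12, 0) == 4, "zero offset leaves the alignment of 12");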
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
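// Illustrative sketch (not part of this header): the shifts smear the highest
// set bit into every lower position, so A becomes 2^k - 1 and A + 1 is the
// next power of two strictly greater than the input; an all-ones input wraps
// to zero, as documented. nextPowerOf2Sketch is a hypothetical helper name.
inline void nextPowerOf2Sketch() {
  assert(NextPowerOf2(0) == 1 && NextPowerOf2(5) == 8 && NextPowerOf2(8) == 16);
  assert(NextPowerOf2(UINT64_MAX) == 0 && "overflow returns zero");
}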
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value and is
752/// \p Skew mod \p Align. \p Align must be non-zero
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
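// Illustrative sketch (not part of this header): alignTo rounds up and
// alignDown rounds down onto the same lattice Align * N + Skew. alignSketch is
// a hypothetical helper name.
inline void alignSketch() {
  assert(alignTo(17, 8) == 24 && alignDown(17, 8) == 16);
  assert(alignTo(17, 8, 1) == 17 && alignDown(17, 8, 1) == 17);
  assert(divideCeil(17, 8) == 3 && divideNearest(17, 8) == 2);
}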
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
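// Illustrative sketch (not part of this header): the value is shifted to the
// top of the word and arithmetically shifted back down, replicating bit B-1
// into the upper bits. signExtendSketch is a hypothetical helper name.
inline void signExtendSketch() {
  assert(SignExtend32<8>(0xFF) == -1 && SignExtend32<8>(0x7F) == 127);
  assert(SignExtend32<12>(0x800) == -2048 && "bit 11 is the sign bit");
  assert(SignExtend64(0xFFFFFFFFull, 32) == -1 && "runtime-width variant");
}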
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
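// Illustrative sketch (not part of this header): the Log2_64 comparison settles
// the clearly-fitting and clearly-overflowing cases cheaply; only the boundary
// case falls through to the half-width multiply-and-add path. saturatingSketch
// is a hypothetical spot check.
inline void saturatingSketch() {
  bool Ov = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Ov) == 255 && Ov);
  assert(SaturatingMultiply<uint16_t>(300, 300, &Ov) == 65535 && Ov);
  assert(SaturatingMultiply<uint16_t>(255, 255, &Ov) == 65025 && !Ov);
}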
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
912#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
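// Illustrative sketch (not part of this header): these helpers return a
// nonzero value on overflow and always store the two's complement truncated
// result. overflowSketch is a hypothetical spot check, assuming the <cstdint>
// limit macros this header already relies on.
inline void overflowSketch() {
  int32_t R;
  assert(AddOverflow<int32_t>(INT32_MAX, 1, R) && R == INT32_MIN);
  assert(!SubOverflow<int32_t>(-1, INT32_MAX, R) && R == INT32_MIN);
  assert(MulOverflow<int32_t>(1 << 16, 1 << 16, R) && R == 0);
}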
959
960} // End llvm namespace
961
962#endif