Bug Summary

File: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 343, column 8
Called C++ object pointer is null
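
The warning points at processPhiNode: the analyzer assumes the dyn_cast<PHINode> on line 339 can return null, and because this build defines NDEBUG the assert on line 340 is compiled out, leaving the dereference on line 343 unguarded. A minimal sketch of the kind of explicit early exit that would satisfy the checker is shown below; the actual upstream fix may differ.

    static Optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                  IntrinsicInst &II) {
      auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
      // Under -DNDEBUG the assert below this point is a no-op, so bail out
      // explicitly instead of relying on it to reject a non-PHI operand.
      if (!PN)
        return None;

      // Don't create a new Phi unless we can remove the old one.
      if (!PN->hasOneUse())
        return None;
      // ... rest of the rewrite unchanged ...
    }

As the path steps show, the caller instCombineConvertFromSVBool already checks isa<PHINode> before calling, so such a guard would mainly restate that invariant in a form the analyzer can follow.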

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "MCTargetDesc/AArch64AddressingModes.h"
12#include "llvm/Analysis/IVDescriptors.h"
13#include "llvm/Analysis/LoopInfo.h"
14#include "llvm/Analysis/TargetTransformInfo.h"
15#include "llvm/CodeGen/BasicTTIImpl.h"
16#include "llvm/CodeGen/CostTable.h"
17#include "llvm/CodeGen/TargetLowering.h"
18#include "llvm/IR/Intrinsics.h"
19#include "llvm/IR/IntrinsicInst.h"
20#include "llvm/IR/IntrinsicsAArch64.h"
21#include "llvm/IR/PatternMatch.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/Transforms/InstCombine/InstCombiner.h"
24#include <algorithm>
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "aarch64tti"
29
30static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31 cl::init(true), cl::Hidden);
32
33bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
34 const Function *Callee) const {
35 const TargetMachine &TM = getTLI()->getTargetMachine();
36
37 const FeatureBitset &CallerBits =
38 TM.getSubtargetImpl(*Caller)->getFeatureBits();
39 const FeatureBitset &CalleeBits =
40 TM.getSubtargetImpl(*Callee)->getFeatureBits();
41
42 // Inline a callee if its target-features are a subset of the caller's
43 // target-features.
44 return (CallerBits & CalleeBits) == CalleeBits;
45}
46
47/// Calculate the cost of materializing a 64-bit value. This helper
48/// method might only calculate a fraction of a larger immediate. Therefore it
49/// is valid to return a cost of ZERO.
50InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
51 // Check if the immediate can be encoded within an instruction.
52 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
53 return 0;
54
55 if (Val < 0)
56 Val = ~Val;
57
58 // Calculate how many moves we will need to materialize this constant.
59 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
60 AArch64_IMM::expandMOVImm(Val, 64, Insn);
61 return Insn.size();
62}
63
64/// Calculate the cost of materializing the given constant.
65InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
66 TTI::TargetCostKind CostKind) {
67 assert(Ty->isIntegerTy());
68
69 unsigned BitSize = Ty->getPrimitiveSizeInBits();
70 if (BitSize == 0)
71 return ~0U;
72
73 // Sign-extend all constants to a multiple of 64-bit.
74 APInt ImmVal = Imm;
75 if (BitSize & 0x3f)
76 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
77
78 // Split the constant into 64-bit chunks and calculate the cost for each
79 // chunk.
80 InstructionCost Cost = 0;
81 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
82 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
83 int64_t Val = Tmp.getSExtValue();
84 Cost += getIntImmCost(Val);
85 }
86 // We need at least one instruction to materialize the constant.
87 return std::max<InstructionCost>(1, Cost);
88}
89
90InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
91 const APInt &Imm, Type *Ty,
92 TTI::TargetCostKind CostKind,
93 Instruction *Inst) {
94 assert(Ty->isIntegerTy());
95
96 unsigned BitSize = Ty->getPrimitiveSizeInBits();
97 // There is no cost model for constants with a bit size of 0. Return TCC_Free
98 // here, so that constant hoisting will ignore this constant.
99 if (BitSize == 0)
100 return TTI::TCC_Free;
101
102 unsigned ImmIdx = ~0U;
103 switch (Opcode) {
104 default:
105 return TTI::TCC_Free;
106 case Instruction::GetElementPtr:
107 // Always hoist the base address of a GetElementPtr.
108 if (Idx == 0)
109 return 2 * TTI::TCC_Basic;
110 return TTI::TCC_Free;
111 case Instruction::Store:
112 ImmIdx = 0;
113 break;
114 case Instruction::Add:
115 case Instruction::Sub:
116 case Instruction::Mul:
117 case Instruction::UDiv:
118 case Instruction::SDiv:
119 case Instruction::URem:
120 case Instruction::SRem:
121 case Instruction::And:
122 case Instruction::Or:
123 case Instruction::Xor:
124 case Instruction::ICmp:
125 ImmIdx = 1;
126 break;
127 // Always return TCC_Free for the shift value of a shift instruction.
128 case Instruction::Shl:
129 case Instruction::LShr:
130 case Instruction::AShr:
131 if (Idx == 1)
132 return TTI::TCC_Free;
133 break;
134 case Instruction::Trunc:
135 case Instruction::ZExt:
136 case Instruction::SExt:
137 case Instruction::IntToPtr:
138 case Instruction::PtrToInt:
139 case Instruction::BitCast:
140 case Instruction::PHI:
141 case Instruction::Call:
142 case Instruction::Select:
143 case Instruction::Ret:
144 case Instruction::Load:
145 break;
146 }
147
148 if (Idx == ImmIdx) {
149 int NumConstants = (BitSize + 63) / 64;
150 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
151 return (Cost <= NumConstants * TTI::TCC_Basic)
152 ? static_cast<int>(TTI::TCC_Free)
153 : Cost;
154 }
155 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
156}
157
158InstructionCost
159AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
160 const APInt &Imm, Type *Ty,
161 TTI::TargetCostKind CostKind) {
162 assert(Ty->isIntegerTy());
163
164 unsigned BitSize = Ty->getPrimitiveSizeInBits();
165 // There is no cost model for constants with a bit size of 0. Return TCC_Free
166 // here, so that constant hoisting will ignore this constant.
167 if (BitSize == 0)
168 return TTI::TCC_Free;
169
170 // Most (all?) AArch64 intrinsics do not support folding immediates into the
171 // selected instruction, so we compute the materialization cost for the
172 // immediate directly.
173 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
174 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
175
176 switch (IID) {
177 default:
178 return TTI::TCC_Free;
179 case Intrinsic::sadd_with_overflow:
180 case Intrinsic::uadd_with_overflow:
181 case Intrinsic::ssub_with_overflow:
182 case Intrinsic::usub_with_overflow:
183 case Intrinsic::smul_with_overflow:
184 case Intrinsic::umul_with_overflow:
185 if (Idx == 1) {
186 int NumConstants = (BitSize + 63) / 64;
187 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
188 return (Cost <= NumConstants * TTI::TCC_Basic)
189 ? static_cast<int>(TTI::TCC_Free)
190 : Cost;
191 }
192 break;
193 case Intrinsic::experimental_stackmap:
194 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
195 return TTI::TCC_Free;
196 break;
197 case Intrinsic::experimental_patchpoint_void:
198 case Intrinsic::experimental_patchpoint_i64:
199 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
200 return TTI::TCC_Free;
201 break;
202 case Intrinsic::experimental_gc_statepoint:
203 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
204 return TTI::TCC_Free;
205 break;
206 }
207 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
208}
209
210TargetTransformInfo::PopcntSupportKind
211AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
212 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
213 if (TyWidth == 32 || TyWidth == 64)
214 return TTI::PSK_FastHardware;
215 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
216 return TTI::PSK_Software;
217}
218
219InstructionCost
220AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
221 TTI::TargetCostKind CostKind) {
222 auto *RetTy = ICA.getReturnType();
223 switch (ICA.getID()) {
224 case Intrinsic::umin:
225 case Intrinsic::umax:
226 case Intrinsic::smin:
227 case Intrinsic::smax: {
228 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
229 MVT::v8i16, MVT::v2i32, MVT::v4i32};
230 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
231 // v2i64 types get converted to cmp+bif hence the cost of 2
232 if (LT.second == MVT::v2i64)
233 return LT.first * 2;
234 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
235 return LT.first;
236 break;
237 }
238 case Intrinsic::sadd_sat:
239 case Intrinsic::ssub_sat:
240 case Intrinsic::uadd_sat:
241 case Intrinsic::usub_sat: {
242 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
243 MVT::v8i16, MVT::v2i32, MVT::v4i32,
244 MVT::v2i64};
245 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
246 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
247 // need to extend the type, as it uses shr(qadd(shl, shl)).
248 unsigned Instrs =
249 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
250 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
251 return LT.first * Instrs;
252 break;
253 }
254 case Intrinsic::abs: {
255 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
256 MVT::v8i16, MVT::v2i32, MVT::v4i32,
257 MVT::v2i64};
258 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
259 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
260 return LT.first;
261 break;
262 }
263 case Intrinsic::experimental_stepvector: {
264 InstructionCost Cost = 1; // Cost of the `index' instruction
265 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
266 // Legalisation of illegal vectors involves an `index' instruction plus
267 // (LT.first - 1) vector adds.
268 if (LT.first > 1) {
269 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
270 InstructionCost AddCost =
271 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
272 Cost += AddCost * (LT.first - 1);
273 }
274 return Cost;
275 }
276 case Intrinsic::bitreverse: {
277 static const CostTblEntry BitreverseTbl[] = {
278 {Intrinsic::bitreverse, MVT::i32, 1},
279 {Intrinsic::bitreverse, MVT::i64, 1},
280 {Intrinsic::bitreverse, MVT::v8i8, 1},
281 {Intrinsic::bitreverse, MVT::v16i8, 1},
282 {Intrinsic::bitreverse, MVT::v4i16, 2},
283 {Intrinsic::bitreverse, MVT::v8i16, 2},
284 {Intrinsic::bitreverse, MVT::v2i32, 2},
285 {Intrinsic::bitreverse, MVT::v4i32, 2},
286 {Intrinsic::bitreverse, MVT::v1i64, 2},
287 {Intrinsic::bitreverse, MVT::v2i64, 2},
288 };
289 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
290 const auto *Entry =
291 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
292 // The cost model uses the legal type (i32) that i8 and i16 will be converted
293 // to, plus 1 so that we match the actual lowering cost.
294 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
295 TLI->getValueType(DL, RetTy, true) == MVT::i16)
296 return LegalisationCost.first * Entry->Cost + 1;
297 if (Entry)
298 return LegalisationCost.first * Entry->Cost;
299 break;
300 }
301 case Intrinsic::ctpop: {
302 static const CostTblEntry CtpopCostTbl[] = {
303 {ISD::CTPOP, MVT::v2i64, 4},
304 {ISD::CTPOP, MVT::v4i32, 3},
305 {ISD::CTPOP, MVT::v8i16, 2},
306 {ISD::CTPOP, MVT::v16i8, 1},
307 {ISD::CTPOP, MVT::i64, 4},
308 {ISD::CTPOP, MVT::v2i32, 3},
309 {ISD::CTPOP, MVT::v4i16, 2},
310 {ISD::CTPOP, MVT::v8i8, 1},
311 {ISD::CTPOP, MVT::i32, 5},
312 };
313 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
314 MVT MTy = LT.second;
315 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
316 // Extra cost of +1 when illegal vector types are legalized by promoting
317 // the integer type.
318 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
319 RetTy->getScalarSizeInBits()
320 ? 1
321 : 0;
322 return LT.first * Entry->Cost + ExtraCost;
323 }
324 break;
325 }
326 default:
327 break;
328 }
329 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
330}
331
332/// The function will remove redundant reinterpret casts in the presence
333/// of control flow.
334static Optional<Instruction *> processPhiNode(InstCombiner &IC,
335 IntrinsicInst &II) {
336 SmallVector<Instruction *, 32> Worklist;
337 auto RequiredType = II.getType();
338
339 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
6. Assuming the object is not a 'PHINode'
7. 'PN' initialized to a null pointer value
340 assert(PN && "Expected Phi Node!");
341
342 // Don't create a new Phi unless we can remove the old one.
343 if (!PN->hasOneUse())
8. Called C++ object pointer is null
344 return None;
345
346 for (Value *IncValPhi : PN->incoming_values()) {
347 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
348 if (!Reinterpret ||
349 Reinterpret->getIntrinsicID() !=
350 Intrinsic::aarch64_sve_convert_to_svbool ||
351 RequiredType != Reinterpret->getArgOperand(0)->getType())
352 return None;
353 }
354
355 // Create the new Phi
356 LLVMContext &Ctx = PN->getContext();
357 IRBuilder<> Builder(Ctx);
358 Builder.SetInsertPoint(PN);
359 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
360 Worklist.push_back(PN);
361
362 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
363 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
364 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
365 Worklist.push_back(Reinterpret);
366 }
367
368 // Cleanup Phi Node and reinterprets
369 return IC.replaceInstUsesWith(II, NPN);
370}
371
372static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
373 IntrinsicInst &II) {
374 // If the reinterpret instruction operand is a PHI Node
375 if (isa<PHINode>(II.getArgOperand(0)))
3. Assuming the object is a 'PHINode'
4. Taking true branch
376 return processPhiNode(IC, II);
5. Calling 'processPhiNode'
377
378 SmallVector<Instruction *, 32> CandidatesForRemoval;
379 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
380
381 const auto *IVTy = cast<VectorType>(II.getType());
382
383 // Walk the chain of conversions.
384 while (Cursor) {
385 // If the type of the cursor has fewer lanes than the final result, zeroing
386 // must take place, which breaks the equivalence chain.
387 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
388 if (CursorVTy->getElementCount().getKnownMinValue() <
389 IVTy->getElementCount().getKnownMinValue())
390 break;
391
392 // If the cursor has the same type as I, it is a viable replacement.
393 if (Cursor->getType() == IVTy)
394 EarliestReplacement = Cursor;
395
396 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
397
398 // If this is not an SVE conversion intrinsic, this is the end of the chain.
399 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
400 Intrinsic::aarch64_sve_convert_to_svbool ||
401 IntrinsicCursor->getIntrinsicID() ==
402 Intrinsic::aarch64_sve_convert_from_svbool))
403 break;
404
405 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
406 Cursor = IntrinsicCursor->getOperand(0);
407 }
408
409 // If no viable replacement in the conversion chain was found, there is
410 // nothing to do.
411 if (!EarliestReplacement)
412 return None;
413
414 return IC.replaceInstUsesWith(II, EarliestReplacement);
415}
416
417static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
418 IntrinsicInst &II) {
419 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
420 if (!Pg)
421 return None;
422
423 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
424 return None;
425
426 const auto PTruePattern =
427 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
428 if (PTruePattern != AArch64SVEPredPattern::vl1)
429 return None;
430
431 // The intrinsic is inserting into lane zero so use an insert instead.
432 auto *IdxTy = Type::getInt64Ty(II.getContext());
433 auto *Insert = InsertElementInst::Create(
434 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
435 Insert->insertBefore(&II);
436 Insert->takeName(&II);
437
438 return IC.replaceInstUsesWith(II, Insert);
439}
440
441static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
442 IntrinsicInst &II) {
443 LLVMContext &Ctx = II.getContext();
444 IRBuilder<> Builder(Ctx);
445 Builder.SetInsertPoint(&II);
446
447 // Check that the predicate is all active
448 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
449 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
450 return None;
451
452 const auto PTruePattern =
453 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
454 if (PTruePattern != AArch64SVEPredPattern::all)
455 return None;
456
457 // Check that we have a compare of zero..
458 auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
459 if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
460 return None;
461
462 auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
463 if (!DupXArg || !DupXArg->isZero())
464 return None;
465
466 // ..against a dupq
467 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
468 if (!DupQLane ||
469 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
470 return None;
471
472 // Where the dupq is a lane 0 replicate of a vector insert
473 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
474 return None;
475
476 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
477 if (!VecIns ||
478 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
479 return None;
480
481 // Where the vector insert is a fixed constant vector insert into undef at
482 // index zero
483 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
484 return None;
485
486 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
487 return None;
488
489 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
490 if (!ConstVec)
491 return None;
492
493 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
494 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
495 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
496 return None;
497
498 unsigned NumElts = VecTy->getNumElements();
499 unsigned PredicateBits = 0;
500
501 // Expand intrinsic operands to a 16-bit byte level predicate
502 for (unsigned I = 0; I < NumElts; ++I) {
503 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
504 if (!Arg)
505 return None;
506 if (!Arg->isZero())
507 PredicateBits |= 1 << (I * (16 / NumElts));
508 }
509
510 // If all bits are zero bail early with an empty predicate
511 if (PredicateBits == 0) {
512 auto *PFalse = Constant::getNullValue(II.getType());
513 PFalse->takeName(&II);
514 return IC.replaceInstUsesWith(II, PFalse);
515 }
516
517 // Calculate largest predicate type used (where byte predicate is largest)
518 unsigned Mask = 8;
519 for (unsigned I = 0; I < 16; ++I)
520 if ((PredicateBits & (1 << I)) != 0)
521 Mask |= (I % 8);
522
523 unsigned PredSize = Mask & -Mask;
524 auto *PredType = ScalableVectorType::get(
525 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
526
527 // Ensure all relevant bits are set
528 for (unsigned I = 0; I < 16; I += PredSize)
529 if ((PredicateBits & (1 << I)) == 0)
530 return None;
531
532 auto *PTruePat =
533 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
534 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
535 {PredType}, {PTruePat});
536 auto *ConvertToSVBool = Builder.CreateIntrinsic(
537 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
538 auto *ConvertFromSVBool =
539 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
540 {II.getType()}, {ConvertToSVBool});
541
542 ConvertFromSVBool->takeName(&II);
543 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
544}
545
546static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
547 IntrinsicInst &II) {
548 IRBuilder<> Builder(II.getContext());
549 Builder.SetInsertPoint(&II);
550 Value *Pg = II.getArgOperand(0);
551 Value *Vec = II.getArgOperand(1);
552 auto IntrinsicID = II.getIntrinsicID();
553 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
554
555 // lastX(splat(X)) --> X
556 if (auto *SplatVal = getSplatValue(Vec))
557 return IC.replaceInstUsesWith(II, SplatVal);
558
559 // If x and/or y is a splat value then:
560 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
561 Value *LHS, *RHS;
562 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
563 if (isSplatValue(LHS) || isSplatValue(RHS)) {
564 auto *OldBinOp = cast<BinaryOperator>(Vec);
565 auto OpC = OldBinOp->getOpcode();
566 auto *NewLHS =
567 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
568 auto *NewRHS =
569 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
570 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
571 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
572 return IC.replaceInstUsesWith(II, NewBinOp);
573 }
574 }
575
576 auto *C = dyn_cast<Constant>(Pg);
577 if (IsAfter && C && C->isNullValue()) {
578 // The intrinsic is extracting lane 0 so use an extract instead.
579 auto *IdxTy = Type::getInt64Ty(II.getContext());
580 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
581 Extract->insertBefore(&II);
582 Extract->takeName(&II);
583 return IC.replaceInstUsesWith(II, Extract);
584 }
585
586 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
587 if (!IntrPG)
588 return None;
589
590 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
591 return None;
592
593 const auto PTruePattern =
594 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
595
596 // Can the intrinsic's predicate be converted to a known constant index?
597 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
598 if (!MinNumElts)
599 return None;
600
601 unsigned Idx = MinNumElts - 1;
602 // Increment the index if extracting the element after the last active
603 // predicate element.
604 if (IsAfter)
605 ++Idx;
606
607 // Ignore extracts whose index is larger than the known minimum vector
608 // length. NOTE: This is an artificial constraint where we prefer to
609 // maintain what the user asked for until an alternative is proven faster.
610 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
611 if (Idx >= PgVTy->getMinNumElements())
612 return None;
613
614 // The intrinsic is extracting a fixed lane so use an extract instead.
615 auto *IdxTy = Type::getInt64Ty(II.getContext());
616 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
617 Extract->insertBefore(&II);
618 Extract->takeName(&II);
619 return IC.replaceInstUsesWith(II, Extract);
620}
621
622static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
623 IntrinsicInst &II) {
624 LLVMContext &Ctx = II.getContext();
625 IRBuilder<> Builder(Ctx);
626 Builder.SetInsertPoint(&II);
627 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
628 // can work with RDFFR_PP for ptest elimination.
629 auto *AllPat =
630 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
631 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
632 {II.getType()}, {AllPat});
633 auto *RDFFR =
634 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
635 RDFFR->takeName(&II);
636 return IC.replaceInstUsesWith(II, RDFFR);
637}
638
639static Optional<Instruction *>
640instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
641 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
642
643 if (Pattern == AArch64SVEPredPattern::all) {
644 LLVMContext &Ctx = II.getContext();
645 IRBuilder<> Builder(Ctx);
646 Builder.SetInsertPoint(&II);
647
648 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
649 auto *VScale = Builder.CreateVScale(StepVal);
650 VScale->takeName(&II);
651 return IC.replaceInstUsesWith(II, VScale);
652 }
653
654 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
655
656 return MinNumElts && NumElts >= MinNumElts
657 ? Optional<Instruction *>(IC.replaceInstUsesWith(
658 II, ConstantInt::get(II.getType(), MinNumElts)))
659 : None;
660}
661
662static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
663 IntrinsicInst &II) {
664 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
665 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
666
667 if (Op1 && Op2 &&
668 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
669 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
670 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
671
672 IRBuilder<> Builder(II.getContext());
673 Builder.SetInsertPoint(&II);
674
675 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
676 Type *Tys[] = {Op1->getArgOperand(0)->getType()};
677
678 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
679
680 PTest->takeName(&II);
681 return IC.replaceInstUsesWith(II, PTest);
682 }
683
684 return None;
685}
686
687static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
688 IntrinsicInst &II) {
689 auto *OpPredicate = II.getOperand(0);
690 auto *OpMultiplicand = II.getOperand(1);
691 auto *OpMultiplier = II.getOperand(2);
692
693 IRBuilder<> Builder(II.getContext());
694 Builder.SetInsertPoint(&II);
695
696 // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
697 // with a unit splat value, false otherwise.
698 auto IsUnitDupX = [](auto *I) {
699 auto *IntrI = dyn_cast<IntrinsicInst>(I);
700 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
701 return false;
702
703 auto *SplatValue = IntrI->getOperand(0);
704 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
705 };
706
707 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
708 // with a unit splat value, false otherwise.
709 auto IsUnitDup = [](auto *I) {
710 auto *IntrI = dyn_cast<IntrinsicInst>(I);
711 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
712 return false;
713
714 auto *SplatValue = IntrI->getOperand(2);
715 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
716 };
717
718 // The OpMultiplier variable should always point to the dup (if any), so
719 // swap if necessary.
720 if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
721 std::swap(OpMultiplier, OpMultiplicand);
722
723 if (IsUnitDupX(OpMultiplier)) {
724 // [f]mul pg (dupx 1) %n => %n
725 OpMultiplicand->takeName(&II);
726 return IC.replaceInstUsesWith(II, OpMultiplicand);
727 } else if (IsUnitDup(OpMultiplier)) {
728 // [f]mul pg (dup pg 1) %n => %n
729 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
730 auto *DupPg = DupInst->getOperand(1);
731 // TODO: this is naive. The optimization is still valid if DupPg
732 // 'encompasses' OpPredicate, not only if they're the same predicate.
733 if (OpPredicate == DupPg) {
734 OpMultiplicand->takeName(&II);
735 return IC.replaceInstUsesWith(II, OpMultiplicand);
736 }
737 }
738
739 return None;
740}
741
742static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
743 IntrinsicInst &II) {
744 IRBuilder<> Builder(II.getContext());
745 Builder.SetInsertPoint(&II);
746 Value *UnpackArg = II.getArgOperand(0);
747 auto *RetTy = cast<ScalableVectorType>(II.getType());
748 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
749 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
750
751 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
752 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
753 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
754 ScalarArg =
755 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
756 Value *NewVal =
757 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
758 NewVal->takeName(&II);
759 return IC.replaceInstUsesWith(II, NewVal);
760 }
761
762 return None;
763}
764static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
765 IntrinsicInst &II) {
766 auto *OpVal = II.getOperand(0);
767 auto *OpIndices = II.getOperand(1);
768 VectorType *VTy = cast<VectorType>(II.getType());
769
770 // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
771 // constant splat value < minimal element count of result.
772 auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
773 if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
774 return None;
775
776 auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
777 if (!SplatValue ||
778 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
779 return None;
780
781 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
782 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
783 IRBuilder<> Builder(II.getContext());
784 Builder.SetInsertPoint(&II);
785 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
786 auto *VectorSplat =
787 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
788
789 VectorSplat->takeName(&II);
790 return IC.replaceInstUsesWith(II, VectorSplat);
791}
792
793Optional<Instruction *>
794AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
795 IntrinsicInst &II) const {
796 Intrinsic::ID IID = II.getIntrinsicID();
797 switch (IID) {
1. Control jumps to 'case aarch64_sve_convert_from_svbool:' at line 800
798 default:
799 break;
800 case Intrinsic::aarch64_sve_convert_from_svbool:
801 return instCombineConvertFromSVBool(IC, II);
2. Calling 'instCombineConvertFromSVBool'
802 case Intrinsic::aarch64_sve_dup:
803 return instCombineSVEDup(IC, II);
804 case Intrinsic::aarch64_sve_cmpne:
805 case Intrinsic::aarch64_sve_cmpne_wide:
806 return instCombineSVECmpNE(IC, II);
807 case Intrinsic::aarch64_sve_rdffr:
808 return instCombineRDFFR(IC, II);
809 case Intrinsic::aarch64_sve_lasta:
810 case Intrinsic::aarch64_sve_lastb:
811 return instCombineSVELast(IC, II);
812 case Intrinsic::aarch64_sve_cntd:
813 return instCombineSVECntElts(IC, II, 2);
814 case Intrinsic::aarch64_sve_cntw:
815 return instCombineSVECntElts(IC, II, 4);
816 case Intrinsic::aarch64_sve_cnth:
817 return instCombineSVECntElts(IC, II, 8);
818 case Intrinsic::aarch64_sve_cntb:
819 return instCombineSVECntElts(IC, II, 16);
820 case Intrinsic::aarch64_sve_ptest_any:
821 case Intrinsic::aarch64_sve_ptest_first:
822 case Intrinsic::aarch64_sve_ptest_last:
823 return instCombineSVEPTest(IC, II);
824 case Intrinsic::aarch64_sve_mul:
825 case Intrinsic::aarch64_sve_fmul:
826 return instCombineSVEVectorMul(IC, II);
827 case Intrinsic::aarch64_sve_tbl:
828 return instCombineSVETBL(IC, II);
829 case Intrinsic::aarch64_sve_uunpkhi:
830 case Intrinsic::aarch64_sve_uunpklo:
831 case Intrinsic::aarch64_sve_sunpkhi:
832 case Intrinsic::aarch64_sve_sunpklo:
833 return instCombineSVEUnpack(IC, II);
834 }
835
836 return None;
837}
838
839bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
840 ArrayRef<const Value *> Args) {
841
842 // A helper that returns a vector type from the given type. The number of
843 // elements in type Ty determines the vector width.
844 auto toVectorTy = [&](Type *ArgTy) {
845 return VectorType::get(ArgTy->getScalarType(),
846 cast<VectorType>(DstTy)->getElementCount());
847 };
848
849 // Exit early if DstTy is not a vector type whose elements are at least
850 // 16-bits wide.
851 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
852 return false;
853
854 // Determine if the operation has a widening variant. We consider both the
855 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
856 // instructions.
857 //
858 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
859 // verify that their extending operands are eliminated during code
860 // generation.
861 switch (Opcode) {
862 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
863 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
864 break;
865 default:
866 return false;
867 }
868
869 // To be a widening instruction (either the "wide" or "long" versions), the
870 // second operand must be a sign- or zero extend having a single user. We
871 // only consider extends having a single user because they may otherwise not
872 // be eliminated.
873 if (Args.size() != 2 ||
874 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
875 !Args[1]->hasOneUse())
876 return false;
877 auto *Extend = cast<CastInst>(Args[1]);
878
879 // Legalize the destination type and ensure it can be used in a widening
880 // operation.
881 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
882 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
883 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
884 return false;
885
886 // Legalize the source type and ensure it can be used in a widening
887 // operation.
888 auto *SrcTy = toVectorTy(Extend->getSrcTy());
889 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
890 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
891 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
892 return false;
893
894 // Get the total number of vector elements in the legalized types.
895 InstructionCost NumDstEls =
896 DstTyL.first * DstTyL.second.getVectorMinNumElements();
897 InstructionCost NumSrcEls =
898 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
899
900 // Return true if the legalized types have the same number of vector elements
901 // and the destination element type size is twice that of the source type.
902 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
903}
904
905InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
906 Type *Src,
907 TTI::CastContextHint CCH,
908 TTI::TargetCostKind CostKind,
909 const Instruction *I) {
910 int ISD = TLI->InstructionOpcodeToISD(Opcode);
911 assert(ISD && "Invalid opcode");
912
913 // If the cast is observable, and it is used by a widening instruction (e.g.,
914 // uaddl, saddw, etc.), it may be free.
915 if (I && I->hasOneUse()) {
916 auto *SingleUser = cast<Instruction>(*I->user_begin());
917 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
918 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
919 // If the cast is the second operand, it is free. We will generate either
920 // a "wide" or "long" version of the widening instruction.
921 if (I == SingleUser->getOperand(1))
922 return 0;
923 // If the cast is not the second operand, it will be free if it looks the
924 // same as the second operand. In this case, we will generate a "long"
925 // version of the widening instruction.
926 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
927 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
928 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
929 return 0;
930 }
931 }
932
933 // TODO: Allow non-throughput costs that aren't binary.
934 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
935 if (CostKind != TTI::TCK_RecipThroughput)
936 return Cost == 0 ? 0 : 1;
937 return Cost;
938 };
939
940 EVT SrcTy = TLI->getValueType(DL, Src);
941 EVT DstTy = TLI->getValueType(DL, Dst);
942
943 if (!SrcTy.isSimple() || !DstTy.isSimple())
944 return AdjustCost(
945 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
946
947 static const TypeConversionCostTblEntry
948 ConversionTbl[] = {
949 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
950 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
951 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
952 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
953
954 // Truncations on nxvmiN
955 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
956 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
957 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
958 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
959 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
960 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
961 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
962 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
963 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
964 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
965 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
966 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
967 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
968 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
969 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
970 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
971
972 // The number of shll instructions for the extension.
973 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
974 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
975 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
976 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
977 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
978 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
979 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
980 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
981 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
982 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
983 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
984 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
985 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
986 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
987 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
988 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
989
990 // LowerVectorINT_TO_FP:
991 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
992 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
993 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
994 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
995 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
996 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
997
998 // Complex: to v2f32
999 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1000 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1001 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1002 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1003 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1004 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1005
1006 // Complex: to v4f32
1007 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1008 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1009 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1010 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1011
1012 // Complex: to v8f32
1013 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1014 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1015 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1016 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1017
1018 // Complex: to v16f32
1019 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1020 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1021
1022 // Complex: to v2f64
1023 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1024 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1025 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1026 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1027 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1028 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1029
1030
1031 // LowerVectorFP_TO_INT
1032 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1033 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1034 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1035 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1036 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1037 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1038
1039 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1040 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1041 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1042 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1043 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1044 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1045 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1046
1047 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1048 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1049 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1050 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1051 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1052
1053 // Complex, from nxv2f32.
1054 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1055 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1056 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1057 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1058 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1059 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1060 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1061 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1062
1063 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1064 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1065 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1066 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1067 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1068 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1069 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1070
1071 // Complex, from nxv2f64.
1072 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1073 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1074 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1075 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1076 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1077 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1078 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1079 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1080
1081 // Complex, from nxv4f32.
1082 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1083 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1084 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1085 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1086 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1087 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1088 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1089 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1090
1091 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1092 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1093 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1094 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1095 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1096
1097 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1098 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1099 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1100 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1101 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1102 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1103 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1104
1105 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1106 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1107 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1108 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1109 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1110
1111 // Complex, from nxv8f16.
1112 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1113 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1114 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1115 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1116 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1117 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1118 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1119 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1120
1121 // Complex, from nxv4f16.
1122 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1123 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1124 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1125 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1126 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1127 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1128 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1129 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1130
1131 // Complex, from nxv2f16.
1132 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1133 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1134 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1135 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1136 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1137 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1138 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1139 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1140
1141 // Truncate from nxvmf32 to nxvmf16.
1142 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
1143 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
1144 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
1145
1146 // Truncate from nxvmf64 to nxvmf16.
1147 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
1148 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
1149 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
1150
1151 // Truncate from nxvmf64 to nxvmf32.
1152 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
1153 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
1154 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
1155
1156 // Extend from nxvmf16 to nxvmf32.
1157 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
1158 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
1159 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
1160
1161 // Extend from nxvmf16 to nxvmf64.
1162 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
1163 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
1164 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
1165
1166 // Extend from nxvmf32 to nxvmf64.
1167 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
1168 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
1169 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
1170
1171 };
1172
1173 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1174 DstTy.getSimpleVT(),
1175 SrcTy.getSimpleVT()))
1176 return AdjustCost(Entry->Cost);
1177
1178 return AdjustCost(
1179 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1180}
1181
1182InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1183 Type *Dst,
1184 VectorType *VecTy,
1185 unsigned Index) {
1186
1187 // Make sure we were given a valid extend opcode.
1188 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1189 "Invalid opcode");
1190
1191 // We are extending an element we extract from a vector, so the source type
1192 // of the extend is the element type of the vector.
1193 auto *Src = VecTy->getElementType();
1194
1195 // Sign- and zero-extends are for integer types only.
1196 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1197
1198 // Get the cost for the extract. We compute the cost (if any) for the extend
1199 // below.
1200 InstructionCost Cost =
1201 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1202
1203 // Legalize the types.
1204 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1205 auto DstVT = TLI->getValueType(DL, Dst);
1206 auto SrcVT = TLI->getValueType(DL, Src);
1207 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1208
1209 // If the resulting type is still a vector and the destination type is legal,
1210 // we may get the extension for free. If not, get the default cost for the
1211 // extend.
1212 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1213 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1214 CostKind);
1215
1216 // The destination type should be larger than the element type. If not, get
1217 // the default cost for the extend.
1218 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1219 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1220 CostKind);
1221
1222 switch (Opcode) {
1223 default:
1224 llvm_unreachable("Opcode should be either SExt or ZExt");
1225
1226 // For sign-extends, we only need a smov, which performs the extension
1227 // automatically.
1228 case Instruction::SExt:
1229 return Cost;
1230
1231 // For zero-extends, the extend is performed automatically by a umov unless
1232 // the destination type is i64 and the element type is i8 or i16.
1233 case Instruction::ZExt:
1234 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1235 return Cost;
1236 }
1237
1238 // If we are unable to perform the extend for free, get the default cost.
1239 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1240 CostKind);
1241}
1242
1243InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1244 TTI::TargetCostKind CostKind,
1245 const Instruction *I) {
1246 if (CostKind != TTI::TCK_RecipThroughput)
1247 return Opcode == Instruction::PHI ? 0 : 1;
1248 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1249 // Branches are assumed to be predicted.
1250 return 0;
1251}
1252
1253InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1254 unsigned Index) {
1255 assert(Val->isVectorTy() && "This must be a vector type");
1256
1257 if (Index != -1U) {
1258 // Legalize the type.
1259 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1260
1261 // This type is legalized to a scalar type.
1262 if (!LT.second.isVector())
1263 return 0;
1264
1265 // The type may be split. Normalize the index to the new type.
1266 unsigned Width = LT.second.getVectorNumElements();
1267 Index = Index % Width;
1268
1269 // The element at index zero is already inside the vector.
1270 if (Index == 0)
1271 return 0;
1272 }
1273
1274 // All other insert/extracts cost this much.
1275 return ST->getVectorInsertExtractBaseCost();
1276}
1277
1278InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1279 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1280 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1281 TTI::OperandValueProperties Opd1PropInfo,
1282 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1283 const Instruction *CxtI) {
1284 // TODO: Handle more cost kinds.
1285 if (CostKind != TTI::TCK_RecipThroughput)
1286 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1287 Opd2Info, Opd1PropInfo,
1288 Opd2PropInfo, Args, CxtI);
1289
1290 // Legalize the type.
1291 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1292
1293 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1294 // add in the widening overhead specified by the sub-target. Since the
1295 // extends feeding widening instructions are performed automatically, they
1296 // aren't present in the generated code and have a zero cost. By adding a
1297 // widening overhead here, we attach the total cost of the combined operation
1298 // to the widening instruction.
1299 InstructionCost Cost = 0;
1300 if (isWideningInstruction(Ty, Opcode, Args))
1301 Cost += ST->getWideningBaseCost();
1302
1303 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1304
1305 switch (ISD) {
1306 default:
1307 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1308 Opd2Info,
1309 Opd1PropInfo, Opd2PropInfo);
1310 case ISD::SDIV:
1311 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1312 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1313 // On AArch64, scalar signed division by a power-of-two constant is
1314 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1315 // The OperandValue properties may not be the same as those of the previous
1316 // operation; conservatively assume OP_None.
1317 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1318 Opd1Info, Opd2Info,
1319 TargetTransformInfo::OP_None,
1320 TargetTransformInfo::OP_None);
1321 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1322 Opd1Info, Opd2Info,
1323 TargetTransformInfo::OP_None,
1324 TargetTransformInfo::OP_None);
1325 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1326 Opd1Info, Opd2Info,
1327 TargetTransformInfo::OP_None,
1328 TargetTransformInfo::OP_None);
1329 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1330 Opd1Info, Opd2Info,
1331 TargetTransformInfo::OP_None,
1332 TargetTransformInfo::OP_None);
1333 return Cost;
1334 }
1335 LLVM_FALLTHROUGH;
1336 case ISD::UDIV:
1337 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1338 auto VT = TLI->getValueType(DL, Ty);
1339 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1340 // Vector signed division by a constant is expanded to the
1341 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1342 // to MULHS + SUB + SRL + ADD + SRL.
1343 InstructionCost MulCost = getArithmeticInstrCost(
1344 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1345 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1346 InstructionCost AddCost = getArithmeticInstrCost(
1347 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1348 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1349 InstructionCost ShrCost = getArithmeticInstrCost(
1350 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1351 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1352 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1353 }
1354 }
1355
1356 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1357 Opd2Info,
1358 Opd1PropInfo, Opd2PropInfo);
1359 if (Ty->isVectorTy()) {
1360 // On AArch64, vector divisions are not supported natively and are
1361 // expanded into scalar divisions of each pair of elements.
1362 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1363 Opd1Info, Opd2Info, Opd1PropInfo,
1364 Opd2PropInfo);
1365 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1366 Opd1Info, Opd2Info, Opd1PropInfo,
1367 Opd2PropInfo);
1368 // TODO: if one of the arguments is scalar, then it's not necessary to
1369 // double the cost of handling the vector elements.
1370 Cost += Cost;
1371 }
1372 return Cost;
1373
1374 case ISD::MUL:
1375 if (LT.second != MVT::v2i64)
1376 return (Cost + 1) * LT.first;
1377 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1378 // as elements are extracted from the vectors and the muls scalarized.
1379 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1380 // cost for a i64 vector directly here, which is:
1381 // - four i64 extracts,
1382 // - two i64 inserts, and
1383 // - two muls.
1384 // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
1385 // LT.first = 2 the cost is 16.
1386 return LT.first * 8;
1387 case ISD::ADD:
1388 case ISD::XOR:
1389 case ISD::OR:
1390 case ISD::AND:
1391 // These nodes are marked as 'custom' for combining purposes only.
1392 // We know that they are legal. See LowerAdd in ISelLowering.
1393 return (Cost + 1) * LT.first;
1394
1395 case ISD::FADD:
1396 case ISD::FSUB:
1397 case ISD::FMUL:
1398 case ISD::FDIV:
1399 case ISD::FNEG:
1400 // These nodes are marked as 'custom' just to lower them to SVE.
1401 // We know said lowering will incur no additional cost.
1402 if (!Ty->getScalarType()->isFP128Ty())
1403 return (Cost + 2) * LT.first;
1404
1405 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1406 Opd2Info,
1407 Opd1PropInfo, Opd2PropInfo);
1408 }
1409}
1410
1411InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1412 ScalarEvolution *SE,
1413 const SCEV *Ptr) {
1414 // Address computations in vectorized code with non-consecutive addresses will
1415 // likely result in more instructions compared to scalar code where the
1416 // computation can more often be merged into the index mode. The resulting
1417 // extra micro-ops can significantly decrease throughput.
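// As an illustrative example, a vectorized access whose pointer SCEV has a
// constant stride of 256 bytes is not within MaxMergeDistance (64), so its
// address computation is charged NumVectorInstToHideOverhead (10) rather
// than the default cost of 1 returned at the end of this function.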
1418 unsigned NumVectorInstToHideOverhead = 10;
1419 int MaxMergeDistance = 64;
1420
1421 if (Ty->isVectorTy() && SE &&
1422 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1423 return NumVectorInstToHideOverhead;
1424
1425 // In many cases the address computation is not merged into the instruction
1426 // addressing mode.
1427 return 1;
1428}
1429
1430InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1431 Type *CondTy,
1432 CmpInst::Predicate VecPred,
1433 TTI::TargetCostKind CostKind,
1434 const Instruction *I) {
1435 // TODO: Handle other cost kinds.
1436 if (CostKind != TTI::TCK_RecipThroughput)
1437 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1438 I);
1439
1440 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1441 // We don't lower some vector selects that are wider than the register width
1442 // very well.
1443 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1444 // We would need this many instructions to hide the scalarization happening.
1445 const int AmortizationCost = 20;
1446
1447 // If VecPred is not set, check if we can get a predicate from the context
1448 // instruction, if its type matches the requested ValTy.
1449 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1450 CmpInst::Predicate CurrentPred;
1451 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1452 m_Value())))
1453 VecPred = CurrentPred;
1454 }
1455 // Check if we have a compare/select chain that can be lowered using CMxx &
1456 // BFI pair.
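// For example (illustrative IR):
//   %c = icmp sgt <4 x i32> %a, %b
//   %r = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
// can be lowered to a cmgt feeding a bsl/bif on a single 128-bit register,
// so such chains are charged only the legalization cost LT.first below.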
1457 if (CmpInst::isIntPredicate(VecPred)) {
1458 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
1459 MVT::v8i16, MVT::v2i32, MVT::v4i32,
1460 MVT::v2i64};
1461 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1462 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
1463 return LT.first;
1464 }
1465
1466 static const TypeConversionCostTblEntry
1467 VectorSelectTbl[] = {
1468 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1469 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1470 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1471 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1472 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1473 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1474 };
1475
1476 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1477 EVT SelValTy = TLI->getValueType(DL, ValTy);
1478 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1479 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1480 SelCondTy.getSimpleVT(),
1481 SelValTy.getSimpleVT()))
1482 return Entry->Cost;
1483 }
1484 }
1485 // The base case handles scalable vectors fine for now, since it treats the
1486 // cost as 1 * legalization cost.
1487 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1488}
1489
1490AArch64TTIImpl::TTI::MemCmpExpansionOptions
1491AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1492 TTI::MemCmpExpansionOptions Options;
1493 if (ST->requiresStrictAlign()) {
1494 // TODO: Add cost modeling for strict align. Misaligned loads expand to
1495 // a bunch of instructions when strict align is enabled.
1496 return Options;
1497 }
1498 Options.AllowOverlappingLoads = true;
1499 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1500 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1501 // TODO: Though vector loads usually perform well on AArch64, on some targets
1502 // they may wake up the FP unit, which raises the power consumption. Perhaps
1503 // they could be used with no holds barred (-O3).
1504 Options.LoadSizes = {8, 4, 2, 1};
1505 return Options;
1506}
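// As an illustrative consequence of these options, a memcmp of 15 bytes can
// be expanded inline into two (possibly overlapping) 8-byte loads per buffer
// plus compares, rather than a libcall, because AllowOverlappingLoads is set
// and 8 is the largest entry in LoadSizes.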
1507
1508InstructionCost
1509AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1510 Align Alignment, unsigned AddressSpace,
1511 TTI::TargetCostKind CostKind) {
1512 if (useNeonVector(Src))
1513 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1514 CostKind);
1515 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1516 if (!LT.first.isValid())
1517 return InstructionCost::getInvalid();
1518
1519 // The code-generator is currently not able to handle scalable vectors
1520 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1521 // it. This change will be removed when code-generation for these types is
1522 // sufficiently reliable.
1523 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1524 return InstructionCost::getInvalid();
1525
1526 return LT.first * 2;
1527}
1528
1529InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1530 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1531 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1532 if (useNeonVector(DataTy))
1533 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1534 Alignment, CostKind, I);
1535 auto *VT = cast<VectorType>(DataTy);
1536 auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1537 if (!LT.first.isValid())
1538 return InstructionCost::getInvalid();
1539
1540 // The code-generator is currently not able to handle scalable vectors
1541 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1542 // it. This change will be removed when code-generation for these types is
1543 // sufficiently reliable.
1544 if (cast<VectorType>(DataTy)->getElementCount() ==
1545 ElementCount::getScalable(1))
1546 return InstructionCost::getInvalid();
1547
1548 ElementCount LegalVF = LT.second.getVectorElementCount();
1549 InstructionCost MemOpCost =
1550 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1551 return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction());
1552}
1553
1554bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1555 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1556}
1557
1558InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
1559 MaybeAlign Alignment,
1560 unsigned AddressSpace,
1561 TTI::TargetCostKind CostKind,
1562 const Instruction *I) {
1563 EVT VT = TLI->getValueType(DL, Ty, true);
1564 // Type legalization can't handle structs
1565 if (VT == MVT::Other)
1566 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
1567 CostKind);
1568
1569 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
1570 if (!LT.first.isValid())
1571 return InstructionCost::getInvalid();
1572
1573 // The code-generator is currently not able to handle scalable vectors
1574 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1575 // it. This change will be removed when code-generation for these types is
1576 // sufficiently reliable.
1577 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
1578 if (VTy->getElementCount() == ElementCount::getScalable(1))
1579 return InstructionCost::getInvalid();
1580
1581 // TODO: consider latency as well for TCK_SizeAndLatency.
1582 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1583 return LT.first;
1584
1585 if (CostKind != TTI::TCK_RecipThroughput)
1586 return 1;
1587
1588 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
1589 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
1590 // Unaligned stores are extremely inefficient. We don't split all
1591 // unaligned 128-bit stores because of the negative impact that doing so
1592 // has shown in practice on inlined block copy code.
1593 // We make such stores expensive so that we will only vectorize if there
1594 // are 6 other instructions getting vectorized.
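// As a worked example (assuming LT.first = 1), a misaligned 128-bit store is
// charged 1 * 2 * 6 = 12 below, versus 1 when it is sufficiently aligned.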
1595 const int AmortizationCost = 6;
1596
1597 return LT.first * 2 * AmortizationCost;
1598 }
1599
1600 // Check truncating stores and extending loads.
1601 if (useNeonVector(Ty) &&
1602 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1603 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
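// For example (illustrative), a 'store <4 x i8>' becomes an xtn plus a
// single 32-bit scalar store, hence the flat cost of 2 returned below.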
1604 if (VT == MVT::v4i8)
1605 return 2;
1606 // Otherwise we need to scalarize.
1607 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
1608 }
1609
1610 return LT.first;
1611}
1612
1613InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1614 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1615 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1616 bool UseMaskForCond, bool UseMaskForGaps) {
1617 assert(Factor >= 2 && "Invalid interleave factor");
1618 auto *VecVTy = cast<FixedVectorType>(VecTy);
1619
1620 if (!UseMaskForCond && !UseMaskForGaps &&
1621 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1622 unsigned NumElts = VecVTy->getNumElements();
1623 auto *SubVecTy =
1624 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1625
1626 // ldN/stN only support legal vector types of size 64 or 128 in bits.
1627 // Accesses having vector types that are a multiple of 128 bits can be
1628 // matched to more than one ldN/stN instruction.
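// As an illustrative example, an interleaved group with Factor = 2 over a
// <16 x i32> vector uses <8 x i32> sub-vectors; each legalizes to two
// 128-bit accesses, so the cost below is 2 * 2 = 4 ld2/st2 instructions.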
1629 if (NumElts % Factor == 0 &&
1630 TLI->isLegalInterleavedAccessType(SubVecTy, DL))
1631 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1632 }
1633
1634 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1635 Alignment, AddressSpace, CostKind,
1636 UseMaskForCond, UseMaskForGaps);
1637}
1638
1639InstructionCost
1640AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
1641 InstructionCost Cost = 0;
1642 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1643 for (auto *I : Tys) {
1644 if (!I->isVectorTy())
1645 continue;
1646 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
1647 128)
1648 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
1649 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
1650 }
1651 return Cost;
1652}
1653
1654unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1655 return ST->getMaxInterleaveFactor();
1656}
1657
1658// For Falkor, we want to avoid having too many strided loads in a loop since
1659// that can exhaust the HW prefetcher resources. We adjust the unroller
1660// MaxCount preference below to attempt to ensure unrolling doesn't create too
1661// many strided loads.
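// As a worked example, a loop body containing 3 strided loads gets
// UP.MaxCount = 1 << Log2_32(7 / 3) = 2 below, since unrolling 3x or more
// would create more than MaxStridedLoads (7) strided loads.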
1662static void
1663getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1664 TargetTransformInfo::UnrollingPreferences &UP) {
1665 enum { MaxStridedLoads = 7 };
1666 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
1667 int StridedLoads = 0;
1668 // FIXME? We could make this more precise by looking at the CFG and
1669 // e.g. not counting loads in each side of an if-then-else diamond.
1670 for (const auto BB : L->blocks()) {
1671 for (auto &I : *BB) {
1672 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
1673 if (!LMemI)
1674 continue;
1675
1676 Value *PtrValue = LMemI->getPointerOperand();
1677 if (L->isLoopInvariant(PtrValue))
1678 continue;
1679
1680 const SCEV *LSCEV = SE.getSCEV(PtrValue);
1681 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
1682 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
1683 continue;
1684
1685 // FIXME? We could take pairing of unrolled load copies into account
1686 // by looking at the AddRec, but we would probably have to limit this
1687 // to loops with no stores or other memory optimization barriers.
1688 ++StridedLoads;
1689 // We've seen enough strided loads that seeing more won't make a
1690 // difference.
1691 if (StridedLoads > MaxStridedLoads / 2)
1692 return StridedLoads;
1693 }
1694 }
1695 return StridedLoads;
1696 };
1697
1698 int StridedLoads = countStridedLoads(L, SE);
1699 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
1700 << " strided loads\n");
1701 // Pick the largest power of 2 unroll count that won't result in too many
1702 // strided loads.
1703 if (StridedLoads) {
1704 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
1705 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
1706 << UP.MaxCount << '\n');
1707 }
1708}
1709
1710void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1711 TTI::UnrollingPreferences &UP,
1712 OptimizationRemarkEmitter *ORE) {
1713 // Enable partial unrolling and runtime unrolling.
1714 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
1715
1716 UP.UpperBound = true;
1717
1718 // An inner loop is more likely to be hot, and the runtime check can be
1719 // promoted out of the loop by the LICM pass, so the overhead is lower; try
1720 // a larger threshold to unroll more loops.
1721 if (L->getLoopDepth() > 1)
1722 UP.PartialThreshold *= 2;
1723
1724 // Disable partial & runtime unrolling on -Os.
1725 UP.PartialOptSizeThreshold = 0;
1726
1727 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
1728 EnableFalkorHWPFUnrollFix)
1729 getFalkorUnrollingPreferences(L, SE, UP);
1730
1731 // Scan the loop: don't unroll loops with calls as this could prevent
1732 // inlining. Don't unroll vector loops either, as they don't benefit much from
1733 // unrolling.
1734 for (auto *BB : L->getBlocks()) {
1735 for (auto &I : *BB) {
1736 // Don't unroll vectorized loops.
1737 if (I.getType()->isVectorTy())
1738 return;
1739
1740 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1741 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1742 if (!isLoweredToCall(F))
1743 continue;
1744 }
1745 return;
1746 }
1747 }
1748 }
1749
1750 // Enable runtime unrolling for in-order models.
1751 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1752 // checking for that case, we can ensure that the default behaviour is
1753 // unchanged.
1754 if (ST->getProcFamily() != AArch64Subtarget::Others &&
1755 !ST->getSchedModel().isOutOfOrder()) {
1756 UP.Runtime = true;
1757 UP.Partial = true;
1758 UP.UnrollRemainder = true;
1759 UP.DefaultUnrollRuntimeCount = 4;
1760
1761 UP.UnrollAndJam = true;
1762 UP.UnrollAndJamInnerLoopThreshold = 60;
1763 }
1764}
1765
1766void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1767 TTI::PeelingPreferences &PP) {
1768 BaseT::getPeelingPreferences(L, SE, PP);
1769}
1770
1771Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1772 Type *ExpectedType) {
1773 switch (Inst->getIntrinsicID()) {
1774 default:
1775 return nullptr;
1776 case Intrinsic::aarch64_neon_st2:
1777 case Intrinsic::aarch64_neon_st3:
1778 case Intrinsic::aarch64_neon_st4: {
1779 // Create a struct type
1780 StructType *ST = dyn_cast<StructType>(ExpectedType);
1781 if (!ST)
1782 return nullptr;
1783 unsigned NumElts = Inst->getNumArgOperands() - 1;
1784 if (ST->getNumElements() != NumElts)
1785 return nullptr;
1786 for (unsigned i = 0, e = NumElts; i != e; ++i) {
1787 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
1788 return nullptr;
1789 }
1790 Value *Res = UndefValue::get(ExpectedType);
1791 IRBuilder<> Builder(Inst);
1792 for (unsigned i = 0, e = NumElts; i != e; ++i) {
1793 Value *L = Inst->getArgOperand(i);
1794 Res = Builder.CreateInsertValue(Res, L, i);
1795 }
1796 return Res;
1797 }
1798 case Intrinsic::aarch64_neon_ld2:
1799 case Intrinsic::aarch64_neon_ld3:
1800 case Intrinsic::aarch64_neon_ld4:
1801 if (Inst->getType() == ExpectedType)
1802 return Inst;
1803 return nullptr;
1804 }
1805}
1806
1807bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1808 MemIntrinsicInfo &Info) {
1809 switch (Inst->getIntrinsicID()) {
1810 default:
1811 break;
1812 case Intrinsic::aarch64_neon_ld2:
1813 case Intrinsic::aarch64_neon_ld3:
1814 case Intrinsic::aarch64_neon_ld4:
1815 Info.ReadMem = true;
1816 Info.WriteMem = false;
1817 Info.PtrVal = Inst->getArgOperand(0);
1818 break;
1819 case Intrinsic::aarch64_neon_st2:
1820 case Intrinsic::aarch64_neon_st3:
1821 case Intrinsic::aarch64_neon_st4:
1822 Info.ReadMem = false;
1823 Info.WriteMem = true;
1824 Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
1825 break;
1826 }
1827
1828 switch (Inst->getIntrinsicID()) {
1829 default:
1830 return false;
1831 case Intrinsic::aarch64_neon_ld2:
1832 case Intrinsic::aarch64_neon_st2:
1833 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
1834 break;
1835 case Intrinsic::aarch64_neon_ld3:
1836 case Intrinsic::aarch64_neon_st3:
1837 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
1838 break;
1839 case Intrinsic::aarch64_neon_ld4:
1840 case Intrinsic::aarch64_neon_st4:
1841 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
1842 break;
1843 }
1844 return true;
1845}
1846
1847 /// See if \p I should be considered for address type promotion. We check if \p
1848 /// I is a sext with the right type that is used in memory accesses. If it is
1849 /// used in a "complex" getelementptr, we allow it to be promoted without
1850 /// finding other sext instructions that sign-extended the same initial value.
1851 /// A getelementptr is considered "complex" if it has more than 2 operands.
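/// For example (illustrative IR), in
///   %idx.ext = sext i32 %idx to i64
///   %p = getelementptr inbounds i32, i32* %base, i64 %idx.ext
/// the sext feeds a GEP and is therefore considerable, while a GEP such as
///   getelementptr inbounds [16 x i32], [16 x i32]* %a, i64 0, i64 %idx.ext
/// has more than 2 operands and is treated as "complex".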
1852bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
1853 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
1854 bool Considerable = false;
1855 AllowPromotionWithoutCommonHeader = false;
1856 if (!isa<SExtInst>(&I))
1857 return false;
1858 Type *ConsideredSExtType =
1859 Type::getInt64Ty(I.getParent()->getParent()->getContext());
1860 if (I.getType() != ConsideredSExtType)
1861 return false;
1862 // See if the sext is the one with the right type and used in at least one
1863 // GetElementPtrInst.
1864 for (const User *U : I.users()) {
1865 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
1866 Considerable = true;
1867 // A getelementptr is considered "complex" if it has more than 2
1868 // operands. We will promote a SExt used in such a complex GEP, as we
1869 // expect some of the computation to be merged if it is done on 64 bits.
1870 if (GEPInst->getNumOperands() > 2) {
1871 AllowPromotionWithoutCommonHeader = true;
1872 break;
1873 }
1874 }
1875 }
1876 return Considerable;
1877}
1878
1879bool AArch64TTIImpl::isLegalToVectorizeReduction(
1880 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
1881 if (!VF.isScalable())
1882 return true;
1883
1884 Type *Ty = RdxDesc.getRecurrenceType();
1885 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
1886 return false;
1887
1888 switch (RdxDesc.getRecurrenceKind()) {
1889 case RecurKind::Add:
1890 case RecurKind::FAdd:
1891 case RecurKind::And:
1892 case RecurKind::Or:
1893 case RecurKind::Xor:
1894 case RecurKind::SMin:
1895 case RecurKind::SMax:
1896 case RecurKind::UMin:
1897 case RecurKind::UMax:
1898 case RecurKind::FMin:
1899 case RecurKind::FMax:
1900 return true;
1901 default:
1902 return false;
1903 }
1904}
1905
1906InstructionCost
1907AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1908 bool IsUnsigned,
1909 TTI::TargetCostKind CostKind) {
1910 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1911
1912 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
1913 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1914
1915 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
1916 "Both vectors need to be equally scalable");
1917
1918 InstructionCost LegalizationCost = 0;
1919 if (LT.first > 1) {
1920 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
1921 unsigned MinMaxOpcode =
1922 Ty->isFPOrFPVectorTy()
1923 ? Intrinsic::maxnum
1924 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
1925 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
1926 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
1927 }
1928
1929 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
1930}
1931
1932InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
1933 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
1934 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1935 InstructionCost LegalizationCost = 0;
1936 if (LT.first > 1) {
1937 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
1938 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
1939 LegalizationCost *= LT.first - 1;
1940 }
1941
1942 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1943 assert(ISD && "Invalid opcode");
1944 // Add the final reduction cost for the legal horizontal reduction
1945 switch (ISD) {
1946 case ISD::ADD:
1947 case ISD::AND:
1948 case ISD::OR:
1949 case ISD::XOR:
1950 case ISD::FADD:
1951 return LegalizationCost + 2;
1952 default:
1953 return InstructionCost::getInvalid();
1954 }
1955}
1956
1957InstructionCost
1958AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1959 Optional<FastMathFlags> FMF,
1960 TTI::TargetCostKind CostKind) {
1961 if (TTI::requiresOrderedReduction(FMF)) {
1962 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
1963 InstructionCost BaseCost =
1964 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1965 // Add on extra cost to reflect the extra overhead on some CPUs. We still
1966 // end up vectorizing for more computationally intensive loops.
1967 return BaseCost + FixedVTy->getNumElements();
1968 }
1969
1970 if (Opcode != Instruction::FAdd)
1971 return InstructionCost::getInvalid();
1972
1973 auto *VTy = cast<ScalableVectorType>(ValTy);
1974 InstructionCost Cost =
1975 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
1976 Cost *= getMaxNumElements(VTy->getElementCount());
1977 return Cost;
1978 }
1979
1980 if (isa<ScalableVectorType>(ValTy))
1981 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
1982
1983 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1984 MVT MTy = LT.second;
1985 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1986 assert(ISD && "Invalid opcode");
1987
1988 // Horizontal adds can use the 'addv' instruction. We model the cost of these
1989 // instructions as twice a normal vector add, plus 1 for each legalization
1990 // step (LT.first). This is the only arithmetic vector reduction operation for
1991 // which we have an instruction.
1992 // OR, XOR and AND costs should match the codegen from:
1993 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
1994 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
1995 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
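// As a worked example, a v8i16 add reduction that legalizes in one step
// (LT.first = 1) costs 2 from the table below; if the type has to be split
// once first (LT.first = 2), the result is (2 - 1) + 2 = 3.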
1996 static const CostTblEntry CostTblNoPairwise[]{
1997 {ISD::ADD, MVT::v8i8, 2},
1998 {ISD::ADD, MVT::v16i8, 2},
1999 {ISD::ADD, MVT::v4i16, 2},
2000 {ISD::ADD, MVT::v8i16, 2},
2001 {ISD::ADD, MVT::v4i32, 2},
2002 {ISD::OR, MVT::v8i8, 15},
2003 {ISD::OR, MVT::v16i8, 17},
2004 {ISD::OR, MVT::v4i16, 7},
2005 {ISD::OR, MVT::v8i16, 9},
2006 {ISD::OR, MVT::v2i32, 3},
2007 {ISD::OR, MVT::v4i32, 5},
2008 {ISD::OR, MVT::v2i64, 3},
2009 {ISD::XOR, MVT::v8i8, 15},
2010 {ISD::XOR, MVT::v16i8, 17},
2011 {ISD::XOR, MVT::v4i16, 7},
2012 {ISD::XOR, MVT::v8i16, 9},
2013 {ISD::XOR, MVT::v2i32, 3},
2014 {ISD::XOR, MVT::v4i32, 5},
2015 {ISD::XOR, MVT::v2i64, 3},
2016 {ISD::AND, MVT::v8i8, 15},
2017 {ISD::AND, MVT::v16i8, 17},
2018 {ISD::AND, MVT::v4i16, 7},
2019 {ISD::AND, MVT::v8i16, 9},
2020 {ISD::AND, MVT::v2i32, 3},
2021 {ISD::AND, MVT::v4i32, 5},
2022 {ISD::AND, MVT::v2i64, 3},
2023 };
2024 switch (ISD) {
2025 default:
2026 break;
2027 case ISD::ADD:
2028 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2029 return (LT.first - 1) + Entry->Cost;
2030 break;
2031 case ISD::XOR:
2032 case ISD::AND:
2033 case ISD::OR:
2034 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2035 if (!Entry)
2036 break;
2037 auto *ValVTy = cast<FixedVectorType>(ValTy);
2038 if (!ValVTy->getElementType()->isIntegerTy(1) &&
2039 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2040 isPowerOf2_32(ValVTy->getNumElements())) {
2041 InstructionCost ExtraCost = 0;
2042 if (LT.first != 1) {
2043 // Type needs to be split, so there is an extra cost of LT.first - 1
2044 // arithmetic ops.
2045 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2046 MTy.getVectorNumElements());
2047 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2048 ExtraCost *= LT.first - 1;
2049 }
2050 return Entry->Cost + ExtraCost;
2051 }
2052 break;
2053 }
2054 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2055}
2056
2057InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2058 static const CostTblEntry ShuffleTbl[] = {
2059 { TTI::SK_Splice, MVT::nxv16i8, 1 },
2060 { TTI::SK_Splice, MVT::nxv8i16, 1 },
2061 { TTI::SK_Splice, MVT::nxv4i32, 1 },
2062 { TTI::SK_Splice, MVT::nxv2i64, 1 },
2063 { TTI::SK_Splice, MVT::nxv2f16, 1 },
2064 { TTI::SK_Splice, MVT::nxv4f16, 1 },
2065 { TTI::SK_Splice, MVT::nxv8f16, 1 },
2066 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2067 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2068 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2069 { TTI::SK_Splice, MVT::nxv2f32, 1 },
2070 { TTI::SK_Splice, MVT::nxv4f32, 1 },
2071 { TTI::SK_Splice, MVT::nxv2f64, 1 },
2072 };
2073
2074 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2075 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2076 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2077 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2078 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2079 : LT.second;
2080 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2081 InstructionCost LegalizationCost = 0;
2082 if (Index < 0) {
2083 LegalizationCost =
2084 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2085 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2086 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2087 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2088 }
2089
2090 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
2091 // The cost is computed on the promoted type.
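// For example (illustrative), splicing two nxv16i1 predicates is costed on
// the promoted nxv16i8 type: a zext and a trunc are charged on top of the
// single-instruction splice entry from the table above.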
2092 if (LT.second.getScalarType() == MVT::i1) {
2093 LegalizationCost +=
2094 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2095 TTI::CastContextHint::None, CostKind) +
2096 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2097 TTI::CastContextHint::None, CostKind);
2098 }
2099 const auto *Entry =
2100 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2101 assert(Entry && "Illegal Type for Splice");
2102 LegalizationCost += Entry->Cost;
2103 return LegalizationCost * LT.first;
2104}
2105
2106InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2107 VectorType *Tp,
2108 ArrayRef<int> Mask, int Index,
2109 VectorType *SubTp) {
2110 Kind = improveShuffleKindFromMask(Kind, Mask);
2111 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2112 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2113 Kind == TTI::SK_Reverse) {
2114 static const CostTblEntry ShuffleTbl[] = {
2115 // Broadcast shuffle kinds can be performed with 'dup'.
2116 { TTI::SK_Broadcast, MVT::v8i8, 1 },
2117 { TTI::SK_Broadcast, MVT::v16i8, 1 },
2118 { TTI::SK_Broadcast, MVT::v4i16, 1 },
2119 { TTI::SK_Broadcast, MVT::v8i16, 1 },
2120 { TTI::SK_Broadcast, MVT::v2i32, 1 },
2121 { TTI::SK_Broadcast, MVT::v4i32, 1 },
2122 { TTI::SK_Broadcast, MVT::v2i64, 1 },
2123 { TTI::SK_Broadcast, MVT::v2f32, 1 },
2124 { TTI::SK_Broadcast, MVT::v4f32, 1 },
2125 { TTI::SK_Broadcast, MVT::v2f64, 1 },
2126 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2127 // 'zip1/zip2' instructions.
2128 { TTI::SK_Transpose, MVT::v8i8, 1 },
2129 { TTI::SK_Transpose, MVT::v16i8, 1 },
2130 { TTI::SK_Transpose, MVT::v4i16, 1 },
2131 { TTI::SK_Transpose, MVT::v8i16, 1 },
2132 { TTI::SK_Transpose, MVT::v2i32, 1 },
2133 { TTI::SK_Transpose, MVT::v4i32, 1 },
2134 { TTI::SK_Transpose, MVT::v2i64, 1 },
2135 { TTI::SK_Transpose, MVT::v2f32, 1 },
2136 { TTI::SK_Transpose, MVT::v4f32, 1 },
2137 { TTI::SK_Transpose, MVT::v2f64, 1 },
2138 // Select shuffle kinds.
2139 // TODO: handle vXi8/vXi16.
2140 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2141 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2142 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2143 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2144 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2145 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2146 // PermuteSingleSrc shuffle kinds.
2147 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2148 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2149 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2150 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2151 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2152 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2153 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2154 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2155 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2156 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2157 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2158 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2159 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2160 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2161 // Reverse can be lowered with `rev`.
2162 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2163 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2164 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2165 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2166 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2167 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2168 // Broadcast shuffle kinds for scalable vectors
2169 { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2170 { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2171 { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2172 { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2173 { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2174 { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2175 { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2176 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2177 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2178 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2179 { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2180 { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2181 { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2182 { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2183 { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2184 { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2185 { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2186 // Handle the cases for vector.reverse with scalable vectors
2187 { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2188 { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2189 { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2190 { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2191 { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2192 { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2193 { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2194 { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2195 { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2196 { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2197 { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2198 { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2199 { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2200 { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2201 { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2202 { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2203 { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2204 };
2205 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2206 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2207 return LT.first * Entry->Cost;
2208 }
2209 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2210 return getSpliceCost(Tp, Index);
2211 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2212}