clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AArch64TargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AArch64 -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include 
-internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AArch64 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | #include "AArch64TargetTransformInfo.h" |
10 | #include "AArch64ExpandImm.h" |
11 | #include "MCTargetDesc/AArch64AddressingModes.h" |
12 | #include "llvm/Analysis/IVDescriptors.h" |
13 | #include "llvm/Analysis/LoopInfo.h" |
14 | #include "llvm/Analysis/TargetTransformInfo.h" |
15 | #include "llvm/CodeGen/BasicTTIImpl.h" |
16 | #include "llvm/CodeGen/CostTable.h" |
17 | #include "llvm/CodeGen/TargetLowering.h" |
18 | #include "llvm/IR/Intrinsics.h" |
19 | #include "llvm/IR/IntrinsicInst.h" |
20 | #include "llvm/IR/IntrinsicsAArch64.h" |
21 | #include "llvm/IR/PatternMatch.h" |
22 | #include "llvm/Support/Debug.h" |
23 | #include "llvm/Transforms/InstCombine/InstCombiner.h" |
24 | #include <algorithm> |
25 | using namespace llvm; |
26 | using namespace llvm::PatternMatch; |
27 | |
28 | #define DEBUG_TYPE "aarch64tti" |
29 | |
// Command-line escape hatch (default on, hidden). Presumably gates the
// Falkor hardware-prefetcher unrolling workaround applied elsewhere in this
// file — TODO confirm against the use site in getUnrollingPreferences.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);
32 | |
/// \returns true if it is legal to inline \p Callee into \p Caller.
/// The callee may be inlined when its subtarget feature set is a subset of
/// the caller's, i.e. the caller can execute every instruction the callee
/// may have been compiled to use.
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}
46 | |
47 | |
48 | |
49 | |
/// Calculate the cost of materializing a 64-bit value. This helper may only
/// compute the cost of one 64-bit chunk of a larger immediate, so a result
/// of ZERO is valid.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded directly within an instruction
  // (zero, or a valid 64-bit logical immediate).
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  // Negative values are materialized via the inverted pattern (MOVN-style),
  // so cost the bitwise complement instead.
  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}
63 | |
64 | |
/// Calculate the cost of materializing the arbitrary-width constant \p Imm
/// of integer type \p Ty by summing the cost of each 64-bit chunk.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend the constant up to the next multiple of 64 bits so it can be
  // split into whole 64-bit chunks.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk independently.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }

  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
89 | |
/// Cost of the immediate \p Imm appearing as operand \p Idx of instruction
/// \p Opcode. Operand positions where AArch64 can encode the immediate
/// directly are reported as TTI::TCC_Free so constant hoisting leaves them
/// alone.
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return
  // TCC_Free here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    // Only the stored value (operand 0) can be an immediate.
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    // Binary ops / compares can fold an immediate in operand position 1.
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift amount of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // No immediate operand position; fall through to the generic cost below.
    break;
  }

  if (Idx == ImmIdx) {
    // Cheap-to-materialize constants (within NumConstants basic moves) are
    // treated as free so they stay next to their use.
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
157 | |
/// Cost of the immediate \p Imm appearing as operand \p Idx of the intrinsic
/// \p IID. Stackmap/patchpoint/statepoint live operands that fit in 64 bits
/// are free; overflow-intrinsic immediates follow the same cheap-constant
/// rule as getIntImmCostInst.
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return
  // TCC_Free here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // AArch64-specific intrinsics in this ID range get the plain
  // materialization cost of the immediate.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      // Constants cheap enough to rematerialize are free.
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    // Operands past the header (ID, numShadowBytes) that fit in 64 bits are
    // recorded, not materialized, so they are free.
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
209 | |
210 | TargetTransformInfo::PopcntSupportKind |
211 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { |
212 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); |
213 | if (TyWidth == 32 || TyWidth == 64) |
214 | return TTI::PSK_FastHardware; |
215 | |
216 | return TTI::PSK_Software; |
217 | } |
218 | |
219 | InstructionCost |
220 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
221 | TTI::TargetCostKind CostKind) { |
222 | auto *RetTy = ICA.getReturnType(); |
223 | switch (ICA.getID()) { |
224 | case Intrinsic::umin: |
225 | case Intrinsic::umax: |
226 | case Intrinsic::smin: |
227 | case Intrinsic::smax: { |
228 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
229 | MVT::v8i16, MVT::v2i32, MVT::v4i32}; |
230 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); |
231 | |
232 | if (LT.second == MVT::v2i64) |
233 | return LT.first * 2; |
234 | if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) |
235 | return LT.first; |
236 | break; |
237 | } |
238 | case Intrinsic::sadd_sat: |
239 | case Intrinsic::ssub_sat: |
240 | case Intrinsic::uadd_sat: |
241 | case Intrinsic::usub_sat: { |
242 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
243 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
244 | MVT::v2i64}; |
245 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); |
246 | |
247 | |
248 | unsigned Instrs = |
249 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; |
250 | if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) |
251 | return LT.first * Instrs; |
252 | break; |
253 | } |
254 | case Intrinsic::abs: { |
255 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
256 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
257 | MVT::v2i64}; |
258 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); |
259 | if (any_of(ValidAbsTys, [<](MVT M) { return M == LT.second; })) |
260 | return LT.first; |
261 | break; |
262 | } |
263 | case Intrinsic::experimental_stepvector: { |
264 | InstructionCost Cost = 1; |
265 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); |
266 | |
267 | |
268 | if (LT.first > 1) { |
269 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); |
270 | InstructionCost AddCost = |
271 | getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); |
272 | Cost += AddCost * (LT.first - 1); |
273 | } |
274 | return Cost; |
275 | } |
276 | case Intrinsic::bitreverse: { |
277 | static const CostTblEntry BitreverseTbl[] = { |
278 | {Intrinsic::bitreverse, MVT::i32, 1}, |
279 | {Intrinsic::bitreverse, MVT::i64, 1}, |
280 | {Intrinsic::bitreverse, MVT::v8i8, 1}, |
281 | {Intrinsic::bitreverse, MVT::v16i8, 1}, |
282 | {Intrinsic::bitreverse, MVT::v4i16, 2}, |
283 | {Intrinsic::bitreverse, MVT::v8i16, 2}, |
284 | {Intrinsic::bitreverse, MVT::v2i32, 2}, |
285 | {Intrinsic::bitreverse, MVT::v4i32, 2}, |
286 | {Intrinsic::bitreverse, MVT::v1i64, 2}, |
287 | {Intrinsic::bitreverse, MVT::v2i64, 2}, |
288 | }; |
289 | const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); |
290 | const auto *Entry = |
291 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); |
292 | |
293 | |
294 | if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || |
295 | TLI->getValueType(DL, RetTy, true) == MVT::i16) |
296 | return LegalisationCost.first * Entry->Cost + 1; |
297 | if (Entry) |
298 | return LegalisationCost.first * Entry->Cost; |
299 | break; |
300 | } |
301 | case Intrinsic::ctpop: { |
302 | static const CostTblEntry CtpopCostTbl[] = { |
303 | {ISD::CTPOP, MVT::v2i64, 4}, |
304 | {ISD::CTPOP, MVT::v4i32, 3}, |
305 | {ISD::CTPOP, MVT::v8i16, 2}, |
306 | {ISD::CTPOP, MVT::v16i8, 1}, |
307 | {ISD::CTPOP, MVT::i64, 4}, |
308 | {ISD::CTPOP, MVT::v2i32, 3}, |
309 | {ISD::CTPOP, MVT::v4i16, 2}, |
310 | {ISD::CTPOP, MVT::v8i8, 1}, |
311 | {ISD::CTPOP, MVT::i32, 5}, |
312 | }; |
313 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); |
314 | MVT MTy = LT.second; |
315 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { |
316 | |
317 | |
318 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != |
319 | RetTy->getScalarSizeInBits() |
320 | ? 1 |
321 | : 0; |
322 | return LT.first * Entry->Cost + ExtraCost; |
323 | } |
324 | break; |
325 | } |
326 | default: |
327 | break; |
328 | } |
329 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
330 | } |
331 | |
332 | |
333 | |
334 | static Optional<Instruction *> processPhiNode(InstCombiner &IC, |
335 | IntrinsicInst &II) { |
336 | SmallVector<Instruction *, 32> Worklist; |
337 | auto RequiredType = II.getType(); |
338 | |
339 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); |
| 6 | | Assuming the object is not a 'PHINode' | |
|
| 7 | | 'PN' initialized to a null pointer value | |
|
340 | assert(PN && "Expected Phi Node!"); |
341 | |
342 | |
343 | if (!PN->hasOneUse()) |
| 8 | | Called C++ object pointer is null |
|
344 | return None; |
345 | |
346 | for (Value *IncValPhi : PN->incoming_values()) { |
347 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); |
348 | if (!Reinterpret || |
349 | Reinterpret->getIntrinsicID() != |
350 | Intrinsic::aarch64_sve_convert_to_svbool || |
351 | RequiredType != Reinterpret->getArgOperand(0)->getType()) |
352 | return None; |
353 | } |
354 | |
355 | |
356 | LLVMContext &Ctx = PN->getContext(); |
357 | IRBuilder<> Builder(Ctx); |
358 | Builder.SetInsertPoint(PN); |
359 | PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); |
360 | Worklist.push_back(PN); |
361 | |
362 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
363 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); |
364 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); |
365 | Worklist.push_back(Reinterpret); |
366 | } |
367 | |
368 | |
369 | return IC.replaceInstUsesWith(II, NPN); |
370 | } |
371 | |
// Simplify a convert_from_svbool by walking the chain of svbool conversions
// feeding it and reusing the earliest value that already has the required
// type, eliminating redundant to/from round trips.
static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI node, try to merge the
  // conversions into the PHI instead.
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

  // Walk backwards through the chain of svbool conversions.
  while (Cursor) {
    // If the cursor has fewer (known minimum) lanes than the result type,
    // information was lost at this point, so stop looking further back.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor already has the intrinsic's result type it is a viable
    // replacement; keep the earliest such value.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not a to/from-svbool conversion, the chain ends here.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no value of the required type was found along the chain there is
  // nothing to do.
  if (!EarliestReplacement)
    return None;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}
416 | |
// Fold sve.dup with a ptrue(vl1) predicate into a plain insertelement into
// lane 0 of the passthru vector, which later passes can optimize further.
static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                 IntrinsicInst &II) {
  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!Pg)
    return None;

  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  // Only a single-lane (vl1) predicate makes the dup equivalent to inserting
  // into lane 0.
  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return None;

  // The intrinsic is inserting into lane zero so use an insert instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Insert = InsertElementInst::Create(
      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);

  return IC.replaceInstUsesWith(II, Insert);
}
440 | |
// Fold an sve.cmpne of a dupq'd constant vector against zero into a direct
// ptrue of the widest predicate type that reproduces the same lane pattern.
static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&II);

  // Check that the predicate is all active.
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return None;

  // Check that we have a compare against zero (a dup_x of 0)..
  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
    return None;

  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
  if (!DupXArg || !DupXArg->isZero())
    return None;

  // ..against a dupq_lane.
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return None;

  // Where the dupq is a lane-0 replicate of a vector insert.
  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return None;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns ||
      VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
    return None;

  // Where the vector insert is a fixed constant vector inserted into undef
  // at index zero.
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return None;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return None;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return None;

  // The fixed vector must have the same (minimum) lane count as the result.
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return None;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand the intrinsic operands to a 16-bit byte-level predicate: each
  // nonzero source lane sets one bit at its byte position.
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return None;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero, bail early with an all-false predicate.
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate the largest predicate element size consistent with the set
  // bits (byte granularity is the largest).
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set: the pattern must be a uniform
  // all-true at this element size, else the fold does not apply.
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return None;

  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                        {PredType}, {PTruePat});
  auto *ConvertToSVBool = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                              {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
545 | |
// Simplify sve.lasta/sve.lastb: fold splats, distribute over one-use binary
// ops with a splat operand, and turn constant-index cases into a plain
// extractelement.
static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                  IntrinsicInst &II) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  //   lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  // lasta with an all-false predicate extracts lane 0, so use an extract.
  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(&II);
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return None;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return None;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return None;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element (lasta).
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length: those lanes may not exist for all runtime vector lengths.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return None;

  // The intrinsic extracts a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}
621 | |
// Replace an unpredicated rdffr with rdffr.z predicated on a ptrue(all),
// giving later passes a predicated form to work with.
static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&II);

  // Build ptrue(all) and feed it to rdffr.z in place of the plain rdffr.
  auto *AllPat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                        {II.getType()}, {AllPat});
  auto *RDFFR =
      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
  RDFFR->takeName(&II);
  return IC.replaceInstUsesWith(II, RDFFR);
}
638 | |
// Simplify sve.cntb/cnth/cntw/cntd (element-count intrinsics, where
// \p NumElts is the per-128-bit-block element count): the "all" pattern
// becomes vscale * NumElts, and fixed vl patterns that are guaranteed to fit
// become their constant count.
static Optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  if (Pattern == AArch64SVEPredPattern::all) {
    // cntX(all) == vscale * NumElts.
    LLVMContext &Ctx = II.getContext();
    IRBuilder<> Builder(Ctx);
    Builder.SetInsertPoint(&II);

    Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
    auto *VScale = Builder.CreateVScale(StepVal);
    VScale->takeName(&II);
    return IC.replaceInstUsesWith(II, VScale);
  }

  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  // A fixed-vl pattern is a constant only if it cannot exceed the minimum
  // element count (NumElts per block).
  return MinNumElts && NumElts >= MinNumElts
             ? Optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : None;
}
661 | |
// ptest(to_svbool(x), to_svbool(y)) with matching source types can operate
// directly on x and y, dropping both conversions.
static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));

  if (Op1 && Op2 &&
      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {

    IRBuilder<> Builder(II.getContext());
    Builder.SetInsertPoint(&II);

    // Rebuild the same ptest variant over the unconverted operands.
    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
    Type *Tys[] = {Op1->getArgOperand(0)->getType()};

    auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  return None;
}
686 | |
// Simplify sve.mul / sve.fmul by one: multiplying by a unit splat is a
// no-op, so the other operand replaces the intrinsic.
static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);

  // Return true if the instruction is an aarch64_sve_dup_x intrinsic call
  // with a unit (integer or FP one) splat value, false otherwise.
  auto IsUnitDupX = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
      return false;

    auto *SplatValue = IntrI->getOperand(0);
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  // Return true if the instruction is an aarch64_sve_dup intrinsic call
  // with a unit splat value, false otherwise.
  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
      return false;

    auto *SplatValue = IntrI->getOperand(2);
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  // Normalize so that OpMultiplier holds the dup (if any); multiplication
  // commutes so the swap preserves behavior.
  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
    std::swap(OpMultiplier, OpMultiplicand);

  if (IsUnitDupX(OpMultiplier)) {
    // [f]mul pg (dup_x 1) %n => %n
    OpMultiplicand->takeName(&II);
    return IC.replaceInstUsesWith(II, OpMultiplicand);
  } else if (IsUnitDup(OpMultiplier)) {
    // [f]mul pg (dup pg 1) %n => %n
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);

    // The dup's predicate must match the mul's predicate so inactive lanes
    // keep the same value either way.
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
      return IC.replaceInstUsesWith(II, OpMultiplicand);
    }
  }

  return None;
}
741 | |
// Simplify an SVE unpack of a splat: [s|u]unpk[hi|lo](splat(X)) is just a
// splat of X extended (sign- or zero-) to the wider element type.
static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
                                                    IntrinsicInst &II) {
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  Value *UnpackArg = II.getArgOperand(0);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  // sunpkhi/sunpklo sign-extend; uunpkhi/uunpklo zero-extend.
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;

  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
    ScalarArg =
        Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
    Value *NewVal =
        Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
    NewVal->takeName(&II);
    return IC.replaceInstUsesWith(II, NewVal);
  }

  return None;
}
// Simplify sve.tbl with splat indices: tbl(V, dup_x(C)) selects lane C of V
// for every result lane, i.e. splat(extractelement(V, C)).
static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
                                                 IntrinsicInst &II) {
  auto *OpVal = II.getOperand(0);
  auto *OpIndices = II.getOperand(1);
  VectorType *VTy = cast<VectorType>(II.getType());

  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with a
  // constant splat value smaller than the minimum element count of the
  // result (so the lane is guaranteed to exist).
  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
    return None;

  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
  if (!SplatValue ||
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return None;

  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
  // splat_vector(extractelement(OpVal, SplatValue)).
  IRBuilder<> Builder(II.getContext());
  Builder.SetInsertPoint(&II);
  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
  auto *VectorSplat =
      Builder.CreateVectorSplat(VTy->getElementCount(), Extract);

  VectorSplat->takeName(&II);
  return IC.replaceInstUsesWith(II, VectorSplat);
}
792 | |
/// Target hook: dispatch AArch64-specific instcombine simplifications for
/// the given intrinsic call. Returns None when no target fold applies, in
/// which case the generic combiner proceeds normally.
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  // cntd/cntw/cnth/cntb count 64/32/16/8-bit elements: 2/4/8/16 per
  // 128-bit block respectively.
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, 16);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  case Intrinsic::aarch64_sve_mul:
  case Intrinsic::aarch64_sve_fmul:
    return instCombineSVEVectorMul(IC, II);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  }

  return None;
}
838 | |
// Returns true when an Add/Sub whose second operand is a single-use
// sign/zero-extend has the shape that lets the extend be folded into the
// arithmetic instruction (widening-instruction patterns such as
// uaddl/saddw — TODO confirm exact instruction set against the AArch64
// ISel patterns).
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {

  // Build a vector type with DstTy's element count but the given scalar
  // element type; used to model the pre-extension source as a vector.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           cast<VectorType>(DstTy)->getElementCount());
  };

  // Only vector destinations with at least 16-bit elements can be widened.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Only additions and subtractions are candidates.
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
    break;
  default:
    return false;
  }

  // The second operand must itself be a single-use sign- or zero-extend;
  // multiple uses would force the extend to be materialised anyway.
  if (Args.size() != 2 ||
      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
      !Args[1]->hasOneUse())
    return false;
  auto *Extend = cast<CastInst>(Args[1]);

  // Legalize the destination type; it must stay a vector with the same
  // element width (i.e. legalization may split it but not change elements).
  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
    return false;

  // Same check for the (modelled) source vector type of the extend.
  auto *SrcTy = toVectorTy(Extend->getSrcTy());
  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Total legalized element counts (split count * elements per part).
  InstructionCost NumDstEls =
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
  InstructionCost NumSrcEls =
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Widening requires matching element counts and a destination element
  // exactly twice as wide as the source element.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
}
904 | |
// Cost model for cast instructions. Casts folded into a widening add/sub are
// free; otherwise a hand-tuned conversion table supplies exact costs for
// simple value types, and everything else defers to the base implementation.
InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // If the extend feeds a single widening add/sub, the extend itself may be
  // free (it is merged into the widening instruction).
  if (I && I->hasOneUse()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
      // The second operand's extend is the one that gets folded: free.
      if (I == SingleUser->getOperand(1))
        return 0;

      // Otherwise this cast is free if it is the same kind of extend from the
      // same source type as the (folded) second operand's extend.
      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
          return 0;
    }
  }

  // For non-throughput cost kinds, collapse to a binary free/not-free cost.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  // The table below only covers simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  static const TypeConversionCostTblEntry
  ConversionTbl[] = {
    // Fixed-width truncations.
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

    // Scalable (SVE) truncations, including truncation to predicates (i1).
    { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
    { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
    { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
    { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
    { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
    { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
    { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
    { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
    { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
    { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
    { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
    { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
    { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
    { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
    { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
    { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },

    // Fixed-width sign/zero extensions.
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Int -> FP, same element count and matching widths: single instruction.
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Int -> v2f32 from narrower/wider integer elements.
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Int -> v4f32 from narrower integer elements.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Int -> v8f32 from narrower integer elements.
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

    // Int -> v16f32.
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

    // Int -> v2f64 from narrower integer elements.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // FP -> int, matching widths: single instruction.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // v2f32 -> narrower/wider integer elements.
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },

    // v4f32 -> narrower integer elements.
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },

    // SVE nxv2f32 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },

    // v2f64 -> narrower integer elements.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },

    // SVE nxv2f64 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },

    // SVE nxv4f32 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
    { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
    { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
    { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
    { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },

    // SVE nxv8f64 -> integer (requires splitting).
    { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
    { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
    { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
    { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },

    // SVE nxv4f64 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
    { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
    { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
    { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
    { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
    { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },

    // SVE nxv8f32 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
    { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
    { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
    { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },

    // SVE nxv8f16 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
    { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
    { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
    { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
    { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },

    // SVE nxv4f16 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
    { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
    { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },

    // SVE nxv2f16 -> integer.
    { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
    { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
    { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },

    // SVE FP rounds from f32.
    { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
    { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
    { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },

    // SVE FP rounds from f64 to f16.
    { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
    { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
    { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },

    // SVE FP rounds from f64 to f32.
    { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
    { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
    { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },

    // SVE FP extends from f16 to f32.
    { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
    { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
    { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},

    // SVE FP extends from f16 to f64.
    { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
    { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
    { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},

    // SVE FP extends from f32 to f64.
    { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
    { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
    { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},

  };

  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                 DstTy.getSimpleVT(),
                                                 SrcTy.getSimpleVT()))
    return AdjustCost(Entry->Cost);

  // No table entry: defer to the generic cost model.
  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
1181 | |
// Cost of extracting a vector element and then sign/zero-extending it.
// The extension is often free because the extracting move instruction
// can perform it (smov for sign-extend, umov for zero-extend — TODO
// confirm against the AArch64 lane-move lowering).
InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
                                                         Type *Dst,
                                                         VectorType *VecTy,
                                                         unsigned Index) {

  // Only sign- and zero-extends are handled here.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // The source of the extend is the element type of the vector being
  // extracted from.
  auto *Src = VecTy->getElementType();

  // Integer extends require integer source and destination.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Start with the cost of the extract itself; the extend cost (if any)
  // is determined below.
  InstructionCost Cost =
      getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // The "free extend" reasoning only applies when the vector stays a vector
  // after legalization and the destination type is legal; otherwise fall
  // back to the generic cast cost.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  // A destination narrower than the source is not an extend we can fold;
  // charge the generic cast cost.
  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
    return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                   CostKind);

  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // Sign-extends are always free: the extracting move sign-extends.
  case Instruction::SExt:
    return Cost;

  // Zero-extends are free except when extending to i64 from an element
  // narrower than 32 bits (a 32-bit extract already zeroes the high half).
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // Couldn't get the extend for free: add the generic cast cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);
}
1242 | |
1243 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
1244 | TTI::TargetCostKind CostKind, |
1245 | const Instruction *I) { |
1246 | if (CostKind != TTI::TCK_RecipThroughput) |
1247 | return Opcode == Instruction::PHI ? 0 : 1; |
1248 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); |
1249 | |
1250 | return 0; |
1251 | } |
1252 | |
1253 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
1254 | unsigned Index) { |
1255 | assert(Val->isVectorTy() && "This must be a vector type"); |
1256 | |
1257 | if (Index != -1U) { |
1258 | |
1259 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); |
1260 | |
1261 | |
1262 | if (!LT.second.isVector()) |
1263 | return 0; |
1264 | |
1265 | |
1266 | unsigned Width = LT.second.getVectorNumElements(); |
1267 | Index = Index % Width; |
1268 | |
1269 | |
1270 | if (Index == 0) |
1271 | return 0; |
1272 | } |
1273 | |
1274 | |
1275 | return ST->getVectorInsertExtractBaseCost(); |
1276 | } |
1277 | |
// Cost model for arithmetic instructions. Divisions are priced as their
// lowering sequences; simple integer and FP ops get small fixed per-legal-
// vector costs; everything else defers to the base implementation.
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Only reciprocal throughput is modelled here; defer other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  // If the op matches a widening-instruction shape (extend folded into
  // add/sub), charge the subtarget's widening overhead on top of the cost
  // computed below.
  InstructionCost Cost = 0;
  if (isWideningInstruction(Ty, Opcode, Args))
    Cost += ST->getWideningBaseCost();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  case ISD::SDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // Signed division by a uniform power-of-two constant is priced as its
      // expansion sequence: Add + Sub + Select + AShr.
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                     Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      return Cost;
    }
    LLVM_FALLTHROUGH;
  case ISD::UDIV:
    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
      auto VT = TLI->getValueType(DL, Ty);
      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Division by a uniform constant where a multiply-high is available
        // is priced as a magic-number expansion: 2*Mul + 2*Add + 2*Shr + 1.
        InstructionCost MulCost = getArithmeticInstrCost(
            Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost AddCost = getArithmeticInstrCost(
            Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        InstructionCost ShrCost = getArithmeticInstrCost(
            Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
            TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
      }
    }

    // General division: base cost, plus scalarization overhead for vectors.
    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                          Opd2Info,
                                          Opd1PropInfo, Opd2PropInfo);
    if (Ty->isVectorTy()) {
      // Vector division is scalarized: add the element extract/insert
      // traffic...
      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);
      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
                                     Opd1Info, Opd2Info, Opd1PropInfo,
                                     Opd2PropInfo);

      // ...then double the whole estimate (deliberate: Cost becomes 2*Cost).
      Cost += Cost;
    }
    return Cost;

  case ISD::MUL:
    if (LT.second != MVT::v2i64)
      return (Cost + 1) * LT.first;
    // v2i64 multiply has no single instruction and is charged a flat 8 per
    // legalized vector.
    return LT.first * 8;
  case ISD::ADD:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // Simple integer ops: one instruction per legalized part.
    return (Cost + 1) * LT.first;

  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FNEG:
    // FP ops on hardware-supported types (anything but fp128) get a small
    // fixed cost; fp128 falls through to the (library-call) base cost.
    if (!Ty->getScalarType()->isFP128Ty())
      return (Cost + 2) * LT.first;

    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                                Opd2Info,
                                                Opd1PropInfo, Opd2PropInfo);
  }
}
1410 | |
1411 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, |
1412 | ScalarEvolution *SE, |
1413 | const SCEV *Ptr) { |
1414 | |
1415 | |
1416 | |
1417 | |
1418 | unsigned NumVectorInstToHideOverhead = 10; |
1419 | int MaxMergeDistance = 64; |
1420 | |
1421 | if (Ty->isVectorTy() && SE && |
1422 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) |
1423 | return NumVectorInstToHideOverhead; |
1424 | |
1425 | |
1426 | |
1427 | return 1; |
1428 | } |
1429 | |
1430 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
1431 | Type *CondTy, |
1432 | CmpInst::Predicate VecPred, |
1433 | TTI::TargetCostKind CostKind, |
1434 | const Instruction *I) { |
1435 | |
1436 | if (CostKind != TTI::TCK_RecipThroughput) |
1437 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1438 | I); |
1439 | |
1440 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1441 | |
1442 | |
1443 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { |
1444 | |
1445 | const int AmortizationCost = 20; |
1446 | |
1447 | |
1448 | |
1449 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
1450 | CmpInst::Predicate CurrentPred; |
1451 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), |
1452 | m_Value()))) |
1453 | VecPred = CurrentPred; |
1454 | } |
1455 | |
1456 | |
1457 | if (CmpInst::isIntPredicate(VecPred)) { |
1458 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
1459 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
1460 | MVT::v2i64}; |
1461 | auto LT = TLI->getTypeLegalizationCost(DL, ValTy); |
1462 | if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) |
1463 | return LT.first; |
1464 | } |
1465 | |
1466 | static const TypeConversionCostTblEntry |
1467 | VectorSelectTbl[] = { |
1468 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, |
1469 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, |
1470 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, |
1471 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, |
1472 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, |
1473 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } |
1474 | }; |
1475 | |
1476 | EVT SelCondTy = TLI->getValueType(DL, CondTy); |
1477 | EVT SelValTy = TLI->getValueType(DL, ValTy); |
1478 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
1479 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, |
1480 | SelCondTy.getSimpleVT(), |
1481 | SelValTy.getSimpleVT())) |
1482 | return Entry->Cost; |
1483 | } |
1484 | } |
1485 | |
1486 | |
1487 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
1488 | } |
1489 | |
1490 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
1491 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
1492 | TTI::MemCmpExpansionOptions Options; |
1493 | if (ST->requiresStrictAlign()) { |
1494 | |
1495 | |
1496 | return Options; |
1497 | } |
1498 | Options.AllowOverlappingLoads = true; |
1499 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
1500 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
1501 | |
1502 | |
1503 | |
1504 | Options.LoadSizes = {8, 4, 2, 1}; |
1505 | return Options; |
1506 | } |
1507 | |
// Cost of a masked load/store. NEON has no native masked memory ops, so
// fixed-width NEON types defer to the (scalarizing) base implementation;
// SVE types cost two instructions per legalized part.
InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) {
  if (useNeonVector(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  auto LT = TLI->getTypeLegalizationCost(DL, Src);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // <vscale x 1 x elt> vectors are not handled by the code generator yet;
  // report an invalid cost so the vectorizer never selects them.
  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

  return LT.first * 2;
}
1528 | |
// Cost of a gather/scatter: modelled as one per-element memory op for every
// (maximum) lane of each legalized part. NEON types defer to the base
// implementation since NEON has no gather/scatter.
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (useNeonVector(DataTy))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);
  auto *VT = cast<VectorType>(DataTy);
  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // <vscale x 1 x elt> vectors are not handled by the code generator yet;
  // report an invalid cost so the vectorizer never selects them.
  if (cast<VectorType>(DataTy)->getElementCount() ==
      ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

  ElementCount LegalVF = LT.second.getVectorElementCount();
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
  // NOTE(review): `I` is a nullable parameter in the TTI interface, yet it is
  // dereferenced unconditionally here (I->getFunction()) — confirm all
  // callers pass a non-null instruction, or guard before dereferencing.
  return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction());
}
1553 | |
1554 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
1555 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); |
1556 | } |
1557 | |
// Cost model for plain loads and stores.
InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Ty, true);
  // Types with no corresponding EVT (e.g. aggregates) go to the base model.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // <vscale x 1 x elt> vectors are not handled by the code generator yet;
  // report an invalid cost so the vectorizer never selects them.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  // Size-based cost kinds: one per legalized part.
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return LT.first;

  // Remaining non-throughput kinds (latency): flat cost of one.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // On subtargets where misaligned 128-bit stores are slow, make them
    // expensive enough that vectorization only happens when enough other
    // instructions are vectorized to amortize the penalty.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  // Truncating stores / extending loads on NEON: the element width changes
  // during legalization, which implies per-element fixup work.
  if (useNeonVector(Ty) &&
      Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
    // v4i8 gets special, cheaper lowering (two instructions).
    if (VT == MVT::v4i8)
      return 2;
    // Otherwise assume full scalarization: two instructions per element.
    return cast<FixedVectorType>(Ty)->getNumElements() * 2;
  }

  return LT.first;
}
1612 | |
// Cost of an interleaved memory access group. When the group can be lowered
// to ldN/stN instructions, the cost is the number of such instructions;
// otherwise defer to the generic (scalarizing) model.
InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<FixedVectorType>(VecTy);

  // ldN/stN cannot express masking, and the factor must be supported.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned NumElts = VecVTy->getNumElements();
    // Each ldN/stN operates on one de-interleaved sub-vector.
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // The element count must split evenly and the sub-vector must be a type
    // the interleaved-access lowering accepts; a wide sub-vector may need
    // several ldN/stN instructions.
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
1638 | |
1639 | InstructionCost |
1640 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { |
1641 | InstructionCost Cost = 0; |
1642 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1643 | for (auto *I : Tys) { |
1644 | if (!I->isVectorTy()) |
1645 | continue; |
1646 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == |
1647 | 128) |
1648 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + |
1649 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); |
1650 | } |
1651 | return Cost; |
1652 | } |
1653 | |
// Maximum interleave (unroll) factor for the vectorizer; delegated entirely
// to the subtarget — the requested VF is not consulted here.
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return ST->getMaxInterleaveFactor();
}
1657 | |
1658 | |
1659 | |
1660 | |
1661 | |
// Falkor-specific unrolling tweak: cap the unroll count so the number of
// strided loads after unrolling stays within what the hardware prefetcher
// can track (assumption: MaxStridedLoads reflects the HW prefetcher budget —
// TODO confirm against Falkor documentation).
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };
  // Count loads in the loop whose address is a non-invariant affine AddRec,
  // i.e. loads that stride through memory each iteration.
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
        if (!LMemI)
          continue;

        // Loop-invariant addresses do not stride.
        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(PtrValue))
          continue;

        // Only affine recurrences count as strided accesses.
        const SCEV *LSCEV = SE.getSCEV(PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        ++StridedLoads;

        // Beyond MaxStridedLoads/2 the computed MaxCount is already 1, so
        // counting further cannot change the outcome: stop early.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");

  // Cap MaxCount to the largest power of two that keeps the unrolled strided
  // load count within budget. (StridedLoads == 0 leaves MaxCount untouched.)
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}
1709 | |
// AArch64 loop-unrolling preferences.
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Start from the generic defaults.
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  UP.UpperBound = true;

  // Inner loops of a nest are likelier to be hot, and runtime checks can be
  // hoisted by LICM, so allow a larger partial-unroll threshold for them.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Never partially unroll when optimizing for size.
  UP.PartialOptSizeThreshold = 0;

  // Falkor needs its strided-load-aware cap (see
  // getFalkorUnrollingPreferences).
  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);

  // Scan the loop body: bail out (keeping the defaults set so far) for loops
  // containing vector instructions or real calls.
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Vectorized loops gain little from further unrolling.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          // Intrinsic-like callees that never become real calls are fine.
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }
    }
  }

  // On known in-order cores, enable aggressive runtime/partial unrolling
  // (and unroll-and-jam) — in-order pipelines benefit most from the extra
  // ILP exposed by unrolling.
  if (ST->getProcFamily() != AArch64Subtarget::Others &&
      !ST->getSchedModel().isOutOfOrder()) {
    UP.Runtime = true;
    UP.Partial = true;
    UP.UnrollRemainder = true;
    UP.DefaultUnrollRuntimeCount = 4;

    UP.UnrollAndJam = true;
    UP.UnrollAndJamInnerLoopThreshold = 60;
  }
}
1765 | |
// Peeling preferences: AArch64 currently has no target-specific tweaks, so
// this simply delegates to the generic implementation.
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
1770 | |
1771 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
1772 | Type *ExpectedType) { |
1773 | switch (Inst->getIntrinsicID()) { |
1774 | default: |
1775 | return nullptr; |
1776 | case Intrinsic::aarch64_neon_st2: |
1777 | case Intrinsic::aarch64_neon_st3: |
1778 | case Intrinsic::aarch64_neon_st4: { |
1779 | |
1780 | StructType *ST = dyn_cast<StructType>(ExpectedType); |
1781 | if (!ST) |
1782 | return nullptr; |
1783 | unsigned NumElts = Inst->getNumArgOperands() - 1; |
1784 | if (ST->getNumElements() != NumElts) |
1785 | return nullptr; |
1786 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
1787 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) |
1788 | return nullptr; |
1789 | } |
1790 | Value *Res = UndefValue::get(ExpectedType); |
1791 | IRBuilder<> Builder(Inst); |
1792 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
1793 | Value *L = Inst->getArgOperand(i); |
1794 | Res = Builder.CreateInsertValue(Res, L, i); |
1795 | } |
1796 | return Res; |
1797 | } |
1798 | case Intrinsic::aarch64_neon_ld2: |
1799 | case Intrinsic::aarch64_neon_ld3: |
1800 | case Intrinsic::aarch64_neon_ld4: |
1801 | if (Inst->getType() == ExpectedType) |
1802 | return Inst; |
1803 | return nullptr; |
1804 | } |
1805 | } |
1806 | |
1807 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
1808 | MemIntrinsicInfo &Info) { |
1809 | switch (Inst->getIntrinsicID()) { |
1810 | default: |
1811 | break; |
1812 | case Intrinsic::aarch64_neon_ld2: |
1813 | case Intrinsic::aarch64_neon_ld3: |
1814 | case Intrinsic::aarch64_neon_ld4: |
1815 | Info.ReadMem = true; |
1816 | Info.WriteMem = false; |
1817 | Info.PtrVal = Inst->getArgOperand(0); |
1818 | break; |
1819 | case Intrinsic::aarch64_neon_st2: |
1820 | case Intrinsic::aarch64_neon_st3: |
1821 | case Intrinsic::aarch64_neon_st4: |
1822 | Info.ReadMem = false; |
1823 | Info.WriteMem = true; |
1824 | Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); |
1825 | break; |
1826 | } |
1827 | |
1828 | switch (Inst->getIntrinsicID()) { |
1829 | default: |
1830 | return false; |
1831 | case Intrinsic::aarch64_neon_ld2: |
1832 | case Intrinsic::aarch64_neon_st2: |
1833 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
1834 | break; |
1835 | case Intrinsic::aarch64_neon_ld3: |
1836 | case Intrinsic::aarch64_neon_st3: |
1837 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
1838 | break; |
1839 | case Intrinsic::aarch64_neon_ld4: |
1840 | case Intrinsic::aarch64_neon_st4: |
1841 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
1842 | break; |
1843 | } |
1844 | return true; |
1845 | } |
1846 | |
1847 | |
1848 | |
1849 | |
1850 | |
1851 | |
1852 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
1853 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { |
1854 | bool Considerable = false; |
1855 | AllowPromotionWithoutCommonHeader = false; |
1856 | if (!isa<SExtInst>(&I)) |
1857 | return false; |
1858 | Type *ConsideredSExtType = |
1859 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); |
1860 | if (I.getType() != ConsideredSExtType) |
1861 | return false; |
1862 | |
1863 | |
1864 | for (const User *U : I.users()) { |
1865 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { |
1866 | Considerable = true; |
1867 | |
1868 | |
1869 | |
1870 | if (GEPInst->getNumOperands() > 2) { |
1871 | AllowPromotionWithoutCommonHeader = true; |
1872 | break; |
1873 | } |
1874 | } |
1875 | } |
1876 | return Considerable; |
1877 | } |
1878 | |
// Decide whether a reduction described by RdxDesc can be vectorized at the
// given (possibly scalable) vectorization factor.
bool AArch64TTIImpl::isLegalToVectorizeReduction(
    const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
  // Fixed-width reductions are always accepted here.
  if (!VF.isScalable())
    return true;

  // For scalable VFs, the element type must be legal for SVE; bfloat is
  // explicitly excluded.
  Type *Ty = RdxDesc.getRecurrenceType();
  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
    return false;

  // Only this set of recurrence kinds is supported with scalable vectors;
  // anything else (e.g. multiply-based reductions) is rejected.
  switch (RdxDesc.getRecurrenceKind()) {
  case RecurKind::Add:
  case RecurKind::FAdd:
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::Xor:
  case RecurKind::SMin:
  case RecurKind::SMax:
  case RecurKind::UMin:
  case RecurKind::UMax:
  case RecurKind::FMin:
  case RecurKind::FMax:
    return true;
  default:
    return false;
  }
}
1905 | |
1906 | InstructionCost |
1907 | AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, |
1908 | bool IsUnsigned, |
1909 | TTI::TargetCostKind CostKind) { |
1910 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
1911 | |
1912 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
1913 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); |
1914 | |
1915 | assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && |
1916 | "Both vector needs to be equally scalable"); |
1917 | |
1918 | InstructionCost LegalizationCost = 0; |
1919 | if (LT.first > 1) { |
1920 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); |
1921 | unsigned MinMaxOpcode = |
1922 | Ty->isFPOrFPVectorTy() |
1923 | ? Intrinsic::maxnum |
1924 | : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); |
1925 | IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); |
1926 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); |
1927 | } |
1928 | |
1929 | return LegalizationCost + 2; |
1930 | } |
1931 | |
1932 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
1933 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { |
1934 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); |
1935 | InstructionCost LegalizationCost = 0; |
1936 | if (LT.first > 1) { |
1937 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); |
1938 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); |
1939 | LegalizationCost *= LT.first - 1; |
1940 | } |
1941 | |
1942 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1943 | assert(ISD && "Invalid opcode"); |
1944 | |
1945 | switch (ISD) { |
1946 | case ISD::ADD: |
1947 | case ISD::AND: |
1948 | case ISD::OR: |
1949 | case ISD::XOR: |
1950 | case ISD::FADD: |
1951 | return LegalizationCost + 2; |
1952 | default: |
1953 | return InstructionCost::getInvalid(); |
1954 | } |
1955 | } |
1956 | |
// Cost of an arithmetic vector reduction. Handles strictly-ordered
// reductions, scalable (SVE) types, and a table-driven fast path for
// unordered integer reductions on NEON types.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           Optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF)) {
    if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
      InstructionCost BaseCost =
          BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
      // Strictly-ordered fixed-width reduction: base cost plus one unit per
      // element for the serialized chain.
      return BaseCost + FixedVTy->getNumElements();
    }

    // Ordered reductions on scalable vectors are only supported for FAdd.
    if (Opcode != Instruction::FAdd)
      return InstructionCost::getInvalid();

    // Model the ordered scalable FAdd as one scalar op per element, using
    // the maximum possible element count for the given VF.
    auto *VTy = cast<ScalableVectorType>(ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VTy->getElementCount());
    return Cost;
  }

  // Unordered scalable reductions are priced by the SVE-specific helper.
  if (isa<ScalableVectorType>(ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Per-(opcode, type) costs for unordered reductions on NEON vectors.
  // NOTE(review): the values look empirically derived for this target —
  // treat them as tuning data, not as a formula.
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8, 2},
      {ISD::ADD, MVT::v16i8, 2},
      {ISD::ADD, MVT::v4i16, 2},
      {ISD::ADD, MVT::v8i16, 2},
      {ISD::ADD, MVT::v4i32, 2},
      {ISD::OR, MVT::v8i8, 15},
      {ISD::OR, MVT::v16i8, 17},
      {ISD::OR, MVT::v4i16, 7},
      {ISD::OR, MVT::v8i16, 9},
      {ISD::OR, MVT::v2i32, 3},
      {ISD::OR, MVT::v4i32, 5},
      {ISD::OR, MVT::v2i64, 3},
      {ISD::XOR, MVT::v8i8, 15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16, 7},
      {ISD::XOR, MVT::v8i16, 9},
      {ISD::XOR, MVT::v2i32, 3},
      {ISD::XOR, MVT::v4i32, 5},
      {ISD::XOR, MVT::v2i64, 3},
      {ISD::AND, MVT::v8i8, 15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16, 7},
      {ISD::AND, MVT::v8i16, 9},
      {ISD::AND, MVT::v2i32, 3},
      {ISD::AND, MVT::v4i32, 5},
      {ISD::AND, MVT::v2i64, 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    // One extra combine per additional legalized part, plus the table cost.
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    // The table is only used for non-i1 elements with a power-of-two
    // element count that does not grow under legalization.
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    if (!ValVTy->getElementType()->isIntegerTy(1) &&
        MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type was split: combine the parts with vector ops first, one per
        // additional part.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      return Entry->Cost + ExtraCost;
    }
    break;
  }
  // Anything not covered above falls back to the generic model.
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
2056 | |
// Cost of a vector-splice shuffle on scalable types. Predicate (i1) vectors
// are priced via their promoted integer type, and a negative Index adds the
// cost of materializing the splice mask with a compare+select.
InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  // Every legal SVE type has a unit splice cost in this table.
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8,  1 },
      { TTI::SK_Splice, MVT::nxv8i16,  1 },
      { TTI::SK_Splice, MVT::nxv4i32,  1 },
      { TTI::SK_Splice, MVT::nxv2i64,  1 },
      { TTI::SK_Splice, MVT::nxv2f16,  1 },
      { TTI::SK_Splice, MVT::nxv4f16,  1 },
      { TTI::SK_Splice, MVT::nxv8f16,  1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32,  1 },
      { TTI::SK_Splice, MVT::nxv4f32,  1 },
      { TTI::SK_Splice, MVT::nxv2f64,  1 },
  };

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // i1 vectors are costed through the promoted predicate type.
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
  if (Index < 0) {
    // A negative index needs an icmp+select pair to build the splice mask.
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicate vectors additionally pay for the zext to the promoted type
  // and the trunc back.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  // Scale by the number of legalized parts.
  return LegalizationCost * LT.first;
}
2105 | |
// Shuffle cost for AArch64: table-driven for the common shuffle kinds on
// both NEON and SVE types, with splice delegated to getSpliceCost and
// everything else to the generic model.
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp) {
  // The mask may identify a cheaper, more specific kind than the caller's.
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse) {
    static const CostTblEntry ShuffleTbl[] = {
      // Broadcast costs (fixed-width NEON types).
      { TTI::SK_Broadcast, MVT::v8i8,  1 },
      { TTI::SK_Broadcast, MVT::v16i8, 1 },
      { TTI::SK_Broadcast, MVT::v4i16, 1 },
      { TTI::SK_Broadcast, MVT::v8i16, 1 },
      { TTI::SK_Broadcast, MVT::v2i32, 1 },
      { TTI::SK_Broadcast, MVT::v4i32, 1 },
      { TTI::SK_Broadcast, MVT::v2i64, 1 },
      { TTI::SK_Broadcast, MVT::v2f32, 1 },
      { TTI::SK_Broadcast, MVT::v4f32, 1 },
      { TTI::SK_Broadcast, MVT::v2f64, 1 },

      // Transpose costs.
      { TTI::SK_Transpose, MVT::v8i8,  1 },
      { TTI::SK_Transpose, MVT::v16i8, 1 },
      { TTI::SK_Transpose, MVT::v4i16, 1 },
      { TTI::SK_Transpose, MVT::v8i16, 1 },
      { TTI::SK_Transpose, MVT::v2i32, 1 },
      { TTI::SK_Transpose, MVT::v4i32, 1 },
      { TTI::SK_Transpose, MVT::v2i64, 1 },
      { TTI::SK_Transpose, MVT::v2f32, 1 },
      { TTI::SK_Transpose, MVT::v4f32, 1 },
      { TTI::SK_Transpose, MVT::v2f64, 1 },

      // Select (blend) costs.
      { TTI::SK_Select, MVT::v2i32, 1 },
      { TTI::SK_Select, MVT::v4i32, 2 },
      { TTI::SK_Select, MVT::v2i64, 1 },
      { TTI::SK_Select, MVT::v2f32, 1 },
      { TTI::SK_Select, MVT::v4f32, 2 },
      { TTI::SK_Select, MVT::v2f64, 1 },

      // Single-source permute costs.
      { TTI::SK_PermuteSingleSrc, MVT::v2i32,  1 },
      { TTI::SK_PermuteSingleSrc, MVT::v4i32,  3 },
      { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 },
      { TTI::SK_PermuteSingleSrc, MVT::v2f32,  1 },
      { TTI::SK_PermuteSingleSrc, MVT::v4f32,  3 },
      { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 },
      { TTI::SK_PermuteSingleSrc, MVT::v4i16,  3 },
      { TTI::SK_PermuteSingleSrc, MVT::v4f16,  3 },
      { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 },
      { TTI::SK_PermuteSingleSrc, MVT::v8i16,  8 },
      { TTI::SK_PermuteSingleSrc, MVT::v8f16,  8 },
      { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 },
      { TTI::SK_PermuteSingleSrc, MVT::v8i8,   8 },
      { TTI::SK_PermuteSingleSrc, MVT::v16i8,  8 },

      // Reverse costs (fixed-width types).
      { TTI::SK_Reverse, MVT::v2i32, 1 },
      { TTI::SK_Reverse, MVT::v4i32, 2 },
      { TTI::SK_Reverse, MVT::v2i64, 1 },
      { TTI::SK_Reverse, MVT::v2f32, 1 },
      { TTI::SK_Reverse, MVT::v4f32, 2 },
      { TTI::SK_Reverse, MVT::v2f64, 1 },

      // Broadcast costs for scalable (SVE) types, including predicates.
      { TTI::SK_Broadcast, MVT::nxv16i8,  1 },
      { TTI::SK_Broadcast, MVT::nxv8i16,  1 },
      { TTI::SK_Broadcast, MVT::nxv4i32,  1 },
      { TTI::SK_Broadcast, MVT::nxv2i64,  1 },
      { TTI::SK_Broadcast, MVT::nxv2f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv4f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv8f16,  1 },
      { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
      { TTI::SK_Broadcast, MVT::nxv2f32,  1 },
      { TTI::SK_Broadcast, MVT::nxv4f32,  1 },
      { TTI::SK_Broadcast, MVT::nxv2f64,  1 },
      { TTI::SK_Broadcast, MVT::nxv16i1,  1 },
      { TTI::SK_Broadcast, MVT::nxv8i1,   1 },
      { TTI::SK_Broadcast, MVT::nxv4i1,   1 },
      { TTI::SK_Broadcast, MVT::nxv2i1,   1 },

      // Reverse costs for scalable (SVE) types, including predicates.
      { TTI::SK_Reverse, MVT::nxv16i8,  1 },
      { TTI::SK_Reverse, MVT::nxv8i16,  1 },
      { TTI::SK_Reverse, MVT::nxv4i32,  1 },
      { TTI::SK_Reverse, MVT::nxv2i64,  1 },
      { TTI::SK_Reverse, MVT::nxv2f16,  1 },
      { TTI::SK_Reverse, MVT::nxv4f16,  1 },
      { TTI::SK_Reverse, MVT::nxv8f16,  1 },
      { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
      { TTI::SK_Reverse, MVT::nxv2f32,  1 },
      { TTI::SK_Reverse, MVT::nxv4f32,  1 },
      { TTI::SK_Reverse, MVT::nxv2f64,  1 },
      { TTI::SK_Reverse, MVT::nxv16i1,  1 },
      { TTI::SK_Reverse, MVT::nxv8i1,   1 },
      { TTI::SK_Reverse, MVT::nxv4i1,   1 },
      { TTI::SK_Reverse, MVT::nxv2i1,   1 },
    };
    // Table cost applies per legalized part.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }
  // Scalable splices have their own dedicated costing.
  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);
  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}