File: build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 2741, column 21: The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
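The flagged line (2741, column 21) is outside the excerpt shown below. As a hedged illustration only, the defect class the analyzer reports looks like the hypothetical sketch that follows: shifting a 32-bit int by a value greater than or equal to its width (for example a sentinel index such as ~0U, i.e. 4294967295) is undefined behaviour, and the usual fix is to guard the shift amount before shifting.

// Hypothetical sketch of the reported defect class; this is NOT the code at
// line 2741, which is not part of this excerpt.
#include <cassert>
#include <cstdint>

static uint64_t maskForIndex(unsigned Idx) {
  // Shifting by Idx >= 64 (e.g. a sentinel ~0U) would be undefined behaviour,
  // so reject such values before performing the shift.
  assert(Idx < 64 && "shift amount must be smaller than the operand width");
  return uint64_t(1) << Idx;
}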
1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | ||||
9 | #include "AArch64TargetTransformInfo.h" | |||
10 | #include "AArch64ExpandImm.h" | |||
11 | #include "AArch64PerfectShuffle.h" | |||
12 | #include "MCTargetDesc/AArch64AddressingModes.h" | |||
13 | #include "llvm/Analysis/IVDescriptors.h" | |||
14 | #include "llvm/Analysis/LoopInfo.h" | |||
15 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
16 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
17 | #include "llvm/CodeGen/CostTable.h" | |||
18 | #include "llvm/CodeGen/TargetLowering.h" | |||
19 | #include "llvm/IR/IntrinsicInst.h" | |||
20 | #include "llvm/IR/Intrinsics.h" | |||
21 | #include "llvm/IR/IntrinsicsAArch64.h" | |||
22 | #include "llvm/IR/PatternMatch.h" | |||
23 | #include "llvm/Support/Debug.h" | |||
24 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | |||
25 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" | |||
26 | #include <algorithm> | |||
27 | #include <optional> | |||
28 | using namespace llvm; | |||
29 | using namespace llvm::PatternMatch; | |||
30 | ||||
31 | #define DEBUG_TYPE "aarch64tti"
32 | ||||
33 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", | |||
34 | cl::init(true), cl::Hidden); | |||
35 | ||||
36 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), | |||
37 | cl::Hidden); | |||
38 | ||||
39 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", | |||
40 | cl::init(10), cl::Hidden); | |||
41 | ||||
42 | namespace { | |||
43 | class TailFoldingKind { | |||
44 | private: | |||
45 | uint8_t Bits = 0; // Currently defaults to disabled. | |||
46 | ||||
47 | public: | |||
48 | enum TailFoldingOpts { | |||
49 | TFDisabled = 0x0, | |||
50 | TFReductions = 0x01, | |||
51 | TFRecurrences = 0x02, | |||
52 | TFReverse = 0x04, | |||
53 | TFSimple = 0x80, | |||
54 | TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple | |||
55 | }; | |||
56 | ||||
57 | void operator=(const std::string &Val) { | |||
58 | if (Val.empty()) | |||
59 | return; | |||
60 | SmallVector<StringRef, 6> TailFoldTypes; | |||
61 | StringRef(Val).split(TailFoldTypes, '+', -1, false); | |||
62 | for (auto TailFoldType : TailFoldTypes) { | |||
63 | if (TailFoldType == "disabled") | |||
64 | Bits = 0; | |||
65 | else if (TailFoldType == "all") | |||
66 | Bits = TFAll; | |||
67 | else if (TailFoldType == "default") | |||
68 | Bits = 0; // Currently defaults to never tail-folding. | |||
69 | else if (TailFoldType == "simple") | |||
70 | add(TFSimple); | |||
71 | else if (TailFoldType == "reductions") | |||
72 | add(TFReductions); | |||
73 | else if (TailFoldType == "recurrences") | |||
74 | add(TFRecurrences); | |||
75 | else if (TailFoldType == "reverse") | |||
76 | add(TFReverse); | |||
77 | else if (TailFoldType == "noreductions") | |||
78 | remove(TFReductions); | |||
79 | else if (TailFoldType == "norecurrences") | |||
80 | remove(TFRecurrences); | |||
81 | else if (TailFoldType == "noreverse") | |||
82 | remove(TFReverse); | |||
83 | else { | |||
84 | errs() | |||
85 | << "invalid argument " << TailFoldType.str() | |||
86 | << " to -sve-tail-folding=; each element must be one of: disabled, " | |||
87 | "all, default, simple, reductions, noreductions, recurrences, " | |||
88 | "norecurrences\n"; | |||
89 | } | |||
90 | } | |||
91 | } | |||
92 | ||||
93 | operator uint8_t() const { return Bits; } | |||
94 | ||||
95 | void add(uint8_t Flag) { Bits |= Flag; } | |||
96 | void remove(uint8_t Flag) { Bits &= ~Flag; } | |||
97 | }; | |||
98 | } // namespace | |||
99 | ||||
100 | TailFoldingKind TailFoldingKindLoc; | |||
101 | ||||
102 | cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( | |||
103 | "sve-tail-folding", | |||
104 | cl::desc( | |||
105 | "Control the use of vectorisation using tail-folding for SVE:" | |||
106 | "\ndisabled No loop types will vectorize using tail-folding" | |||
107 | "\ndefault Uses the default tail-folding settings for the target " | |||
108 | "CPU" | |||
109 | "\nall All legal loop types will vectorize using tail-folding" | |||
110 | "\nsimple Use tail-folding for simple loops (not reductions or " | |||
111 | "recurrences)" | |||
112 | "\nreductions Use tail-folding for loops containing reductions" | |||
113 | "\nrecurrences Use tail-folding for loops containing fixed order " | |||
114 | "recurrences" | |||
115 | "\nreverse Use tail-folding for loops requiring reversed " | |||
116 | "predicates"), | |||
117 | cl::location(TailFoldingKindLoc)); | |||
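As a standalone sketch (not part of the file above), the option string is split on '+' and folded left-to-right into the TailFoldingKind bitmask; for example "all+noreductions" first sets every flag and then clears TFReductions. The flag values below mirror the enum defined earlier.

// Standalone sketch of how "-sve-tail-folding=all+noreductions" resolves,
// using the flag values from the TailFoldingKind enum above.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t TFReductions = 0x01, TFRecurrences = 0x02, TFReverse = 0x04,
                TFSimple = 0x80;
  const uint8_t TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple;

  uint8_t Bits = 0;
  Bits = TFAll;                   // "all"
  Bits &= uint8_t(~TFReductions); // "noreductions"
  assert(Bits == (TFRecurrences | TFReverse | TFSimple));
  return 0;
}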
118 | ||||
119 | // Experimental option that will only be fully functional when the | |||
120 | // code-generator is changed to use SVE instead of NEON for all fixed-width | |||
121 | // operations. | |||
122 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( | |||
123 | "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden); | |||
124 | ||||
125 | // Experimental option that will only be fully functional when the cost-model | |||
126 | // and code-generator have been changed to avoid using scalable vector | |||
127 | // instructions that are not legal in streaming SVE mode. | |||
128 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( | |||
129 | "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); | |||
130 | ||||
131 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, | |||
132 | const Function *Callee) const { | |||
133 | SMEAttrs CallerAttrs(*Caller); | |||
134 | SMEAttrs CalleeAttrs(*Callee); | |||
135 | if (CallerAttrs.requiresSMChange(CalleeAttrs, | |||
136 | /*BodyOverridesInterface=*/true) || | |||
137 | CallerAttrs.requiresLazySave(CalleeAttrs) || | |||
138 | CalleeAttrs.hasNewZAInterface()) | |||
139 | return false; | |||
140 | ||||
141 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
142 | ||||
143 | const FeatureBitset &CallerBits = | |||
144 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
145 | const FeatureBitset &CalleeBits = | |||
146 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
147 | ||||
148 | // Inline a callee if its target-features are a subset of the callers | |||
149 | // target-features. | |||
150 | return (CallerBits & CalleeBits) == CalleeBits; | |||
151 | } | |||
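A minimal sketch of the subset test used in the return statement above, with std::bitset standing in for llvm::FeatureBitset: inlining is allowed only when every target feature the callee needs is also enabled in the caller.

// Sketch only: std::bitset stands in for llvm::FeatureBitset, and the feature
// positions are made up for illustration.
#include <bitset>
#include <cassert>

int main() {
  std::bitset<8> CallerBits("00001111");   // caller enables features 0-3
  std::bitset<8> SubsetCallee("00000101"); // needs features 0 and 2 only
  std::bitset<8> ExtraCallee("00010001");  // needs feature 4, which caller lacks

  assert((CallerBits & SubsetCallee) == SubsetCallee); // inlinable
  assert((CallerBits & ExtraCallee) != ExtraCallee);   // not inlinable
  return 0;
}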
152 | ||||
153 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( | |||
154 | TargetTransformInfo::RegisterKind K) const { | |||
155 | assert(K != TargetTransformInfo::RGK_Scalar);
156 | return K == TargetTransformInfo::RGK_FixedWidthVector; | |||
157 | } | |||
158 | ||||
159 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
160 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
161 | /// is valid to return a cost of ZERO. | |||
162 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { | |||
163 | // Check if the immediate can be encoded within an instruction. | |||
164 | if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) | |||
165 | return 0; | |||
166 | ||||
167 | if (Val < 0) | |||
168 | Val = ~Val; | |||
169 | ||||
170 | // Calculate how many moves we will need to materialize this constant. | |||
171 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; | |||
172 | AArch64_IMM::expandMOVImm(Val, 64, Insn); | |||
173 | return Insn.size(); | |||
174 | } | |||
175 | ||||
176 | /// Calculate the cost of materializing the given constant. | |||
177 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
178 | TTI::TargetCostKind CostKind) { | |||
179 | assert(Ty->isIntegerTy());
180 | ||||
181 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
182 | if (BitSize == 0) | |||
183 | return ~0U; | |||
184 | ||||
185 | // Sign-extend all constants to a multiple of 64-bit. | |||
186 | APInt ImmVal = Imm; | |||
187 | if (BitSize & 0x3f) | |||
188 | ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); | |||
189 | ||||
190 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
191 | // chunk. | |||
192 | InstructionCost Cost = 0; | |||
193 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
194 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
195 | int64_t Val = Tmp.getSExtValue(); | |||
196 | Cost += getIntImmCost(Val); | |||
197 | } | |||
198 | // We need at least one instruction to materialize the constant.
199 | return std::max<InstructionCost>(1, Cost); | |||
200 | } | |||
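A worked sketch of the chunking loop above (values are illustrative, not taken from the file): a 128-bit immediate is sign-extended to a multiple of 64 bits and then costed one 64-bit chunk at a time, so the total is the sum of at most two getIntImmCost calls, with a floor of one instruction.

// Illustrative only: shows how a 128-bit APInt is walked in 64-bit chunks,
// mirroring the loop in getIntImmCost(const APInt &, ...) above.
#include "llvm/ADT/APInt.h"

static void walkChunks() {
  llvm::APInt Imm(128, 0x12345678u); // low half set to an arbitrary value
  Imm.setBit(127);                   // and one bit in the high half
  for (unsigned ShiftVal = 0; ShiftVal < 128; ShiftVal += 64) {
    int64_t Chunk = Imm.ashr(ShiftVal).sextOrTrunc(64).getSExtValue();
    (void)Chunk; // each chunk would be passed to getIntImmCost(Chunk)
  }
}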
201 | ||||
202 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
203 | const APInt &Imm, Type *Ty, | |||
204 | TTI::TargetCostKind CostKind, | |||
205 | Instruction *Inst) { | |||
206 | assert(Ty->isIntegerTy());
207 | ||||
208 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
209 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
210 | // here, so that constant hoisting will ignore this constant. | |||
211 | if (BitSize == 0) | |||
212 | return TTI::TCC_Free; | |||
213 | ||||
214 | unsigned ImmIdx = ~0U; | |||
215 | switch (Opcode) { | |||
216 | default: | |||
217 | return TTI::TCC_Free; | |||
218 | case Instruction::GetElementPtr: | |||
219 | // Always hoist the base address of a GetElementPtr. | |||
220 | if (Idx == 0) | |||
221 | return 2 * TTI::TCC_Basic; | |||
222 | return TTI::TCC_Free; | |||
223 | case Instruction::Store: | |||
224 | ImmIdx = 0; | |||
225 | break; | |||
226 | case Instruction::Add: | |||
227 | case Instruction::Sub: | |||
228 | case Instruction::Mul: | |||
229 | case Instruction::UDiv: | |||
230 | case Instruction::SDiv: | |||
231 | case Instruction::URem: | |||
232 | case Instruction::SRem: | |||
233 | case Instruction::And: | |||
234 | case Instruction::Or: | |||
235 | case Instruction::Xor: | |||
236 | case Instruction::ICmp: | |||
237 | ImmIdx = 1; | |||
238 | break; | |||
239 | // Always return TCC_Free for the shift value of a shift instruction. | |||
240 | case Instruction::Shl: | |||
241 | case Instruction::LShr: | |||
242 | case Instruction::AShr: | |||
243 | if (Idx == 1) | |||
244 | return TTI::TCC_Free; | |||
245 | break; | |||
246 | case Instruction::Trunc: | |||
247 | case Instruction::ZExt: | |||
248 | case Instruction::SExt: | |||
249 | case Instruction::IntToPtr: | |||
250 | case Instruction::PtrToInt: | |||
251 | case Instruction::BitCast: | |||
252 | case Instruction::PHI: | |||
253 | case Instruction::Call: | |||
254 | case Instruction::Select: | |||
255 | case Instruction::Ret: | |||
256 | case Instruction::Load: | |||
257 | break; | |||
258 | } | |||
259 | ||||
260 | if (Idx == ImmIdx) { | |||
261 | int NumConstants = (BitSize + 63) / 64; | |||
262 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
263 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
264 | ? static_cast<int>(TTI::TCC_Free) | |||
265 | : Cost; | |||
266 | } | |||
267 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
268 | } | |||
269 | ||||
270 | InstructionCost | |||
271 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
272 | const APInt &Imm, Type *Ty, | |||
273 | TTI::TargetCostKind CostKind) { | |||
274 | assert(Ty->isIntegerTy());
275 | ||||
276 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
277 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
278 | // here, so that constant hoisting will ignore this constant. | |||
279 | if (BitSize == 0) | |||
280 | return TTI::TCC_Free; | |||
281 | ||||
282 | // Most (all?) AArch64 intrinsics do not support folding immediates into the | |||
283 | // selected instruction, so we compute the materialization cost for the | |||
284 | // immediate directly. | |||
285 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) | |||
286 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
287 | ||||
288 | switch (IID) { | |||
289 | default: | |||
290 | return TTI::TCC_Free; | |||
291 | case Intrinsic::sadd_with_overflow: | |||
292 | case Intrinsic::uadd_with_overflow: | |||
293 | case Intrinsic::ssub_with_overflow: | |||
294 | case Intrinsic::usub_with_overflow: | |||
295 | case Intrinsic::smul_with_overflow: | |||
296 | case Intrinsic::umul_with_overflow: | |||
297 | if (Idx == 1) { | |||
298 | int NumConstants = (BitSize + 63) / 64; | |||
299 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
300 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
301 | ? static_cast<int>(TTI::TCC_Free) | |||
302 | : Cost; | |||
303 | } | |||
304 | break; | |||
305 | case Intrinsic::experimental_stackmap: | |||
306 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
307 | return TTI::TCC_Free; | |||
308 | break; | |||
309 | case Intrinsic::experimental_patchpoint_void: | |||
310 | case Intrinsic::experimental_patchpoint_i64: | |||
311 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
312 | return TTI::TCC_Free; | |||
313 | break; | |||
314 | case Intrinsic::experimental_gc_statepoint: | |||
315 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
316 | return TTI::TCC_Free; | |||
317 | break; | |||
318 | } | |||
319 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
320 | } | |||
321 | ||||
322 | TargetTransformInfo::PopcntSupportKind | |||
323 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
324 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
325 | if (TyWidth == 32 || TyWidth == 64) | |||
326 | return TTI::PSK_FastHardware; | |||
327 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. | |||
328 | return TTI::PSK_Software; | |||
329 | } | |||
330 | ||||
331 | InstructionCost | |||
332 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
333 | TTI::TargetCostKind CostKind) { | |||
334 | auto *RetTy = ICA.getReturnType(); | |||
335 | switch (ICA.getID()) { | |||
336 | case Intrinsic::umin: | |||
337 | case Intrinsic::umax: | |||
338 | case Intrinsic::smin: | |||
339 | case Intrinsic::smax: { | |||
340 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
341 | MVT::v8i16, MVT::v2i32, MVT::v4i32}; | |||
342 | auto LT = getTypeLegalizationCost(RetTy); | |||
343 | // v2i64 types get converted to cmp+bif hence the cost of 2 | |||
344 | if (LT.second == MVT::v2i64) | |||
345 | return LT.first * 2; | |||
346 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
347 | return LT.first; | |||
348 | break; | |||
349 | } | |||
350 | case Intrinsic::sadd_sat: | |||
351 | case Intrinsic::ssub_sat: | |||
352 | case Intrinsic::uadd_sat: | |||
353 | case Intrinsic::usub_sat: { | |||
354 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
355 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
356 | MVT::v2i64}; | |||
357 | auto LT = getTypeLegalizationCost(RetTy); | |||
358 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we | |||
359 | // need to extend the type, as it uses shr(qadd(shl, shl)). | |||
360 | unsigned Instrs = | |||
361 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; | |||
362 | if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
363 | return LT.first * Instrs; | |||
364 | break; | |||
365 | } | |||
366 | case Intrinsic::abs: { | |||
367 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
368 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
369 | MVT::v2i64}; | |||
370 | auto LT = getTypeLegalizationCost(RetTy); | |||
371 | if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
372 | return LT.first; | |||
373 | break; | |||
374 | } | |||
375 | case Intrinsic::experimental_stepvector: { | |||
376 | InstructionCost Cost = 1; // Cost of the `index' instruction | |||
377 | auto LT = getTypeLegalizationCost(RetTy); | |||
378 | // Legalisation of illegal vectors involves an `index' instruction plus | |||
379 | // (LT.first - 1) vector adds. | |||
380 | if (LT.first > 1) { | |||
381 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); | |||
382 | InstructionCost AddCost = | |||
383 | getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); | |||
384 | Cost += AddCost * (LT.first - 1); | |||
385 | } | |||
386 | return Cost; | |||
387 | } | |||
388 | case Intrinsic::bitreverse: { | |||
389 | static const CostTblEntry BitreverseTbl[] = { | |||
390 | {Intrinsic::bitreverse, MVT::i32, 1}, | |||
391 | {Intrinsic::bitreverse, MVT::i64, 1}, | |||
392 | {Intrinsic::bitreverse, MVT::v8i8, 1}, | |||
393 | {Intrinsic::bitreverse, MVT::v16i8, 1}, | |||
394 | {Intrinsic::bitreverse, MVT::v4i16, 2}, | |||
395 | {Intrinsic::bitreverse, MVT::v8i16, 2}, | |||
396 | {Intrinsic::bitreverse, MVT::v2i32, 2}, | |||
397 | {Intrinsic::bitreverse, MVT::v4i32, 2}, | |||
398 | {Intrinsic::bitreverse, MVT::v1i64, 2}, | |||
399 | {Intrinsic::bitreverse, MVT::v2i64, 2}, | |||
400 | }; | |||
401 | const auto LegalisationCost = getTypeLegalizationCost(RetTy); | |||
402 | const auto *Entry = | |||
403 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); | |||
404 | if (Entry) { | |||
405 | // The cost model uses the legal type (i32) that i8 and i16 will be
406 | // converted to, plus 1 so that we match the actual lowering cost.
407 | if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || | |||
408 | TLI->getValueType(DL, RetTy, true) == MVT::i16) | |||
409 | return LegalisationCost.first * Entry->Cost + 1; | |||
410 | ||||
411 | return LegalisationCost.first * Entry->Cost; | |||
412 | } | |||
413 | break; | |||
414 | } | |||
415 | case Intrinsic::ctpop: { | |||
416 | if (!ST->hasNEON()) { | |||
417 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. | |||
418 | return getTypeLegalizationCost(RetTy).first * 12; | |||
419 | } | |||
420 | static const CostTblEntry CtpopCostTbl[] = { | |||
421 | {ISD::CTPOP, MVT::v2i64, 4}, | |||
422 | {ISD::CTPOP, MVT::v4i32, 3}, | |||
423 | {ISD::CTPOP, MVT::v8i16, 2}, | |||
424 | {ISD::CTPOP, MVT::v16i8, 1}, | |||
425 | {ISD::CTPOP, MVT::i64, 4}, | |||
426 | {ISD::CTPOP, MVT::v2i32, 3}, | |||
427 | {ISD::CTPOP, MVT::v4i16, 2}, | |||
428 | {ISD::CTPOP, MVT::v8i8, 1}, | |||
429 | {ISD::CTPOP, MVT::i32, 5}, | |||
430 | }; | |||
431 | auto LT = getTypeLegalizationCost(RetTy); | |||
432 | MVT MTy = LT.second; | |||
433 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { | |||
434 | // Extra cost of +1 when illegal vector types are legalized by promoting | |||
435 | // the integer type. | |||
436 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != | |||
437 | RetTy->getScalarSizeInBits() | |||
438 | ? 1 | |||
439 | : 0; | |||
440 | return LT.first * Entry->Cost + ExtraCost; | |||
441 | } | |||
442 | break; | |||
443 | } | |||
444 | case Intrinsic::sadd_with_overflow: | |||
445 | case Intrinsic::uadd_with_overflow: | |||
446 | case Intrinsic::ssub_with_overflow: | |||
447 | case Intrinsic::usub_with_overflow: | |||
448 | case Intrinsic::smul_with_overflow: | |||
449 | case Intrinsic::umul_with_overflow: { | |||
450 | static const CostTblEntry WithOverflowCostTbl[] = { | |||
451 | {Intrinsic::sadd_with_overflow, MVT::i8, 3}, | |||
452 | {Intrinsic::uadd_with_overflow, MVT::i8, 3}, | |||
453 | {Intrinsic::sadd_with_overflow, MVT::i16, 3}, | |||
454 | {Intrinsic::uadd_with_overflow, MVT::i16, 3}, | |||
455 | {Intrinsic::sadd_with_overflow, MVT::i32, 1}, | |||
456 | {Intrinsic::uadd_with_overflow, MVT::i32, 1}, | |||
457 | {Intrinsic::sadd_with_overflow, MVT::i64, 1}, | |||
458 | {Intrinsic::uadd_with_overflow, MVT::i64, 1}, | |||
459 | {Intrinsic::ssub_with_overflow, MVT::i8, 3}, | |||
460 | {Intrinsic::usub_with_overflow, MVT::i8, 3}, | |||
461 | {Intrinsic::ssub_with_overflow, MVT::i16, 3}, | |||
462 | {Intrinsic::usub_with_overflow, MVT::i16, 3}, | |||
463 | {Intrinsic::ssub_with_overflow, MVT::i32, 1}, | |||
464 | {Intrinsic::usub_with_overflow, MVT::i32, 1}, | |||
465 | {Intrinsic::ssub_with_overflow, MVT::i64, 1}, | |||
466 | {Intrinsic::usub_with_overflow, MVT::i64, 1}, | |||
467 | {Intrinsic::smul_with_overflow, MVT::i8, 5}, | |||
468 | {Intrinsic::umul_with_overflow, MVT::i8, 4}, | |||
469 | {Intrinsic::smul_with_overflow, MVT::i16, 5}, | |||
470 | {Intrinsic::umul_with_overflow, MVT::i16, 4}, | |||
471 | {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst | |||
472 | {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw | |||
473 | {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp | |||
474 | {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr | |||
475 | }; | |||
476 | EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); | |||
477 | if (MTy.isSimple()) | |||
478 | if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), | |||
479 | MTy.getSimpleVT())) | |||
480 | return Entry->Cost; | |||
481 | break; | |||
482 | } | |||
483 | case Intrinsic::fptosi_sat: | |||
484 | case Intrinsic::fptoui_sat: { | |||
485 | if (ICA.getArgTypes().empty()) | |||
486 | break; | |||
487 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; | |||
488 | auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); | |||
489 | EVT MTy = TLI->getValueType(DL, RetTy); | |||
490 | // Check for the legal types, which are where the size of the input and the | |||
491 | // output are the same, or we are using cvt f64->i32 or f32->i64. | |||
492 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || | |||
493 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || | |||
494 | LT.second == MVT::v2f64) && | |||
495 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || | |||
496 | (LT.second == MVT::f64 && MTy == MVT::i32) || | |||
497 | (LT.second == MVT::f32 && MTy == MVT::i64))) | |||
498 | return LT.first; | |||
499 | // Similarly for fp16 sizes | |||
500 | if (ST->hasFullFP16() && | |||
501 | ((LT.second == MVT::f16 && MTy == MVT::i32) || | |||
502 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && | |||
503 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) | |||
504 | return LT.first; | |||
505 | ||||
506 | // Otherwise we use a legal convert followed by a min+max | |||
507 | if ((LT.second.getScalarType() == MVT::f32 || | |||
508 | LT.second.getScalarType() == MVT::f64 || | |||
509 | (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && | |||
510 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { | |||
511 | Type *LegalTy = | |||
512 | Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); | |||
513 | if (LT.second.isVector()) | |||
514 | LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); | |||
515 | InstructionCost Cost = 1; | |||
516 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, | |||
517 | LegalTy, {LegalTy, LegalTy}); | |||
518 | Cost += getIntrinsicInstrCost(Attrs1, CostKind); | |||
519 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, | |||
520 | LegalTy, {LegalTy, LegalTy}); | |||
521 | Cost += getIntrinsicInstrCost(Attrs2, CostKind); | |||
522 | return LT.first * Cost; | |||
523 | } | |||
524 | break; | |||
525 | } | |||
526 | default: | |||
527 | break; | |||
528 | } | |||
529 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
530 | } | |||
531 | ||||
532 | /// Remove redundant reinterpret casts (convert.to/from.svbool) in the
533 | /// presence of control flow.
534 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, | |||
535 | IntrinsicInst &II) { | |||
536 | SmallVector<Instruction *, 32> Worklist; | |||
537 | auto RequiredType = II.getType(); | |||
538 | ||||
539 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); | |||
540 | assert(PN && "Expected Phi Node!")(static_cast <bool> (PN && "Expected Phi Node!" ) ? void (0) : __assert_fail ("PN && \"Expected Phi Node!\"" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 540 , __extension__ __PRETTY_FUNCTION__)); | |||
541 | ||||
542 | // Don't create a new Phi unless we can remove the old one. | |||
543 | if (!PN->hasOneUse()) | |||
544 | return std::nullopt; | |||
545 | ||||
546 | for (Value *IncValPhi : PN->incoming_values()) { | |||
547 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); | |||
548 | if (!Reinterpret || | |||
549 | Reinterpret->getIntrinsicID() != | |||
550 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
551 | RequiredType != Reinterpret->getArgOperand(0)->getType()) | |||
552 | return std::nullopt; | |||
553 | } | |||
554 | ||||
555 | // Create the new Phi | |||
556 | LLVMContext &Ctx = PN->getContext(); | |||
557 | IRBuilder<> Builder(Ctx); | |||
558 | Builder.SetInsertPoint(PN); | |||
559 | PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); | |||
560 | Worklist.push_back(PN); | |||
561 | ||||
562 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { | |||
563 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); | |||
564 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); | |||
565 | Worklist.push_back(Reinterpret); | |||
566 | } | |||
567 | ||||
568 | // Cleanup Phi Node and reinterprets | |||
569 | return IC.replaceInstUsesWith(II, NPN); | |||
570 | } | |||
571 | ||||
572 | // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) | |||
573 | // => (binop (pred) (from_svbool _) (from_svbool _)) | |||
574 | // | |||
575 | // The above transformation eliminates a `to_svbool` in the predicate | |||
576 | // operand of bitwise operation `binop` by narrowing the vector width of | |||
577 | // the operation. For example, it would convert a `<vscale x 16 x i1> | |||
578 | // and` into a `<vscale x 4 x i1> and`. This is profitable because | |||
579 | // to_svbool must zero the new lanes during widening, whereas | |||
580 | // from_svbool is free. | |||
581 | static std::optional<Instruction *> | |||
582 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { | |||
583 | auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); | |||
584 | if (!BinOp) | |||
585 | return std::nullopt; | |||
586 | ||||
587 | auto IntrinsicID = BinOp->getIntrinsicID(); | |||
588 | switch (IntrinsicID) { | |||
589 | case Intrinsic::aarch64_sve_and_z: | |||
590 | case Intrinsic::aarch64_sve_bic_z: | |||
591 | case Intrinsic::aarch64_sve_eor_z: | |||
592 | case Intrinsic::aarch64_sve_nand_z: | |||
593 | case Intrinsic::aarch64_sve_nor_z: | |||
594 | case Intrinsic::aarch64_sve_orn_z: | |||
595 | case Intrinsic::aarch64_sve_orr_z: | |||
596 | break; | |||
597 | default: | |||
598 | return std::nullopt; | |||
599 | } | |||
600 | ||||
601 | auto BinOpPred = BinOp->getOperand(0); | |||
602 | auto BinOpOp1 = BinOp->getOperand(1); | |||
603 | auto BinOpOp2 = BinOp->getOperand(2); | |||
604 | ||||
605 | auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); | |||
606 | if (!PredIntr || | |||
607 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) | |||
608 | return std::nullopt; | |||
609 | ||||
610 | auto PredOp = PredIntr->getOperand(0); | |||
611 | auto PredOpTy = cast<VectorType>(PredOp->getType()); | |||
612 | if (PredOpTy != II.getType()) | |||
613 | return std::nullopt; | |||
614 | ||||
615 | IRBuilder<> Builder(II.getContext()); | |||
616 | Builder.SetInsertPoint(&II); | |||
617 | ||||
618 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; | |||
619 | auto NarrowBinOpOp1 = Builder.CreateIntrinsic( | |||
620 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); | |||
621 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); | |||
622 | if (BinOpOp1 == BinOpOp2) | |||
623 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); | |||
624 | else | |||
625 | NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( | |||
626 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); | |||
627 | ||||
628 | auto NarrowedBinOp = | |||
629 | Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); | |||
630 | return IC.replaceInstUsesWith(II, NarrowedBinOp); | |||
631 | } | |||
632 | ||||
633 | static std::optional<Instruction *> | |||
634 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { | |||
635 | // If the reinterpret instruction operand is a PHI Node | |||
636 | if (isa<PHINode>(II.getArgOperand(0))) | |||
637 | return processPhiNode(IC, II); | |||
638 | ||||
639 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) | |||
640 | return BinOpCombine; | |||
641 | ||||
642 | SmallVector<Instruction *, 32> CandidatesForRemoval; | |||
643 | Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; | |||
644 | ||||
645 | const auto *IVTy = cast<VectorType>(II.getType()); | |||
646 | ||||
647 | // Walk the chain of conversions. | |||
648 | while (Cursor) { | |||
649 | // If the type of the cursor has fewer lanes than the final result, zeroing | |||
650 | // must take place, which breaks the equivalence chain. | |||
651 | const auto *CursorVTy = cast<VectorType>(Cursor->getType()); | |||
652 | if (CursorVTy->getElementCount().getKnownMinValue() < | |||
653 | IVTy->getElementCount().getKnownMinValue()) | |||
654 | break; | |||
655 | ||||
656 | // If the cursor has the same type as I, it is a viable replacement. | |||
657 | if (Cursor->getType() == IVTy) | |||
658 | EarliestReplacement = Cursor; | |||
659 | ||||
660 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); | |||
661 | ||||
662 | // If this is not an SVE conversion intrinsic, this is the end of the chain. | |||
663 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == | |||
664 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
665 | IntrinsicCursor->getIntrinsicID() == | |||
666 | Intrinsic::aarch64_sve_convert_from_svbool)) | |||
667 | break; | |||
668 | ||||
669 | CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); | |||
670 | Cursor = IntrinsicCursor->getOperand(0); | |||
671 | } | |||
672 | ||||
673 | // If no viable replacement in the conversion chain was found, there is | |||
674 | // nothing to do. | |||
675 | if (!EarliestReplacement) | |||
676 | return std::nullopt; | |||
677 | ||||
678 | return IC.replaceInstUsesWith(II, EarliestReplacement); | |||
679 | } | |||
680 | ||||
681 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, | |||
682 | IntrinsicInst &II) { | |||
683 | IRBuilder<> Builder(&II); | |||
684 | auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), | |||
685 | II.getOperand(2)); | |||
686 | return IC.replaceInstUsesWith(II, Select); | |||
687 | } | |||
688 | ||||
689 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, | |||
690 | IntrinsicInst &II) { | |||
691 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
692 | if (!Pg) | |||
693 | return std::nullopt; | |||
694 | ||||
695 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
696 | return std::nullopt; | |||
697 | ||||
698 | const auto PTruePattern = | |||
699 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
700 | if (PTruePattern != AArch64SVEPredPattern::vl1) | |||
701 | return std::nullopt; | |||
702 | ||||
703 | // The intrinsic is inserting into lane zero so use an insert instead. | |||
704 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
705 | auto *Insert = InsertElementInst::Create( | |||
706 | II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); | |||
707 | Insert->insertBefore(&II); | |||
708 | Insert->takeName(&II); | |||
709 | ||||
710 | return IC.replaceInstUsesWith(II, Insert); | |||
711 | } | |||
712 | ||||
713 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, | |||
714 | IntrinsicInst &II) { | |||
715 | // Replace DupX with a regular IR splat. | |||
716 | IRBuilder<> Builder(II.getContext()); | |||
717 | Builder.SetInsertPoint(&II); | |||
718 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
719 | Value *Splat = | |||
720 | Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); | |||
721 | Splat->takeName(&II); | |||
722 | return IC.replaceInstUsesWith(II, Splat); | |||
723 | } | |||
724 | ||||
725 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, | |||
726 | IntrinsicInst &II) { | |||
727 | LLVMContext &Ctx = II.getContext(); | |||
728 | IRBuilder<> Builder(Ctx); | |||
729 | Builder.SetInsertPoint(&II); | |||
730 | ||||
731 | // Check that the predicate is all active | |||
732 | auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
733 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
734 | return std::nullopt; | |||
735 | ||||
736 | const auto PTruePattern = | |||
737 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
738 | if (PTruePattern != AArch64SVEPredPattern::all) | |||
739 | return std::nullopt; | |||
740 | ||||
741 | // Check that we have a compare of zero.. | |||
742 | auto *SplatValue = | |||
743 | dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); | |||
744 | if (!SplatValue || !SplatValue->isZero()) | |||
745 | return std::nullopt; | |||
746 | ||||
747 | // ..against a dupq | |||
748 | auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
749 | if (!DupQLane || | |||
750 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) | |||
751 | return std::nullopt; | |||
752 | ||||
753 | // Where the dupq is a lane 0 replicate of a vector insert | |||
754 | if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) | |||
755 | return std::nullopt; | |||
756 | ||||
757 | auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); | |||
758 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) | |||
759 | return std::nullopt; | |||
760 | ||||
761 | // Where the vector insert is a fixed constant vector insert into undef at | |||
762 | // index zero | |||
763 | if (!isa<UndefValue>(VecIns->getArgOperand(0))) | |||
764 | return std::nullopt; | |||
765 | ||||
766 | if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) | |||
767 | return std::nullopt; | |||
768 | ||||
769 | auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); | |||
770 | if (!ConstVec) | |||
771 | return std::nullopt; | |||
772 | ||||
773 | auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); | |||
774 | auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); | |||
775 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) | |||
776 | return std::nullopt; | |||
777 | ||||
778 | unsigned NumElts = VecTy->getNumElements(); | |||
779 | unsigned PredicateBits = 0; | |||
780 | ||||
781 | // Expand intrinsic operands to a 16-bit byte level predicate | |||
782 | for (unsigned I = 0; I < NumElts; ++I) { | |||
783 | auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); | |||
784 | if (!Arg) | |||
785 | return std::nullopt; | |||
786 | if (!Arg->isZero()) | |||
787 | PredicateBits |= 1 << (I * (16 / NumElts)); | |||
788 | } | |||
789 | ||||
790 | // If all bits are zero bail early with an empty predicate | |||
791 | if (PredicateBits == 0) { | |||
792 | auto *PFalse = Constant::getNullValue(II.getType()); | |||
793 | PFalse->takeName(&II); | |||
794 | return IC.replaceInstUsesWith(II, PFalse); | |||
795 | } | |||
796 | ||||
797 | // Calculate largest predicate type used (where byte predicate is largest) | |||
798 | unsigned Mask = 8; | |||
799 | for (unsigned I = 0; I < 16; ++I) | |||
800 | if ((PredicateBits & (1 << I)) != 0) | |||
801 | Mask |= (I % 8); | |||
802 | ||||
803 | unsigned PredSize = Mask & -Mask; | |||
804 | auto *PredType = ScalableVectorType::get( | |||
805 | Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); | |||
806 | ||||
807 | // Ensure all relevant bits are set | |||
808 | for (unsigned I = 0; I < 16; I += PredSize) | |||
809 | if ((PredicateBits & (1 << I)) == 0) | |||
810 | return std::nullopt; | |||
811 | ||||
812 | auto *PTruePat = | |||
813 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
814 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
815 | {PredType}, {PTruePat}); | |||
816 | auto *ConvertToSVBool = Builder.CreateIntrinsic( | |||
817 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); | |||
818 | auto *ConvertFromSVBool = | |||
819 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, | |||
820 | {II.getType()}, {ConvertToSVBool}); | |||
821 | ||||
822 | ConvertFromSVBool->takeName(&II); | |||
823 | return IC.replaceInstUsesWith(II, ConvertFromSVBool); | |||
824 | } | |||
825 | ||||
826 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, | |||
827 | IntrinsicInst &II) { | |||
828 | IRBuilder<> Builder(II.getContext()); | |||
829 | Builder.SetInsertPoint(&II); | |||
830 | Value *Pg = II.getArgOperand(0); | |||
831 | Value *Vec = II.getArgOperand(1); | |||
832 | auto IntrinsicID = II.getIntrinsicID(); | |||
833 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; | |||
834 | ||||
835 | // lastX(splat(X)) --> X | |||
836 | if (auto *SplatVal = getSplatValue(Vec)) | |||
837 | return IC.replaceInstUsesWith(II, SplatVal); | |||
838 | ||||
839 | // If x and/or y is a splat value then: | |||
840 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) | |||
841 | Value *LHS, *RHS; | |||
842 | if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { | |||
843 | if (isSplatValue(LHS) || isSplatValue(RHS)) { | |||
844 | auto *OldBinOp = cast<BinaryOperator>(Vec); | |||
845 | auto OpC = OldBinOp->getOpcode(); | |||
846 | auto *NewLHS = | |||
847 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); | |||
848 | auto *NewRHS = | |||
849 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); | |||
850 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( | |||
851 | OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); | |||
852 | return IC.replaceInstUsesWith(II, NewBinOp); | |||
853 | } | |||
854 | } | |||
855 | ||||
856 | auto *C = dyn_cast<Constant>(Pg); | |||
857 | if (IsAfter && C && C->isNullValue()) { | |||
858 | // The intrinsic is extracting lane 0 so use an extract instead. | |||
859 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
860 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); | |||
861 | Extract->insertBefore(&II); | |||
862 | Extract->takeName(&II); | |||
863 | return IC.replaceInstUsesWith(II, Extract); | |||
864 | } | |||
865 | ||||
866 | auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); | |||
867 | if (!IntrPG) | |||
868 | return std::nullopt; | |||
869 | ||||
870 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
871 | return std::nullopt; | |||
872 | ||||
873 | const auto PTruePattern = | |||
874 | cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); | |||
875 | ||||
876 | // Can the intrinsic's predicate be converted to a known constant index? | |||
877 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); | |||
878 | if (!MinNumElts) | |||
879 | return std::nullopt; | |||
880 | ||||
881 | unsigned Idx = MinNumElts - 1; | |||
882 | // Increment the index if extracting the element after the last active | |||
883 | // predicate element. | |||
884 | if (IsAfter) | |||
885 | ++Idx; | |||
886 | ||||
887 | // Ignore extracts whose index is larger than the known minimum vector | |||
888 | // length. NOTE: This is an artificial constraint where we prefer to | |||
889 | // maintain what the user asked for until an alternative is proven faster. | |||
890 | auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); | |||
891 | if (Idx >= PgVTy->getMinNumElements()) | |||
892 | return std::nullopt; | |||
893 | ||||
894 | // The intrinsic is extracting a fixed lane so use an extract instead. | |||
895 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
896 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); | |||
897 | Extract->insertBefore(&II); | |||
898 | Extract->takeName(&II); | |||
899 | return IC.replaceInstUsesWith(II, Extract); | |||
900 | } | |||
901 | ||||
902 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, | |||
903 | IntrinsicInst &II) { | |||
904 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar | |||
905 | // integer variant across a variety of micro-architectures. Replace scalar | |||
906 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple | |||
907 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more | |||
908 | // depending on the micro-architecture, but has been observed as generally | |||
909 | // being faster, particularly when the CLAST[AB] op is a loop-carried | |||
910 | // dependency. | |||
911 | IRBuilder<> Builder(II.getContext()); | |||
912 | Builder.SetInsertPoint(&II); | |||
913 | Value *Pg = II.getArgOperand(0); | |||
914 | Value *Fallback = II.getArgOperand(1); | |||
915 | Value *Vec = II.getArgOperand(2); | |||
916 | Type *Ty = II.getType(); | |||
917 | ||||
918 | if (!Ty->isIntegerTy()) | |||
919 | return std::nullopt; | |||
920 | ||||
921 | Type *FPTy; | |||
922 | switch (cast<IntegerType>(Ty)->getBitWidth()) { | |||
923 | default: | |||
924 | return std::nullopt; | |||
925 | case 16: | |||
926 | FPTy = Builder.getHalfTy(); | |||
927 | break; | |||
928 | case 32: | |||
929 | FPTy = Builder.getFloatTy(); | |||
930 | break; | |||
931 | case 64: | |||
932 | FPTy = Builder.getDoubleTy(); | |||
933 | break; | |||
934 | } | |||
935 | ||||
936 | Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); | |||
937 | auto *FPVTy = VectorType::get( | |||
938 | FPTy, cast<VectorType>(Vec->getType())->getElementCount()); | |||
939 | Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); | |||
940 | auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, | |||
941 | {Pg, FPFallBack, FPVec}); | |||
942 | Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); | |||
943 | return IC.replaceInstUsesWith(II, FPIItoInt); | |||
944 | } | |||
945 | ||||
946 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, | |||
947 | IntrinsicInst &II) { | |||
948 | LLVMContext &Ctx = II.getContext(); | |||
949 | IRBuilder<> Builder(Ctx); | |||
950 | Builder.SetInsertPoint(&II); | |||
951 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr | |||
952 | // can work with RDFFR_PP for ptest elimination. | |||
953 | auto *AllPat = | |||
954 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
955 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
956 | {II.getType()}, {AllPat}); | |||
957 | auto *RDFFR = | |||
958 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); | |||
959 | RDFFR->takeName(&II); | |||
960 | return IC.replaceInstUsesWith(II, RDFFR); | |||
961 | } | |||
962 | ||||
963 | static std::optional<Instruction *> | |||
964 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { | |||
965 | const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); | |||
966 | ||||
967 | if (Pattern == AArch64SVEPredPattern::all) { | |||
968 | LLVMContext &Ctx = II.getContext(); | |||
969 | IRBuilder<> Builder(Ctx); | |||
970 | Builder.SetInsertPoint(&II); | |||
971 | ||||
972 | Constant *StepVal = ConstantInt::get(II.getType(), NumElts); | |||
973 | auto *VScale = Builder.CreateVScale(StepVal); | |||
974 | VScale->takeName(&II); | |||
975 | return IC.replaceInstUsesWith(II, VScale); | |||
976 | } | |||
977 | ||||
978 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); | |||
979 | ||||
980 | return MinNumElts && NumElts >= MinNumElts | |||
981 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( | |||
982 | II, ConstantInt::get(II.getType(), MinNumElts))) | |||
983 | : std::nullopt; | |||
984 | } | |||
985 | ||||
986 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, | |||
987 | IntrinsicInst &II) { | |||
988 | Value *PgVal = II.getArgOperand(0); | |||
989 | Value *OpVal = II.getArgOperand(1); | |||
990 | ||||
991 | IRBuilder<> Builder(II.getContext()); | |||
992 | Builder.SetInsertPoint(&II); | |||
993 | ||||
994 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). | |||
995 | // Later optimizations prefer this form. | |||
996 | if (PgVal == OpVal && | |||
997 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || | |||
998 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { | |||
999 | Value *Ops[] = {PgVal, OpVal}; | |||
1000 | Type *Tys[] = {PgVal->getType()}; | |||
1001 | ||||
1002 | auto *PTest = | |||
1003 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); | |||
1004 | PTest->takeName(&II); | |||
1005 | ||||
1006 | return IC.replaceInstUsesWith(II, PTest); | |||
1007 | } | |||
1008 | ||||
1009 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal); | |||
1010 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal); | |||
1011 | ||||
1012 | if (!Pg || !Op) | |||
1013 | return std::nullopt; | |||
1014 | ||||
1015 | Intrinsic::ID OpIID = Op->getIntrinsicID(); | |||
1016 | ||||
1017 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
1018 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && | |||
1019 | Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { | |||
1020 | Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)}; | |||
1021 | Type *Tys[] = {Pg->getArgOperand(0)->getType()}; | |||
1022 | ||||
1023 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
1024 | ||||
1025 | PTest->takeName(&II); | |||
1026 | return IC.replaceInstUsesWith(II, PTest); | |||
1027 | } | |||
1028 | ||||
1029 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)). | |||
1030 | // Later optimizations may rewrite sequence to use the flag-setting variant | |||
1031 | // of instruction X to remove PTEST. | |||
1032 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && | |||
1033 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || | |||
1034 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || | |||
1035 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || | |||
1036 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || | |||
1037 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || | |||
1038 | (OpIID == Intrinsic::aarch64_sve_and_z) || | |||
1039 | (OpIID == Intrinsic::aarch64_sve_bic_z) || | |||
1040 | (OpIID == Intrinsic::aarch64_sve_eor_z) || | |||
1041 | (OpIID == Intrinsic::aarch64_sve_nand_z) || | |||
1042 | (OpIID == Intrinsic::aarch64_sve_nor_z) || | |||
1043 | (OpIID == Intrinsic::aarch64_sve_orn_z) || | |||
1044 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { | |||
1045 | Value *Ops[] = {Pg->getArgOperand(0), Pg}; | |||
1046 | Type *Tys[] = {Pg->getType()}; | |||
1047 | ||||
1048 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
1049 | PTest->takeName(&II); | |||
1050 | ||||
1051 | return IC.replaceInstUsesWith(II, PTest); | |||
1052 | } | |||
1053 | ||||
1054 | return std::nullopt; | |||
1055 | } | |||
1056 | ||||
1057 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> | |||
1058 | static std::optional<Instruction *> | |||
1059 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, | |||
1060 | bool MergeIntoAddendOp) { | |||
1061 | Value *P = II.getOperand(0); | |||
1062 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; | |||
1063 | if (MergeIntoAddendOp) { | |||
1064 | AddendOp = II.getOperand(1); | |||
1065 | Mul = II.getOperand(2); | |||
1066 | } else { | |||
1067 | AddendOp = II.getOperand(2); | |||
1068 | Mul = II.getOperand(1); | |||
1069 | } | |||
1070 | ||||
1071 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0), | |||
1072 | m_Value(MulOp1)))) | |||
1073 | return std::nullopt; | |||
1074 | ||||
1075 | if (!Mul->hasOneUse()) | |||
1076 | return std::nullopt; | |||
1077 | ||||
1078 | Instruction *FMFSource = nullptr; | |||
1079 | if (II.getType()->isFPOrFPVectorTy()) { | |||
1080 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); | |||
1081 | // Stop the combine when the flags on the inputs differ in case dropping | |||
1082 | // flags would lead to us missing out on more beneficial optimizations. | |||
1083 | if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) | |||
1084 | return std::nullopt; | |||
1085 | if (!FAddFlags.allowContract()) | |||
1086 | return std::nullopt; | |||
1087 | FMFSource = &II; | |||
1088 | } | |||
1089 | ||||
1090 | IRBuilder<> Builder(II.getContext()); | |||
1091 | Builder.SetInsertPoint(&II); | |||
1092 | ||||
1093 | CallInst *Res; | |||
1094 | if (MergeIntoAddendOp) | |||
1095 | Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, | |||
1096 | {P, AddendOp, MulOp0, MulOp1}, FMFSource); | |||
1097 | else | |||
1098 | Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, | |||
1099 | {P, MulOp0, MulOp1, AddendOp}, FMFSource); | |||
1100 | ||||
1101 | return IC.replaceInstUsesWith(II, Res); | |||
1102 | } | |||
1103 | ||||
1104 | static bool isAllActivePredicate(Value *Pred) { | |||
1105 | // Look through convert.from.svbool(convert.to.svbool(...) chain. | |||
1106 | Value *UncastedPred; | |||
1107 | if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( | |||
1108 | m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( | |||
1109 | m_Value(UncastedPred))))) | |||
1110 | // If the predicate has the same or fewer lanes than the uncasted
1111 | // predicate then we know the casting has no effect. | |||
1112 | if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= | |||
1113 | cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) | |||
1114 | Pred = UncastedPred; | |||
1115 | ||||
1116 | return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( | |||
1117 | m_ConstantInt<AArch64SVEPredPattern::all>())); | |||
1118 | } | |||
1119 | ||||
1120 | static std::optional<Instruction *> | |||
1121 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { | |||
1122 | IRBuilder<> Builder(II.getContext()); | |||
1123 | Builder.SetInsertPoint(&II); | |||
1124 | ||||
1125 | Value *Pred = II.getOperand(0); | |||
1126 | Value *PtrOp = II.getOperand(1); | |||
1127 | Type *VecTy = II.getType(); | |||
1128 | Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); | |||
1129 | ||||
1130 | if (isAllActivePredicate(Pred)) { | |||
1131 | LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); | |||
1132 | Load->copyMetadata(II); | |||
1133 | return IC.replaceInstUsesWith(II, Load); | |||
1134 | } | |||
1135 | ||||
1136 | CallInst *MaskedLoad = | |||
1137 | Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), | |||
1138 | Pred, ConstantAggregateZero::get(VecTy)); | |||
1139 | MaskedLoad->copyMetadata(II); | |||
1140 | return IC.replaceInstUsesWith(II, MaskedLoad); | |||
1141 | } | |||
1142 | ||||
1143 | static std::optional<Instruction *> | |||
1144 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { | |||
1145 | IRBuilder<> Builder(II.getContext()); | |||
1146 | Builder.SetInsertPoint(&II); | |||
1147 | ||||
1148 | Value *VecOp = II.getOperand(0); | |||
1149 | Value *Pred = II.getOperand(1); | |||
1150 | Value *PtrOp = II.getOperand(2); | |||
1151 | Value *VecPtr = | |||
1152 | Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); | |||
1153 | ||||
1154 | if (isAllActivePredicate(Pred)) { | |||
1155 | StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); | |||
1156 | Store->copyMetadata(II); | |||
1157 | return IC.eraseInstFromFunction(II); | |||
1158 | } | |||
1159 | ||||
1160 | CallInst *MaskedStore = Builder.CreateMaskedStore( | |||
1161 | VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); | |||
1162 | MaskedStore->copyMetadata(II); | |||
1163 | return IC.eraseInstFromFunction(II); | |||
1164 | } | |||
1165 | ||||
1166 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { | |||
1167 | switch (Intrinsic) { | |||
1168 | case Intrinsic::aarch64_sve_fmul: | |||
1169 | return Instruction::BinaryOps::FMul; | |||
1170 | case Intrinsic::aarch64_sve_fadd: | |||
1171 | return Instruction::BinaryOps::FAdd; | |||
1172 | case Intrinsic::aarch64_sve_fsub: | |||
1173 | return Instruction::BinaryOps::FSub; | |||
1174 | default: | |||
1175 | return Instruction::BinaryOpsEnd; | |||
1176 | } | |||
1177 | } | |||
1178 | ||||
1179 | static std::optional<Instruction *> | |||
1180 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { | |||
1181 | auto *OpPredicate = II.getOperand(0); | |||
1182 | auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); | |||
1183 | if (BinOpCode == Instruction::BinaryOpsEnd || | |||
1184 | !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( | |||
1185 | m_ConstantInt<AArch64SVEPredPattern::all>()))) | |||
1186 | return std::nullopt; | |||
1187 | IRBuilder<> Builder(II.getContext()); | |||
1188 | Builder.SetInsertPoint(&II); | |||
1189 | Builder.setFastMathFlags(II.getFastMathFlags()); | |||
1190 | auto BinOp = | |||
1191 | Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); | |||
1192 | return IC.replaceInstUsesWith(II, BinOp); | |||
1193 | } | |||
1194 | ||||
1195 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, | |||
1196 | IntrinsicInst &II) { | |||
1197 | if (auto FMLA = | |||
1198 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
1199 | Intrinsic::aarch64_sve_fmla>(IC, II, | |||
1200 | true)) | |||
1201 | return FMLA; | |||
1202 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
1203 | Intrinsic::aarch64_sve_mla>( | |||
1204 | IC, II, true)) | |||
1205 | return MLA; | |||
1206 | if (auto FMAD = | |||
1207 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
1208 | Intrinsic::aarch64_sve_fmad>(IC, II, | |||
1209 | false)) | |||
1210 | return FMAD; | |||
1211 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
1212 | Intrinsic::aarch64_sve_mad>( | |||
1213 | IC, II, false)) | |||
1214 | return MAD; | |||
1215 | return instCombineSVEVectorBinOp(IC, II); | |||
1216 | } | |||
1217 | ||||
1218 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, | |||
1219 | IntrinsicInst &II) { | |||
1220 | if (auto FMLS = | |||
1221 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
1222 | Intrinsic::aarch64_sve_fmls>(IC, II, | |||
1223 | true)) | |||
1224 | return FMLS; | |||
1225 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
1226 | Intrinsic::aarch64_sve_mls>( | |||
1227 | IC, II, true)) | |||
1228 | return MLS; | |||
1229 | if (auto FMSB = | |||
1230 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
1231 | Intrinsic::aarch64_sve_fnmsb>( | |||
1232 | IC, II, false)) | |||
1233 | return FMSB; | |||
1234 | return instCombineSVEVectorBinOp(IC, II); | |||
1235 | } | |||
1236 | ||||
1237 | static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, | |||
1238 | IntrinsicInst &II) { | |||
1239 | auto *OpPredicate = II.getOperand(0); | |||
1240 | auto *OpMultiplicand = II.getOperand(1); | |||
1241 | auto *OpMultiplier = II.getOperand(2); | |||
1242 | ||||
1243 | IRBuilder<> Builder(II.getContext()); | |||
1244 | Builder.SetInsertPoint(&II); | |||
1245 | ||||
1246 | // Return true if a given instruction is a unit splat value, false otherwise. | |||
1247 | auto IsUnitSplat = [](auto *I) { | |||
1248 | auto *SplatValue = getSplatValue(I); | |||
1249 | if (!SplatValue) | |||
1250 | return false; | |||
1251 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
1252 | }; | |||
1253 | ||||
1254 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call | |||
1255 | // with a unit splat value, false otherwise. | |||
1256 | auto IsUnitDup = [](auto *I) { | |||
1257 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
1258 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) | |||
1259 | return false; | |||
1260 | ||||
1261 | auto *SplatValue = IntrI->getOperand(2); | |||
1262 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
1263 | }; | |||
1264 | ||||
1265 | if (IsUnitSplat(OpMultiplier)) { | |||
1266 | // [f]mul pg %n, (dupx 1) => %n | |||
1267 | OpMultiplicand->takeName(&II); | |||
1268 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
1269 | } else if (IsUnitDup(OpMultiplier)) { | |||
1270 | // [f]mul pg %n, (dup pg 1) => %n | |||
1271 | auto *DupInst = cast<IntrinsicInst>(OpMultiplier); | |||
1272 | auto *DupPg = DupInst->getOperand(1); | |||
1273 | // TODO: this is naive. The optimization is still valid if DupPg | |||
1274 | // 'encompasses' OpPredicate, not only if they're the same predicate. | |||
1275 | if (OpPredicate == DupPg) { | |||
1276 | OpMultiplicand->takeName(&II); | |||
1277 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
1278 | } | |||
1279 | } | |||
1280 | ||||
1281 | return instCombineSVEVectorBinOp(IC, II); | |||
1282 | } | |||
1283 | ||||
1284 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, | |||
1285 | IntrinsicInst &II) { | |||
1286 | IRBuilder<> Builder(II.getContext()); | |||
1287 | Builder.SetInsertPoint(&II); | |||
1288 | Value *UnpackArg = II.getArgOperand(0); | |||
1289 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
1290 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || | |||
1291 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; | |||
1292 | ||||
1293 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) | |||
1294 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) | |||
1295 | if (auto *ScalarArg = getSplatValue(UnpackArg)) { | |||
1296 | ScalarArg = | |||
1297 | Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); | |||
1298 | Value *NewVal = | |||
1299 | Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); | |||
1300 | NewVal->takeName(&II); | |||
1301 | return IC.replaceInstUsesWith(II, NewVal); | |||
1302 | } | |||
1303 | ||||
1304 | return std::nullopt; | |||
1305 | } | |||
1306 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, | |||
1307 | IntrinsicInst &II) { | |||
1308 | auto *OpVal = II.getOperand(0); | |||
1309 | auto *OpIndices = II.getOperand(1); | |||
1310 | VectorType *VTy = cast<VectorType>(II.getType()); | |||
1311 | ||||
1312 | // Check whether OpIndices is a constant splat value < minimal element count | |||
1313 | // of result. | |||
1314 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); | |||
1315 | if (!SplatValue || | |||
1316 | SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) | |||
1317 | return std::nullopt; | |||
1318 | ||||
1319 | // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to | |||
1320 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. | |||
1321 | IRBuilder<> Builder(II.getContext()); | |||
1322 | Builder.SetInsertPoint(&II); | |||
1323 | auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); | |||
1324 | auto *VectorSplat = | |||
1325 | Builder.CreateVectorSplat(VTy->getElementCount(), Extract); | |||
1326 | ||||
1327 | VectorSplat->takeName(&II); | |||
1328 | return IC.replaceInstUsesWith(II, VectorSplat); | |||
1329 | } | |||
1330 | ||||
1331 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, | |||
1332 | IntrinsicInst &II) { | |||
1333 | // zip1(uzp1(A, B), uzp2(A, B)) --> A | |||
1334 | // zip2(uzp1(A, B), uzp2(A, B)) --> B | |||
1335 | Value *A, *B; | |||
1336 | if (match(II.getArgOperand(0), | |||
1337 | m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && | |||
1338 | match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( | |||
1339 | m_Specific(A), m_Specific(B)))) | |||
1340 | return IC.replaceInstUsesWith( | |||
1341 | II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); | |||
1342 | ||||
1343 | return std::nullopt; | |||
1344 | } | |||
1345 | ||||
1346 | static std::optional<Instruction *> | |||
1347 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { | |||
1348 | Value *Mask = II.getOperand(0); | |||
1349 | Value *BasePtr = II.getOperand(1); | |||
1350 | Value *Index = II.getOperand(2); | |||
1351 | Type *Ty = II.getType(); | |||
1352 | Value *PassThru = ConstantAggregateZero::get(Ty); | |||
1353 | ||||
1354 | // Contiguous gather => masked load. | |||
1355 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) | |||
1356 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) | |||
1357 | Value *IndexBase; | |||
1358 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( | |||
1359 | m_Value(IndexBase), m_SpecificInt(1)))) { | |||
1360 | IRBuilder<> Builder(II.getContext()); | |||
1361 | Builder.SetInsertPoint(&II); | |||
1362 | ||||
1363 | Align Alignment = | |||
1364 | BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); | |||
1365 | ||||
1366 | Type *VecPtrTy = PointerType::getUnqual(Ty); | |||
1367 | Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), | |||
1368 | BasePtr, IndexBase); | |||
1369 | Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); | |||
1370 | CallInst *MaskedLoad = | |||
1371 | Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); | |||
1372 | MaskedLoad->takeName(&II); | |||
1373 | return IC.replaceInstUsesWith(II, MaskedLoad); | |||
1374 | } | |||
1375 | ||||
1376 | return std::nullopt; | |||
1377 | } | |||
1378 | ||||
1379 | static std::optional<Instruction *> | |||
1380 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { | |||
1381 | Value *Val = II.getOperand(0); | |||
1382 | Value *Mask = II.getOperand(1); | |||
1383 | Value *BasePtr = II.getOperand(2); | |||
1384 | Value *Index = II.getOperand(3); | |||
1385 | Type *Ty = Val->getType(); | |||
1386 | ||||
1387 | // Contiguous scatter => masked store. | |||
1388 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) | |||
1389 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) | |||
1390 | Value *IndexBase; | |||
1391 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( | |||
1392 | m_Value(IndexBase), m_SpecificInt(1)))) { | |||
1393 | IRBuilder<> Builder(II.getContext()); | |||
1394 | Builder.SetInsertPoint(&II); | |||
1395 | ||||
1396 | Align Alignment = | |||
1397 | BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); | |||
1398 | ||||
1399 | Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), | |||
1400 | BasePtr, IndexBase); | |||
1401 | Type *VecPtrTy = PointerType::getUnqual(Ty); | |||
1402 | Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); | |||
1403 | ||||
1404 | (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); | |||
1405 | ||||
1406 | return IC.eraseInstFromFunction(II); | |||
1407 | } | |||
1408 | ||||
1409 | return std::nullopt; | |||
1410 | } | |||
1411 | ||||
1412 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, | |||
1413 | IntrinsicInst &II) { | |||
1414 | IRBuilder<> Builder(II.getContext()); | |||
1415 | Builder.SetInsertPoint(&II); | |||
1416 | Type *Int32Ty = Builder.getInt32Ty(); | |||
1417 | Value *Pred = II.getOperand(0); | |||
1418 | Value *Vec = II.getOperand(1); | |||
1419 | Value *DivVec = II.getOperand(2); | |||
1420 | ||||
1421 | Value *SplatValue = getSplatValue(DivVec); | |||
1422 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); | |||
1423 | if (!SplatConstantInt) | |||
1424 | return std::nullopt; | |||
1425 | APInt Divisor = SplatConstantInt->getValue(); | |||
1426 | ||||
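| // Worked example (illustrative values): a splat divisor of 8 has | |||
| // logBase2() == 3, so the sdiv becomes aarch64.sve.asrd(Pred, Vec, 3); a | |||
| // splat divisor of -8 additionally wraps that result in a predicated neg. | |||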
1427 | if (Divisor.isPowerOf2()) { | |||
1428 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); | |||
1429 | auto ASRD = Builder.CreateIntrinsic( | |||
1430 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); | |||
1431 | return IC.replaceInstUsesWith(II, ASRD); | |||
1432 | } | |||
1433 | if (Divisor.isNegatedPowerOf2()) { | |||
1434 | Divisor.negate(); | |||
1435 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); | |||
1436 | auto ASRD = Builder.CreateIntrinsic( | |||
1437 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); | |||
1438 | auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, | |||
1439 | {ASRD->getType()}, {ASRD, Pred, ASRD}); | |||
1440 | return IC.replaceInstUsesWith(II, NEG); | |||
1441 | } | |||
1442 | ||||
1443 | return std::nullopt; | |||
1444 | } | |||
1445 | ||||
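| // Repeatedly halve a repeating value pattern, e.g. (A, B, A, B) -> (A, B). | |||
| // Entries that are nullptr stand for poison lanes and, when AllowPoison is | |||
| // set, may be filled in from the matching lane of the other half. | |||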
1446 | bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { | |||
1447 | size_t VecSize = Vec.size(); | |||
1448 | if (VecSize == 1) | |||
1449 | return true; | |||
1450 | if (!isPowerOf2_64(VecSize)) | |||
1451 | return false; | |||
1452 | size_t HalfVecSize = VecSize / 2; | |||
1453 | ||||
1454 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; | |||
1455 | RHS != Vec.end(); LHS++, RHS++) { | |||
1456 | if (*LHS != nullptr && *RHS != nullptr) { | |||
1457 | if (*LHS == *RHS) | |||
1458 | continue; | |||
1459 | else | |||
1460 | return false; | |||
1461 | } | |||
1462 | if (!AllowPoison) | |||
1463 | return false; | |||
1464 | if (*LHS == nullptr && *RHS != nullptr) | |||
1465 | *LHS = *RHS; | |||
1466 | } | |||
1467 | ||||
1468 | Vec.resize(HalfVecSize); | |||
1469 | SimplifyValuePattern(Vec, AllowPoison); | |||
1470 | return true; | |||
1471 | } | |||
1472 | ||||
1473 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) | |||
1474 | // to dupqlane(f64(C)) where C is A concatenated with B | |||
1475 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, | |||
1476 | IntrinsicInst &II) { | |||
1477 | Value *CurrentInsertElt = nullptr, *Default = nullptr; | |||
1478 | if (!match(II.getOperand(0), | |||
1479 | m_Intrinsic<Intrinsic::vector_insert>( | |||
1480 | m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || | |||
1481 | !isa<FixedVectorType>(CurrentInsertElt->getType())) | |||
1482 | return std::nullopt; | |||
1483 | auto IIScalableTy = cast<ScalableVectorType>(II.getType()); | |||
1484 | ||||
1485 | // Insert the scalars into a container ordered by InsertElement index | |||
1486 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); | |||
1487 | while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { | |||
1488 | auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); | |||
1489 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); | |||
1490 | CurrentInsertElt = InsertElt->getOperand(0); | |||
1491 | } | |||
1492 | ||||
1493 | bool AllowPoison = | |||
1494 | isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); | |||
1495 | if (!SimplifyValuePattern(Elts, AllowPoison)) | |||
1496 | return std::nullopt; | |||
1497 | ||||
1498 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) | |||
1499 | IRBuilder<> Builder(II.getContext()); | |||
1500 | Builder.SetInsertPoint(&II); | |||
1501 | Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); | |||
1502 | for (size_t I = 0; I < Elts.size(); I++) { | |||
1503 | if (Elts[I] == nullptr) | |||
1504 | continue; | |||
1505 | InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I], | |||
1506 | Builder.getInt64(I)); | |||
1507 | } | |||
1508 | if (InsertEltChain == nullptr) | |||
1509 | return std::nullopt; | |||
1510 | ||||
1511 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 | |||
1512 | // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector | |||
1513 | // be bitcast to a type wide enough to fit the sequence, be splatted, and then | |||
1514 | // be narrowed back to the original type. | |||
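| // For example (hypothetical types): for a <vscale x 8 x half> result | |||
| // simplified to a two-element pattern, PatternWidth = 16 * 2 = 32 bits and | |||
| // PatternElementCount = 16 * 8 / 32 = 4, so the splat is performed as a | |||
| // <vscale x 4 x i32> shuffle with an all-zero mask. | |||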
1515 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); | |||
1516 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * | |||
1517 | IIScalableTy->getMinNumElements() / | |||
1518 | PatternWidth; | |||
1519 | ||||
1520 | IntegerType *WideTy = Builder.getIntNTy(PatternWidth); | |||
1521 | auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); | |||
1522 | auto *WideShuffleMaskTy = | |||
1523 | ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount); | |||
1524 | ||||
1525 | auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0)); | |||
1526 | auto InsertSubvector = Builder.CreateInsertVector( | |||
1527 | II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); | |||
1528 | auto WideBitcast = | |||
1529 | Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); | |||
1530 | auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); | |||
1531 | auto WideShuffle = Builder.CreateShuffleVector( | |||
1532 | WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); | |||
1533 | auto NarrowBitcast = | |||
1534 | Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); | |||
1535 | ||||
1536 | return IC.replaceInstUsesWith(II, NarrowBitcast); | |||
1537 | } | |||
1538 | ||||
1539 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, | |||
1540 | IntrinsicInst &II) { | |||
1541 | Value *A = II.getArgOperand(0); | |||
1542 | Value *B = II.getArgOperand(1); | |||
1543 | if (A == B) | |||
1544 | return IC.replaceInstUsesWith(II, A); | |||
1545 | ||||
1546 | return std::nullopt; | |||
1547 | } | |||
1548 | ||||
1549 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, | |||
1550 | IntrinsicInst &II) { | |||
1551 | IRBuilder<> Builder(&II); | |||
1552 | Value *Pred = II.getOperand(0); | |||
1553 | Value *Vec = II.getOperand(1); | |||
1554 | Value *Shift = II.getOperand(2); | |||
1555 | ||||
1556 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. | |||
1557 | Value *AbsPred, *MergedValue; | |||
1558 | if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( | |||
1559 | m_Value(MergedValue), m_Value(AbsPred), m_Value())) && | |||
1560 | !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( | |||
1561 | m_Value(MergedValue), m_Value(AbsPred), m_Value()))) | |||
1562 | ||||
1563 | return std::nullopt; | |||
1564 | ||||
1565 | // Transform is valid if any of the following are true: | |||
1566 | // * The ABS merge value is an undef or non-negative | |||
1567 | // * The ABS predicate is all active | |||
1568 | // * The ABS predicate and the SRSHL predicates are the same | |||
1569 | if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && | |||
1570 | AbsPred != Pred && !isAllActivePredicate(AbsPred)) | |||
1571 | return std::nullopt; | |||
1572 | ||||
1573 | // Only valid when the shift amount is non-negative, otherwise the rounding | |||
1574 | // behaviour of SRSHL cannot be ignored. | |||
1575 | if (!match(Shift, m_NonNegative())) | |||
1576 | return std::nullopt; | |||
1577 | ||||
1578 | auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, | |||
1579 | {Pred, Vec, Shift}); | |||
1580 | ||||
1581 | return IC.replaceInstUsesWith(II, LSL); | |||
1582 | } | |||
1583 | ||||
1584 | std::optional<Instruction *> | |||
1585 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, | |||
1586 | IntrinsicInst &II) const { | |||
1587 | Intrinsic::ID IID = II.getIntrinsicID(); | |||
1588 | switch (IID) { | |||
1589 | default: | |||
1590 | break; | |||
1591 | case Intrinsic::aarch64_neon_fmaxnm: | |||
1592 | case Intrinsic::aarch64_neon_fminnm: | |||
1593 | return instCombineMaxMinNM(IC, II); | |||
1594 | case Intrinsic::aarch64_sve_convert_from_svbool: | |||
1595 | return instCombineConvertFromSVBool(IC, II); | |||
1596 | case Intrinsic::aarch64_sve_dup: | |||
1597 | return instCombineSVEDup(IC, II); | |||
1598 | case Intrinsic::aarch64_sve_dup_x: | |||
1599 | return instCombineSVEDupX(IC, II); | |||
1600 | case Intrinsic::aarch64_sve_cmpne: | |||
1601 | case Intrinsic::aarch64_sve_cmpne_wide: | |||
1602 | return instCombineSVECmpNE(IC, II); | |||
1603 | case Intrinsic::aarch64_sve_rdffr: | |||
1604 | return instCombineRDFFR(IC, II); | |||
1605 | case Intrinsic::aarch64_sve_lasta: | |||
1606 | case Intrinsic::aarch64_sve_lastb: | |||
1607 | return instCombineSVELast(IC, II); | |||
1608 | case Intrinsic::aarch64_sve_clasta_n: | |||
1609 | case Intrinsic::aarch64_sve_clastb_n: | |||
1610 | return instCombineSVECondLast(IC, II); | |||
1611 | case Intrinsic::aarch64_sve_cntd: | |||
1612 | return instCombineSVECntElts(IC, II, 2); | |||
1613 | case Intrinsic::aarch64_sve_cntw: | |||
1614 | return instCombineSVECntElts(IC, II, 4); | |||
1615 | case Intrinsic::aarch64_sve_cnth: | |||
1616 | return instCombineSVECntElts(IC, II, 8); | |||
1617 | case Intrinsic::aarch64_sve_cntb: | |||
1618 | return instCombineSVECntElts(IC, II, 16); | |||
1619 | case Intrinsic::aarch64_sve_ptest_any: | |||
1620 | case Intrinsic::aarch64_sve_ptest_first: | |||
1621 | case Intrinsic::aarch64_sve_ptest_last: | |||
1622 | return instCombineSVEPTest(IC, II); | |||
1623 | case Intrinsic::aarch64_sve_mul: | |||
1624 | case Intrinsic::aarch64_sve_fmul: | |||
1625 | return instCombineSVEVectorMul(IC, II); | |||
1626 | case Intrinsic::aarch64_sve_fadd: | |||
1627 | case Intrinsic::aarch64_sve_add: | |||
1628 | return instCombineSVEVectorAdd(IC, II); | |||
1629 | case Intrinsic::aarch64_sve_fadd_u: | |||
1630 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, | |||
1631 | Intrinsic::aarch64_sve_fmla_u>( | |||
1632 | IC, II, true); | |||
1633 | case Intrinsic::aarch64_sve_fsub: | |||
1634 | case Intrinsic::aarch64_sve_sub: | |||
1635 | return instCombineSVEVectorSub(IC, II); | |||
1636 | case Intrinsic::aarch64_sve_fsub_u: | |||
1637 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, | |||
1638 | Intrinsic::aarch64_sve_fmls_u>( | |||
1639 | IC, II, true); | |||
1640 | case Intrinsic::aarch64_sve_tbl: | |||
1641 | return instCombineSVETBL(IC, II); | |||
1642 | case Intrinsic::aarch64_sve_uunpkhi: | |||
1643 | case Intrinsic::aarch64_sve_uunpklo: | |||
1644 | case Intrinsic::aarch64_sve_sunpkhi: | |||
1645 | case Intrinsic::aarch64_sve_sunpklo: | |||
1646 | return instCombineSVEUnpack(IC, II); | |||
1647 | case Intrinsic::aarch64_sve_zip1: | |||
1648 | case Intrinsic::aarch64_sve_zip2: | |||
1649 | return instCombineSVEZip(IC, II); | |||
1650 | case Intrinsic::aarch64_sve_ld1_gather_index: | |||
1651 | return instCombineLD1GatherIndex(IC, II); | |||
1652 | case Intrinsic::aarch64_sve_st1_scatter_index: | |||
1653 | return instCombineST1ScatterIndex(IC, II); | |||
1654 | case Intrinsic::aarch64_sve_ld1: | |||
1655 | return instCombineSVELD1(IC, II, DL); | |||
1656 | case Intrinsic::aarch64_sve_st1: | |||
1657 | return instCombineSVEST1(IC, II, DL); | |||
1658 | case Intrinsic::aarch64_sve_sdiv: | |||
1659 | return instCombineSVESDIV(IC, II); | |||
1660 | case Intrinsic::aarch64_sve_sel: | |||
1661 | return instCombineSVESel(IC, II); | |||
1662 | case Intrinsic::aarch64_sve_srshl: | |||
1663 | return instCombineSVESrshl(IC, II); | |||
1664 | case Intrinsic::aarch64_sve_dupq_lane: | |||
1665 | return instCombineSVEDupqLane(IC, II); | |||
1666 | } | |||
1667 | ||||
1668 | return std::nullopt; | |||
1669 | } | |||
1670 | ||||
1671 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( | |||
1672 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, | |||
1673 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, | |||
1674 | std::function<void(Instruction *, unsigned, APInt, APInt &)> | |||
1675 | SimplifyAndSetOp) const { | |||
1676 | switch (II.getIntrinsicID()) { | |||
1677 | default: | |||
1678 | break; | |||
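| // For the narrowing/saturating conversions handled here, each result | |||
| // element depends only on the corresponding element of operand 0, so the | |||
| // demanded-elements mask can be forwarded to that operand unchanged. | |||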
1679 | case Intrinsic::aarch64_neon_fcvtxn: | |||
1680 | case Intrinsic::aarch64_neon_rshrn: | |||
1681 | case Intrinsic::aarch64_neon_sqrshrn: | |||
1682 | case Intrinsic::aarch64_neon_sqrshrun: | |||
1683 | case Intrinsic::aarch64_neon_sqshrn: | |||
1684 | case Intrinsic::aarch64_neon_sqshrun: | |||
1685 | case Intrinsic::aarch64_neon_sqxtn: | |||
1686 | case Intrinsic::aarch64_neon_sqxtun: | |||
1687 | case Intrinsic::aarch64_neon_uqrshrn: | |||
1688 | case Intrinsic::aarch64_neon_uqshrn: | |||
1689 | case Intrinsic::aarch64_neon_uqxtn: | |||
1690 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); | |||
1691 | break; | |||
1692 | } | |||
1693 | ||||
1694 | return std::nullopt; | |||
1695 | } | |||
1696 | ||||
1697 | TypeSize | |||
1698 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | |||
1699 | switch (K) { | |||
1700 | case TargetTransformInfo::RGK_Scalar: | |||
1701 | return TypeSize::getFixed(64); | |||
1702 | case TargetTransformInfo::RGK_FixedWidthVector: | |||
1703 | if (!ST->isStreamingSVEModeDisabled() && | |||
1704 | !EnableFixedwidthAutovecInStreamingMode) | |||
1705 | return TypeSize::getFixed(0); | |||
1706 | ||||
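| // For instance, a subtarget reporting a minimum SVE vector size of 256 bits | |||
| // yields 256-bit fixed-width vectors here; without SVE this falls back to | |||
| // the 128-bit NEON width, or 0 when NEON is unavailable. | |||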
1707 | if (ST->hasSVE()) | |||
1708 | return TypeSize::getFixed( | |||
1709 | std::max(ST->getMinSVEVectorSizeInBits(), 128u)); | |||
1710 | ||||
1711 | return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); | |||
1712 | case TargetTransformInfo::RGK_ScalableVector: | |||
1713 | if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode) | |||
1714 | return TypeSize::getScalable(0); | |||
1715 | ||||
1716 | return TypeSize::getScalable(ST->hasSVE() ? 128 : 0); | |||
1717 | } | |||
1718 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 1718 ); | |||
1719 | } | |||
1720 | ||||
1721 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, | |||
1722 | ArrayRef<const Value *> Args) { | |||
1723 | ||||
1724 | // A helper that returns a vector type from the given type. The number of | |||
1725 | // elements in type Ty determines the vector width. | |||
1726 | auto toVectorTy = [&](Type *ArgTy) { | |||
1727 | return VectorType::get(ArgTy->getScalarType(), | |||
1728 | cast<VectorType>(DstTy)->getElementCount()); | |||
1729 | }; | |||
1730 | ||||
1731 | // Exit early if DstTy is not a vector type whose elements are at least | |||
1732 | // 16-bits wide. SVE doesn't generally have the same set of instructions to | |||
1733 | // perform an extend with the add/sub/mul. There are SMULLB style | |||
1734 | // instructions, but they operate on top/bottom, requiring some sort of lane | |||
1735 | // interleaving to be used with zext/sext. | |||
1736 | if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16) | |||
1737 | return false; | |||
1738 | ||||
1739 | // Determine if the operation has a widening variant. We consider both the | |||
1740 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the | |||
1741 | // instructions. | |||
1742 | // | |||
1743 | // TODO: Add additional widening operations (e.g., shl, etc.) once we | |||
1744 | // verify that their extending operands are eliminated during code | |||
1745 | // generation. | |||
1746 | switch (Opcode) { | |||
1747 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). | |||
1748 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). | |||
1749 | case Instruction::Mul: // SMULL(2), UMULL(2) | |||
1750 | break; | |||
1751 | default: | |||
1752 | return false; | |||
1753 | } | |||
1754 | ||||
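| // As a concrete (hypothetical) example, in 'add <8 x i16> %a, (zext <8 x i8> | |||
| // %b to <8 x i16>)' the zext operand can be absorbed into a uaddw, which is | |||
| // the shape the checks below look for. | |||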
1755 | // To be a widening instruction (either the "wide" or "long" versions), the | |||
1756 | // second operand must be a sign- or zero extend. | |||
1757 | if (Args.size() != 2 || | |||
1758 | (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) | |||
1759 | return false; | |||
1760 | auto *Extend = cast<CastInst>(Args[1]); | |||
1761 | auto *Arg0 = dyn_cast<CastInst>(Args[0]); | |||
1762 | ||||
1763 | // A mul only has a mull version (not like addw). Both operands need to be | |||
1764 | // extending and the same type. | |||
1765 | if (Opcode == Instruction::Mul && | |||
1766 | (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || | |||
1767 | Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) | |||
1768 | return false; | |||
1769 | ||||
1770 | // Legalize the destination type and ensure it can be used in a widening | |||
1771 | // operation. | |||
1772 | auto DstTyL = getTypeLegalizationCost(DstTy); | |||
1773 | unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); | |||
1774 | if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) | |||
1775 | return false; | |||
1776 | ||||
1777 | // Legalize the source type and ensure it can be used in a widening | |||
1778 | // operation. | |||
1779 | auto *SrcTy = toVectorTy(Extend->getSrcTy()); | |||
1780 | auto SrcTyL = getTypeLegalizationCost(SrcTy); | |||
1781 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); | |||
1782 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) | |||
1783 | return false; | |||
1784 | ||||
1785 | // Get the total number of vector elements in the legalized types. | |||
1786 | InstructionCost NumDstEls = | |||
1787 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); | |||
1788 | InstructionCost NumSrcEls = | |||
1789 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); | |||
1790 | ||||
1791 | // Return true if the legalized types have the same number of vector elements | |||
1792 | // and the destination element type size is twice that of the source type. | |||
1793 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; | |||
1794 | } | |||
1795 | ||||
1796 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
1797 | Type *Src, | |||
1798 | TTI::CastContextHint CCH, | |||
1799 | TTI::TargetCostKind CostKind, | |||
1800 | const Instruction *I) { | |||
1801 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1802 | assert(ISD && "Invalid opcode"); | |||
1803 | ||||
1804 | // If the cast is observable, and it is used by a widening instruction (e.g., | |||
1805 | // uaddl, saddw, etc.), it may be free. | |||
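| // For example (illustrative IR), a 'zext <8 x i8> %x to <8 x i16>' whose | |||
| // only user is an add of that type can be costed as free here, since the | |||
| // extend is folded into a uaddw/uaddl during selection. | |||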
1806 | if (I && I->hasOneUser()) { | |||
1807 | auto *SingleUser = cast<Instruction>(*I->user_begin()); | |||
1808 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); | |||
1809 | if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { | |||
1810 | // If the cast is the second operand, it is free. We will generate either | |||
1811 | // a "wide" or "long" version of the widening instruction. | |||
1812 | if (I == SingleUser->getOperand(1)) | |||
1813 | return 0; | |||
1814 | // If the cast is not the second operand, it will be free if it looks the | |||
1815 | // same as the second operand. In this case, we will generate a "long" | |||
1816 | // version of the widening instruction. | |||
1817 | if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) | |||
1818 | if (I->getOpcode() == unsigned(Cast->getOpcode()) && | |||
1819 | cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) | |||
1820 | return 0; | |||
1821 | } | |||
1822 | } | |||
1823 | ||||
1824 | // TODO: Allow non-throughput costs that aren't binary. | |||
1825 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
1826 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1827 | return Cost == 0 ? 0 : 1; | |||
1828 | return Cost; | |||
1829 | }; | |||
1830 | ||||
1831 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
1832 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
1833 | ||||
1834 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
1835 | return AdjustCost( | |||
1836 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
1837 | ||||
1838 | static const TypeConversionCostTblEntry | |||
1839 | ConversionTbl[] = { | |||
1840 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn | |||
1841 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn | |||
1842 | { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn | |||
1843 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn | |||
1844 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 | |||
1845 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn | |||
1846 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn | |||
1847 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 | |||
1848 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn | |||
1849 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn | |||
1850 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn | |||
1851 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 | |||
1852 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 | |||
1853 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 | |||
1854 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 | |||
1855 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 | |||
1856 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 | |||
1857 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 | |||
1858 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 | |||
1859 | { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 | |||
1860 | ||||
1861 | // Truncations on nxvmiN | |||
1862 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, | |||
1863 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, | |||
1864 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, | |||
1865 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, | |||
1866 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, | |||
1867 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, | |||
1868 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, | |||
1869 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, | |||
1870 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, | |||
1871 | { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, | |||
1872 | { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, | |||
1873 | { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, | |||
1874 | { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, | |||
1875 | { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, | |||
1876 | { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, | |||
1877 | { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, | |||
1878 | ||||
1879 | // The number of shll instructions for the extension. | |||
1880 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1881 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
1882 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1883 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
1884 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
1885 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
1886 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1887 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
1888 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
1889 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
1890 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
1891 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
1892 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1893 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
1894 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
1895 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
1896 | ||||
1897 | // LowerVectorINT_TO_FP: | |||
1898 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
1899 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1900 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1901 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
1902 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
1903 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
1904 | ||||
1905 | // Complex: to v2f32 | |||
1906 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
1907 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1908 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1909 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
1910 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1911 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1912 | ||||
1913 | // Complex: to v4f32 | |||
1914 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, | |||
1915 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1916 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
1917 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1918 | ||||
1919 | // Complex: to v8f32 | |||
1920 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1921 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1922 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1923 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1924 | ||||
1925 | // Complex: to v16f32 | |||
1926 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1927 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1928 | ||||
1929 | // Complex: to v2f64 | |||
1930 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1931 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1932 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1933 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1934 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1935 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1936 | ||||
1937 | // Complex: to v4f64 | |||
1938 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, | |||
1939 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, | |||
1940 | ||||
1941 | // LowerVectorFP_TO_INT | |||
1942 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1943 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1944 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1945 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1946 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1947 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1948 | ||||
1949 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). | |||
1950 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1951 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1952 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1953 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1954 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1955 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1956 | ||||
1957 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 | |||
1958 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1959 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1960 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1961 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1962 | ||||
1963 | // Complex, from nxv2f32. | |||
1964 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1965 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1966 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1967 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1968 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1969 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1970 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1971 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1972 | ||||
1973 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. | |||
1974 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1975 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1976 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1977 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1978 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1979 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1980 | ||||
1981 | // Complex, from nxv2f64. | |||
1982 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1983 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1984 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1985 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1986 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1987 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1988 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1989 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1990 | ||||
1991 | // Complex, from nxv4f32. | |||
1992 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1993 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1994 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1995 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
1996 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1997 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1998 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1999 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
2000 | ||||
2001 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. | |||
2002 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
2003 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
2004 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
2005 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
2006 | ||||
2007 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. | |||
2008 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
2009 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
2010 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
2011 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
2012 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
2013 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
2014 | ||||
2015 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. | |||
2016 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
2017 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
2018 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
2019 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
2020 | ||||
2021 | // Complex, from nxv8f16. | |||
2022 | { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
2023 | { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
2024 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
2025 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
2026 | { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
2027 | { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
2028 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
2029 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
2030 | ||||
2031 | // Complex, from nxv4f16. | |||
2032 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
2033 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
2034 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
2035 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
2036 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
2037 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
2038 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
2039 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
2040 | ||||
2041 | // Complex, from nxv2f16. | |||
2042 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
2043 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
2044 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
2045 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
2046 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
2047 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
2048 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
2049 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
2050 | ||||
2051 | // Truncate from nxvmf32 to nxvmf16. | |||
2052 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, | |||
2053 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, | |||
2054 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, | |||
2055 | ||||
2056 | // Truncate from nxvmf64 to nxvmf16. | |||
2057 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, | |||
2058 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, | |||
2059 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, | |||
2060 | ||||
2061 | // Truncate from nxvmf64 to nxvmf32. | |||
2062 | { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, | |||
2063 | { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, | |||
2064 | { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, | |||
2065 | ||||
2066 | // Extend from nxvmf16 to nxvmf32. | |||
2067 | { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, | |||
2068 | { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, | |||
2069 | { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, | |||
2070 | ||||
2071 | // Extend from nxvmf16 to nxvmf64. | |||
2072 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, | |||
2073 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, | |||
2074 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, | |||
2075 | ||||
2076 | // Extend from nxvmf32 to nxvmf64. | |||
2077 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, | |||
2078 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, | |||
2079 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, | |||
2080 | ||||
2081 | // Bitcasts from float to integer | |||
2082 | { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, | |||
2083 | { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, | |||
2084 | { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, | |||
2085 | ||||
2086 | // Bitcasts from integer to float | |||
2087 | { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, | |||
2088 | { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, | |||
2089 | { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, | |||
2090 | }; | |||
2091 | ||||
2092 | if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, | |||
2093 | DstTy.getSimpleVT(), | |||
2094 | SrcTy.getSimpleVT())) | |||
2095 | return AdjustCost(Entry->Cost); | |||
2096 | ||||
2097 | static const TypeConversionCostTblEntry FP16Tbl[] = { | |||
2098 | {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs | |||
2099 | {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, | |||
2100 | {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs | |||
2101 | {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, | |||
2102 | {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs | |||
2103 | {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, | |||
2104 | {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn | |||
2105 | {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, | |||
2106 | {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs | |||
2107 | {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, | |||
2108 | {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs | |||
2109 | {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, | |||
2110 | {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn | |||
2111 | {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, | |||
2112 | {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs | |||
2113 | {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, | |||
2114 | {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs | |||
2115 | {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, | |||
2116 | {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf | |||
2117 | {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf | |||
2118 | {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf | |||
2119 | {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf | |||
2120 | }; | |||
2121 | ||||
2122 | if (ST->hasFullFP16()) | |||
2123 | if (const auto *Entry = ConvertCostTableLookup( | |||
2124 | FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | |||
2125 | return AdjustCost(Entry->Cost); | |||
2126 | ||||
2127 | return AdjustCost( | |||
2128 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
2129 | } | |||
2130 | ||||
2131 | InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, | |||
2132 | Type *Dst, | |||
2133 | VectorType *VecTy, | |||
2134 | unsigned Index) { | |||
2135 | ||||
2136 | // Make sure we were given a valid extend opcode. | |||
2137 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && | |||
2138 | "Invalid opcode"); | |||
2139 | ||||
2140 | // We are extending an element we extract from a vector, so the source type | |||
2141 | // of the extend is the element type of the vector. | |||
2142 | auto *Src = VecTy->getElementType(); | |||
2143 | ||||
2144 | // Sign- and zero-extends are for integer types only. | |||
2145 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); | |||
2146 | ||||
2147 | // Get the cost for the extract. We compute the cost (if any) for the extend | |||
2148 | // below. | |||
2149 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
2150 | InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, | |||
2151 | CostKind, Index, nullptr, nullptr); | |||
2152 | ||||
2153 | // Legalize the types. | |||
2154 | auto VecLT = getTypeLegalizationCost(VecTy); | |||
2155 | auto DstVT = TLI->getValueType(DL, Dst); | |||
2156 | auto SrcVT = TLI->getValueType(DL, Src); | |||
2157 | ||||
2158 | // If the resulting type is still a vector and the destination type is legal, | |||
2159 | // we may get the extension for free. If not, get the default cost for the | |||
2160 | // extend. | |||
2161 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) | |||
2162 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
2163 | CostKind); | |||
2164 | ||||
2165 | // The destination type should be larger than the element type. If not, get | |||
2166 | // the default cost for the extend. | |||
2167 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) | |||
2168 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
2169 | CostKind); | |||
2170 | ||||
2171 | switch (Opcode) { | |||
2172 | default: | |||
2173 | llvm_unreachable("Opcode should be either SExt or ZExt")::llvm::llvm_unreachable_internal("Opcode should be either SExt or ZExt" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2173 ); | |||
2174 | ||||
2175 | // For sign-extends, we only need a smov, which performs the extension | |||
2176 | // automatically. | |||
2177 | case Instruction::SExt: | |||
2178 | return Cost; | |||
2179 | ||||
2180 | // For zero-extends, the extend is performed automatically by a umov unless | |||
2181 | // the destination type is i64 and the element type is i8 or i16. | |||
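| // (That is, extending an extracted i32 lane to i64 stays free, while an i8 | |||
| // or i16 lane extended to i64 still pays for the extra extend below.) | |||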
2182 | case Instruction::ZExt: | |||
2183 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) | |||
2184 | return Cost; | |||
2185 | } | |||
2186 | ||||
2187 | // If we are unable to perform the extend for free, get the default cost. | |||
2188 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
2189 | CostKind); | |||
2190 | } | |||
2191 | ||||
2192 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, | |||
2193 | TTI::TargetCostKind CostKind, | |||
2194 | const Instruction *I) { | |||
2195 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2196 | return Opcode == Instruction::PHI ? 0 : 1; | |||
2197 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); | |||
2198 | // Branches are assumed to be predicted. | |||
2199 | return 0; | |||
2200 | } | |||
2201 | ||||
2202 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, | |||
2203 | Type *Val, | |||
2204 | unsigned Index, | |||
2205 | bool HasRealUse) { | |||
2206 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
2207 | ||||
2208 | if (Index != -1U) { | |||
2209 | // Legalize the type. | |||
2210 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | |||
2211 | ||||
2212 | // This type is legalized to a scalar type. | |||
2213 | if (!LT.second.isVector()) | |||
2214 | return 0; | |||
2215 | ||||
2216 | // The type may be split. For fixed-width vectors we can normalize the | |||
2217 | // index to the new type. | |||
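| // E.g. (hypothetical type) lane 5 of an <8 x i64> vector, which legalizes | |||
| // to several v2i64 parts, is treated as lane 5 % 2 == 1 of a single part. | |||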
2218 | if (LT.second.isFixedLengthVector()) { | |||
2219 | unsigned Width = LT.second.getVectorNumElements(); | |||
2220 | Index = Index % Width; | |||
2221 | } | |||
2222 | ||||
2223 | // The element at index zero is already inside the vector. | |||
2224 | // - For a physical (HasRealUse==true) insert-element or extract-element | |||
2225 | // instruction that extracts integers, an explicit FPR -> GPR move is | |||
2226 | // needed. So it has non-zero cost. | |||
2227 | // - For the rest of cases (virtual instruction or element type is float), | |||
2228 | // consider the instruction free. | |||
2229 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) | |||
2230 | return 0; | |||
2231 | ||||
2232 | // This is recognising a LD1 single-element structure to one lane of one | |||
2233 | // register instruction. I.e., if this is an `insertelement` instruction, | |||
2234 | // and its second operand is a load, then we will generate a LD1, which | |||
2235 | // is an expensive instruction. | |||
2236 | if (I && isa<LoadInst>(I->getOperand(1))) | |||
2237 | return ST->getVectorInsertExtractBaseCost() + 1; | |||
2238 | ||||
2239 | // FIXME: | |||
2240 | // If the extract-element and insert-element instructions could be | |||
2241 | // simplified away (e.g., could be combined into users by looking at use-def | |||
2242 | // context), they have no cost. This is not done in the first place for | |||
2243 | // compile-time considerations. | |||
2244 | } | |||
2245 | ||||
2246 | // All other insert/extracts cost this much. | |||
2247 | return ST->getVectorInsertExtractBaseCost(); | |||
2248 | } | |||
2249 | ||||
2250 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
2251 | TTI::TargetCostKind CostKind, | |||
2252 | unsigned Index, Value *Op0, | |||
2253 | Value *Op1) { | |||
2254 | return getVectorInstrCostHelper(nullptr, Val, Index, false /* HasRealUse */); | |||
2255 | } | |||
2256 | ||||
2257 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, | |||
2258 | Type *Val, | |||
2259 | TTI::TargetCostKind CostKind, | |||
2260 | unsigned Index) { | |||
2261 | return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */); | |||
2262 | } | |||
2263 | ||||
2264 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |||
2265 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
2266 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, | |||
2267 | ArrayRef<const Value *> Args, | |||
2268 | const Instruction *CxtI) { | |||
2269 | ||||
2270 | // TODO: Handle more cost kinds. | |||
2271 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2272 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
2273 | Op2Info, Args, CxtI); | |||
2274 | ||||
2275 | // Legalize the type. | |||
2276 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
2277 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2278 | ||||
2279 | switch (ISD) { | |||
2280 | default: | |||
2281 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
2282 | Op2Info); | |||
2283 | case ISD::SDIV: | |||
2284 | if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { | |||
2285 | // On AArch64, scalar signed division by a power-of-two constant is | |||
2286 | // normally expanded to the sequence ADD + CMP + SELECT + SRA. | |||
2287 | // The OperandValue properties may not be the same as those of the | |||
2288 | // previous operation; conservatively assume OP_None. | |||
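| // The calls below approximate that expansion by summing the costs of an | |||
| // add, a compare-like subtract, a select and an arithmetic shift right. | |||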
2289 | InstructionCost Cost = getArithmeticInstrCost( | |||
2290 | Instruction::Add, Ty, CostKind, | |||
2291 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2292 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, | |||
2293 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2294 | Cost += getArithmeticInstrCost( | |||
2295 | Instruction::Select, Ty, CostKind, | |||
2296 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2297 | Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
2298 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2299 | return Cost; | |||
2300 | } | |||
2301 | [[fallthrough]]; | |||
2302 | case ISD::UDIV: { | |||
2303 | if (Op2Info.isConstant() && Op2Info.isUniform()) { | |||
2304 | auto VT = TLI->getValueType(DL, Ty); | |||
2305 | if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { | |||
2306 | // Vector signed division by a constant is expanded to the | |||
2307 | // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division | |||
2308 | // to MULHS + SUB + SRL + ADD + SRL. | |||
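| // The return below approximates both sequences as two multiplies, two | |||
| // add/subs and two shifts, plus one for the remaining instruction. | |||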
2309 | InstructionCost MulCost = getArithmeticInstrCost( | |||
2310 | Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2311 | InstructionCost AddCost = getArithmeticInstrCost( | |||
2312 | Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2313 | InstructionCost ShrCost = getArithmeticInstrCost( | |||
2314 | Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
2315 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; | |||
2316 | } | |||
2317 | } | |||
2318 | ||||
2319 | InstructionCost Cost = BaseT::getArithmeticInstrCost( | |||
2320 | Opcode, Ty, CostKind, Op1Info, Op2Info); | |||
2321 | if (Ty->isVectorTy()) { | |||
2322 | if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { | |||
2323 | // If SDIV/UDIV operations are lowered using SVE, then we can use | |||
2324 | // lower costs. | |||
2325 | if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) | |||
2326 | ->getPrimitiveSizeInBits() | |||
2327 | .getFixedValue() < 128) { | |||
2328 | EVT VT = TLI->getValueType(DL, Ty); | |||
2329 | static const CostTblEntry DivTbl[]{ | |||
2330 | {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, | |||
2331 | {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, | |||
2332 | {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, | |||
2333 | {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, | |||
2334 | {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, | |||
2335 | {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; | |||
2336 | ||||
2337 | const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); | |||
2338 | if (nullptr != Entry) | |||
2339 | return Entry->Cost; | |||
2340 | } | |||
2341 | // For 8/16-bit elements, the cost is higher because the type | |||
2342 | // requires promotion and possibly splitting: | |||
2343 | if (LT.second.getScalarType() == MVT::i8) | |||
2344 | Cost *= 8; | |||
2345 | else if (LT.second.getScalarType() == MVT::i16) | |||
2346 | Cost *= 4; | |||
2347 | return Cost; | |||
2348 | } else { | |||
2349 | // If one of the operands is a uniform constant then the cost for each | |||
2350 | // element is the cost of insertion, extraction and division: | |||
2351 | // insertion cost = 2, extraction cost = 2, and division = the cost of | |||
2352 | // the operation on the scalar type. | |||
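// For example, a <4 x i32> division by a uniform constant with a scalar
// division cost of 1 works out to (4 + 1) * 4 = 20 below.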
2353 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || | |||
2354 | (Op2Info.isConstant() && Op2Info.isUniform())) { | |||
2355 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { | |||
2356 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( | |||
2357 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info); | |||
2358 | return (4 + DivCost) * VTy->getNumElements(); | |||
2359 | } | |||
2360 | } | |||
2361 | // On AArch64, without SVE, vector divisions are expanded | |||
2362 | // into scalar divisions of each pair of elements. | |||
2363 | Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, | |||
2364 | CostKind, Op1Info, Op2Info); | |||
2365 | Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, | |||
2366 | Op1Info, Op2Info); | |||
2367 | } | |||
2368 | ||||
2369 | // TODO: if one of the arguments is scalar, then it's not necessary to | |||
2370 | // double the cost of handling the vector elements. | |||
2371 | Cost += Cost; | |||
2372 | } | |||
2373 | return Cost; | |||
2374 | } | |||
2375 | case ISD::MUL: | |||
2376 | // When SVE is available, we can lower the v2i64 operation using | |||
2377 | // the SVE mul instruction, which has a lower cost. | |||
2378 | if (LT.second == MVT::v2i64 && ST->hasSVE()) | |||
2379 | return LT.first; | |||
2380 | ||||
2381 | // When SVE is not available, there is no MUL.2d instruction, | |||
2382 | // which means mul <2 x i64> is expensive as elements are extracted | |||
2383 | // from the vectors and the muls scalarized. | |||
2384 | // As getScalarizationOverhead is a bit too pessimistic, we | |||
2385 | // estimate the cost for a i64 vector directly here, which is: | |||
2386 | // - four 2-cost i64 extracts, | |||
2387 | // - two 2-cost i64 inserts, and | |||
2388 | // - two 1-cost muls. | |||
2389 | // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with | |||
2390 | // LT.first = 2 the cost is 28. If both operands are extensions it will | |||
2391 | // not need to scalarize, so the cost can be cheaper (smull or umull). | |||
2393 | if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) | |||
2394 | return LT.first; | |||
2395 | return LT.first * 14; | |||
2396 | case ISD::ADD: | |||
2397 | case ISD::XOR: | |||
2398 | case ISD::OR: | |||
2399 | case ISD::AND: | |||
2400 | case ISD::SRL: | |||
2401 | case ISD::SRA: | |||
2402 | case ISD::SHL: | |||
2403 | // These nodes are marked as 'custom' for combining purposes only. | |||
2404 | // We know that they are legal. See LowerAdd in ISelLowering. | |||
2405 | return LT.first; | |||
2406 | ||||
2407 | case ISD::FADD: | |||
2408 | case ISD::FSUB: | |||
2409 | case ISD::FMUL: | |||
2410 | case ISD::FDIV: | |||
2411 | case ISD::FNEG: | |||
2412 | // These nodes are marked as 'custom' just to lower them to SVE. | |||
2413 | // We know said lowering will incur no additional cost. | |||
2414 | if (!Ty->getScalarType()->isFP128Ty()) | |||
2415 | return 2 * LT.first; | |||
2416 | ||||
2417 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
2418 | Op2Info); | |||
2419 | } | |||
2420 | } | |||
2421 | ||||
2422 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, | |||
2423 | ScalarEvolution *SE, | |||
2424 | const SCEV *Ptr) { | |||
2425 | // Address computations in vectorized code with non-consecutive addresses will | |||
2426 | // likely result in more instructions compared to scalar code where the | |||
2427 | // computation can more often be merged into the index mode. The resulting | |||
2428 | // extra micro-ops can significantly decrease throughput. | |||
2429 | unsigned NumVectorInstToHideOverhead = 10; | |||
2430 | int MaxMergeDistance = 64; | |||
2431 | ||||
2432 | if (Ty->isVectorTy() && SE && | |||
2433 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | |||
2434 | return NumVectorInstToHideOverhead; | |||
2435 | ||||
2436 | // In many cases the address computation is not merged into the instruction | |||
2437 | // addressing mode. | |||
2438 | return 1; | |||
2439 | } | |||
2440 | ||||
2441 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
2442 | Type *CondTy, | |||
2443 | CmpInst::Predicate VecPred, | |||
2444 | TTI::TargetCostKind CostKind, | |||
2445 | const Instruction *I) { | |||
2446 | // TODO: Handle other cost kinds. | |||
2447 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2448 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
2449 | I); | |||
2450 | ||||
2451 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2452 | // We don't lower some vector selects that are wider than the register | |||
2453 | // width very well. | |||
2454 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { | |||
2455 | // We would need this many instructions to hide the scalarization happening. | |||
2456 | const int AmortizationCost = 20; | |||
2457 | ||||
2458 | // If VecPred is not set, check if we can get a predicate from the context | |||
2459 | // instruction, if its type matches the requested ValTy. | |||
2460 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { | |||
2461 | CmpInst::Predicate CurrentPred; | |||
2462 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), | |||
2463 | m_Value()))) | |||
2464 | VecPred = CurrentPred; | |||
2465 | } | |||
2466 | // Check if we have a compare/select chain that can be lowered using | |||
2467 | // a (F)CMxx & BFI pair. | |||
2468 | if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || | |||
2469 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || | |||
2470 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || | |||
2471 | VecPred == CmpInst::FCMP_UNE) { | |||
2472 | static const auto ValidMinMaxTys = { | |||
2473 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, | |||
2474 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; | |||
2475 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; | |||
2476 | ||||
2477 | auto LT = getTypeLegalizationCost(ValTy); | |||
2478 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) || | |||
2479 | (ST->hasFullFP16() && | |||
2480 | any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; }))) | |||
2481 | return LT.first; | |||
2482 | } | |||
2483 | ||||
2484 | static const TypeConversionCostTblEntry | |||
2485 | VectorSelectTbl[] = { | |||
2486 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, | |||
2487 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, | |||
2488 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, | |||
2489 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, | |||
2490 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, | |||
2491 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } | |||
2492 | }; | |||
2493 | ||||
2494 | EVT SelCondTy = TLI->getValueType(DL, CondTy); | |||
2495 | EVT SelValTy = TLI->getValueType(DL, ValTy); | |||
2496 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { | |||
2497 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, | |||
2498 | SelCondTy.getSimpleVT(), | |||
2499 | SelValTy.getSimpleVT())) | |||
2500 | return Entry->Cost; | |||
2501 | } | |||
2502 | } | |||
2503 | // The base case handles scalable vectors fine for now, since it treats the | |||
2504 | // cost as 1 * legalization cost. | |||
2505 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
2506 | } | |||
2507 | ||||
2508 | AArch64TTIImpl::TTI::MemCmpExpansionOptions | |||
2509 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
2510 | TTI::MemCmpExpansionOptions Options; | |||
2511 | if (ST->requiresStrictAlign()) { | |||
2512 | // TODO: Add cost modeling for strict align. Misaligned loads expand to | |||
2513 | // a bunch of instructions when strict align is enabled. | |||
2514 | return Options; | |||
2515 | } | |||
2516 | Options.AllowOverlappingLoads = true; | |||
2517 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
2518 | Options.NumLoadsPerBlock = Options.MaxNumLoads; | |||
2519 | // TODO: Though vector loads usually perform well on AArch64, in some targets | |||
2520 | // they may wake up the FP unit, which raises the power consumption. Perhaps | |||
2521 | // they could be used with no holds barred (-O3). | |||
2522 | Options.LoadSizes = {8, 4, 2, 1}; | |||
2523 | return Options; | |||
2524 | } | |||
2525 | ||||
2526 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { | |||
2527 | return ST->hasSVE(); | |||
2528 | } | |||
2529 | ||||
2530 | InstructionCost | |||
2531 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, | |||
2532 | Align Alignment, unsigned AddressSpace, | |||
2533 | TTI::TargetCostKind CostKind) { | |||
2534 | if (useNeonVector(Src)) | |||
2535 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
2536 | CostKind); | |||
2537 | auto LT = getTypeLegalizationCost(Src); | |||
2538 | if (!LT.first.isValid()) | |||
2539 | return InstructionCost::getInvalid(); | |||
2540 | ||||
2541 | // The code-generator is currently not able to handle scalable vectors | |||
2542 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
2543 | // it. This change will be removed when code-generation for these types is | |||
2544 | // sufficiently reliable. | |||
2545 | if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) | |||
2546 | return InstructionCost::getInvalid(); | |||
2547 | ||||
2548 | return LT.first; | |||
2549 | } | |||
2550 | ||||
2551 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { | |||
2552 | return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; | |||
2553 | } | |||
2554 | ||||
2555 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( | |||
2556 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, | |||
2557 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { | |||
2558 | if (useNeonVector(DataTy)) | |||
2559 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, | |||
2560 | Alignment, CostKind, I); | |||
2561 | auto *VT = cast<VectorType>(DataTy); | |||
2562 | auto LT = getTypeLegalizationCost(DataTy); | |||
2563 | if (!LT.first.isValid()) | |||
2564 | return InstructionCost::getInvalid(); | |||
2565 | ||||
2566 | // The code-generator is currently not able to handle scalable vectors | |||
2567 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
2568 | // it. This change will be removed when code-generation for these types is | |||
2569 | // sufficiently reliable. | |||
2570 | if (cast<VectorType>(DataTy)->getElementCount() == | |||
2571 | ElementCount::getScalable(1)) | |||
2572 | return InstructionCost::getInvalid(); | |||
2573 | ||||
2574 | ElementCount LegalVF = LT.second.getVectorElementCount(); | |||
2575 | InstructionCost MemOpCost = | |||
2576 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, | |||
2577 | {TTI::OK_AnyValue, TTI::OP_None}, I); | |||
2578 | // Add on an overhead cost for using gathers/scatters. | |||
2579 | // TODO: At the moment this is applied unilaterally for all CPUs, but at some | |||
2580 | // point we may want a per-CPU overhead. | |||
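// The resulting cost below is per element: the scalar memory-op cost,
// scaled by the gather/scatter overhead and by the maximum number of
// elements the legalized vector may hold, times the legalization factor.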
2581 | MemOpCost *= getSVEGatherScatterOverhead(Opcode); | |||
2582 | return LT.first * MemOpCost * getMaxNumElements(LegalVF); | |||
2583 | } | |||
2584 | ||||
2585 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { | |||
2586 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); | |||
2587 | } | |||
2588 | ||||
2589 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, | |||
2590 | MaybeAlign Alignment, | |||
2591 | unsigned AddressSpace, | |||
2592 | TTI::TargetCostKind CostKind, | |||
2593 | TTI::OperandValueInfo OpInfo, | |||
2594 | const Instruction *I) { | |||
2595 | EVT VT = TLI->getValueType(DL, Ty, true); | |||
2596 | // Type legalization can't handle structs | |||
2597 | if (VT == MVT::Other) | |||
2598 | return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, | |||
2599 | CostKind); | |||
2600 | ||||
2601 | auto LT = getTypeLegalizationCost(Ty); | |||
2602 | if (!LT.first.isValid()) | |||
2603 | return InstructionCost::getInvalid(); | |||
2604 | ||||
2605 | // The code-generator is currently not able to handle scalable vectors | |||
2606 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
2607 | // it. This change will be removed when code-generation for these types is | |||
2608 | // sufficiently reliable. | |||
2609 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) | |||
2610 | if (VTy->getElementCount() == ElementCount::getScalable(1)) | |||
2611 | return InstructionCost::getInvalid(); | |||
2612 | ||||
2613 | // TODO: consider latency as well for TCK_SizeAndLatency. | |||
2614 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) | |||
2615 | return LT.first; | |||
2616 | ||||
2617 | if (CostKind != TTI::TCK_RecipThroughput) | |||
2618 | return 1; | |||
2619 | ||||
2620 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && | |||
2621 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { | |||
2622 | // Unaligned stores are extremely inefficient. We don't split all | |||
2623 | // unaligned 128-bit stores because of the negative impact that has shown | |||
2624 | // in practice on inlined block copy code. | |||
2625 | // We make such stores expensive so that we will only vectorize if there | |||
2626 | // are 6 other instructions getting vectorized. | |||
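// With AmortizationCost = 6, each legalized unaligned 128-bit store below
// is costed at 2 * 6 = 12.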
2627 | const int AmortizationCost = 6; | |||
2628 | ||||
2629 | return LT.first * 2 * AmortizationCost; | |||
2630 | } | |||
2631 | ||||
2632 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. | |||
2633 | if (Ty->isPtrOrPtrVectorTy()) | |||
2634 | return LT.first; | |||
2635 | ||||
2636 | // Check truncating stores and extending loads. | |||
2637 | if (useNeonVector(Ty) && | |||
2638 | Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { | |||
2639 | // v4i8 types are lowered to a scalar load/store and sshll/xtn. | |||
2640 | if (VT == MVT::v4i8) | |||
2641 | return 2; | |||
2642 | // Otherwise we need to scalarize. | |||
2643 | return cast<FixedVectorType>(Ty)->getNumElements() * 2; | |||
2644 | } | |||
2645 | ||||
2646 | return LT.first; | |||
2647 | } | |||
2648 | ||||
2649 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( | |||
2650 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
2651 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
2652 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
2653 | assert(Factor >= 2 && "Invalid interleave factor"); | |||
2654 | auto *VecVTy = cast<FixedVectorType>(VecTy); | |||
2655 | ||||
2656 | if (!UseMaskForCond && !UseMaskForGaps && | |||
2657 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { | |||
2658 | unsigned NumElts = VecVTy->getNumElements(); | |||
2659 | auto *SubVecTy = | |||
2660 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | |||
2661 | ||||
2662 | // ldN/stN only support legal vector types of size 64 or 128 in bits. | |||
2663 | // Accesses having vector types that are a multiple of 128 bits can be | |||
2664 | // matched to more than one ldN/stN instruction. | |||
2665 | bool UseScalable; | |||
2666 | if (NumElts % Factor == 0 && | |||
2667 | TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) | |||
2668 | return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); | |||
2669 | } | |||
2670 | ||||
2671 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
2672 | Alignment, AddressSpace, CostKind, | |||
2673 | UseMaskForCond, UseMaskForGaps); | |||
2674 | } | |||
2675 | ||||
2676 | InstructionCost | |||
2677 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { | |||
2678 | InstructionCost Cost = 0; | |||
2679 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
2680 | for (auto *I : Tys) { | |||
2681 | if (!I->isVectorTy()) | |||
2682 | continue; | |||
2683 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == | |||
2684 | 128) | |||
2685 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + | |||
2686 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); | |||
2687 | } | |||
2688 | return Cost; | |||
2689 | } | |||
2690 | ||||
2691 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { | |||
2692 | return ST->getMaxInterleaveFactor(); | |||
2693 | } | |||
2694 | ||||
2695 | // For Falkor, we want to avoid having too many strided loads in a loop since | |||
2696 | // that can exhaust the HW prefetcher resources. We adjust the unroller | |||
2697 | // MaxCount preference below to attempt to ensure unrolling doesn't create too | |||
2698 | // many strided loads. | |||
2699 | static void | |||
2700 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
2701 | TargetTransformInfo::UnrollingPreferences &UP) { | |||
2702 | enum { MaxStridedLoads = 7 }; | |||
2703 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { | |||
2704 | int StridedLoads = 0; | |||
2705 | // FIXME? We could make this more precise by looking at the CFG and | |||
2706 | // e.g. not counting loads in each side of an if-then-else diamond. | |||
2707 | for (const auto BB : L->blocks()) { | |||
2708 | for (auto &I : *BB) { | |||
2709 | LoadInst *LMemI = dyn_cast<LoadInst>(&I); | |||
2710 | if (!LMemI) | |||
2711 | continue; | |||
2712 | ||||
2713 | Value *PtrValue = LMemI->getPointerOperand(); | |||
2714 | if (L->isLoopInvariant(PtrValue)) | |||
2715 | continue; | |||
2716 | ||||
2717 | const SCEV *LSCEV = SE.getSCEV(PtrValue); | |||
2718 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); | |||
2719 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) | |||
2720 | continue; | |||
2721 | ||||
2722 | // FIXME? We could take pairing of unrolled load copies into account | |||
2723 | // by looking at the AddRec, but we would probably have to limit this | |||
2724 | // to loops with no stores or other memory optimization barriers. | |||
2725 | ++StridedLoads; | |||
2726 | // We've seen enough strided loads that seeing more won't make a | |||
2727 | // difference. | |||
2728 | if (StridedLoads > MaxStridedLoads / 2) | |||
2729 | return StridedLoads; | |||
2730 | } | |||
2731 | } | |||
2732 | return StridedLoads; | |||
2733 | }; | |||
2734 | ||||
2735 | int StridedLoads = countStridedLoads(L, SE); | |||
2736 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads | |||
2737 | << " strided loads\n"); | |||
2738 | // Pick the largest power of 2 unroll count that won't result in too many | |||
2739 | // strided loads. | |||
2740 | if (StridedLoads) { | |||
2741 | UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); | |||
| ||||
2742 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " | |||
2743 | << UP.MaxCount << '\n'); | |||
2744 | } | |||
2745 | } | |||
2746 | ||||
2747 | void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
2748 | TTI::UnrollingPreferences &UP, | |||
2749 | OptimizationRemarkEmitter *ORE) { | |||
2750 | // Enable partial unrolling and runtime unrolling. | |||
2751 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); | |||
2752 | ||||
2753 | UP.UpperBound = true; | |||
2754 | ||||
2755 | // An inner loop is more likely to be hot, and the runtime check can be | |||
2756 | // hoisted out by the LICM pass, so the overhead is lower; try a larger | |||
2757 | // threshold to unroll more loops. | |||
2758 | if (L->getLoopDepth() > 1) | |||
| ||||
2759 | UP.PartialThreshold *= 2; | |||
2760 | ||||
2761 | // Disable partial & runtime unrolling on -Os. | |||
2762 | UP.PartialOptSizeThreshold = 0; | |||
2763 | ||||
2764 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && | |||
2765 | EnableFalkorHWPFUnrollFix) | |||
2766 | getFalkorUnrollingPreferences(L, SE, UP); | |||
2767 | ||||
2768 | // Scan the loop: don't unroll loops with calls as this could prevent | |||
2769 | // inlining. Don't unroll vector loops either, as they don't benefit much from | |||
2770 | // unrolling. | |||
2771 | for (auto *BB : L->getBlocks()) { | |||
2772 | for (auto &I : *BB) { | |||
2773 | // Don't unroll vectorised loops. | |||
2774 | if (I.getType()->isVectorTy()) | |||
2775 | return; | |||
2776 | ||||
2777 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | |||
2778 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | |||
2779 | if (!isLoweredToCall(F)) | |||
2780 | continue; | |||
2781 | } | |||
2782 | return; | |||
2783 | } | |||
2784 | } | |||
2785 | } | |||
2786 | ||||
2787 | // Enable runtime unrolling for in-order models. | |||
2788 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so | |||
2789 | // by checking for that case, we can ensure that the default behaviour is | |||
2790 | // unchanged. | |||
2791 | if (ST->getProcFamily() != AArch64Subtarget::Others && | |||
2792 | !ST->getSchedModel().isOutOfOrder()) { | |||
2793 | UP.Runtime = true; | |||
2794 | UP.Partial = true; | |||
2795 | UP.UnrollRemainder = true; | |||
2796 | UP.DefaultUnrollRuntimeCount = 4; | |||
2797 | ||||
2798 | UP.UnrollAndJam = true; | |||
2799 | UP.UnrollAndJamInnerLoopThreshold = 60; | |||
2800 | } | |||
2801 | } | |||
2802 | ||||
2803 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, | |||
2804 | TTI::PeelingPreferences &PP) { | |||
2805 | BaseT::getPeelingPreferences(L, SE, PP); | |||
2806 | } | |||
2807 | ||||
2808 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, | |||
2809 | Type *ExpectedType) { | |||
2810 | switch (Inst->getIntrinsicID()) { | |||
2811 | default: | |||
2812 | return nullptr; | |||
2813 | case Intrinsic::aarch64_neon_st2: | |||
2814 | case Intrinsic::aarch64_neon_st3: | |||
2815 | case Intrinsic::aarch64_neon_st4: { | |||
2816 | // Create a struct type | |||
2817 | StructType *ST = dyn_cast<StructType>(ExpectedType); | |||
2818 | if (!ST) | |||
2819 | return nullptr; | |||
2820 | unsigned NumElts = Inst->arg_size() - 1; | |||
2821 | if (ST->getNumElements() != NumElts) | |||
2822 | return nullptr; | |||
2823 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
2824 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) | |||
2825 | return nullptr; | |||
2826 | } | |||
2827 | Value *Res = PoisonValue::get(ExpectedType); | |||
2828 | IRBuilder<> Builder(Inst); | |||
2829 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
2830 | Value *L = Inst->getArgOperand(i); | |||
2831 | Res = Builder.CreateInsertValue(Res, L, i); | |||
2832 | } | |||
2833 | return Res; | |||
2834 | } | |||
2835 | case Intrinsic::aarch64_neon_ld2: | |||
2836 | case Intrinsic::aarch64_neon_ld3: | |||
2837 | case Intrinsic::aarch64_neon_ld4: | |||
2838 | if (Inst->getType() == ExpectedType) | |||
2839 | return Inst; | |||
2840 | return nullptr; | |||
2841 | } | |||
2842 | } | |||
2843 | ||||
2844 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, | |||
2845 | MemIntrinsicInfo &Info) { | |||
2846 | switch (Inst->getIntrinsicID()) { | |||
2847 | default: | |||
2848 | break; | |||
2849 | case Intrinsic::aarch64_neon_ld2: | |||
2850 | case Intrinsic::aarch64_neon_ld3: | |||
2851 | case Intrinsic::aarch64_neon_ld4: | |||
2852 | Info.ReadMem = true; | |||
2853 | Info.WriteMem = false; | |||
2854 | Info.PtrVal = Inst->getArgOperand(0); | |||
2855 | break; | |||
2856 | case Intrinsic::aarch64_neon_st2: | |||
2857 | case Intrinsic::aarch64_neon_st3: | |||
2858 | case Intrinsic::aarch64_neon_st4: | |||
2859 | Info.ReadMem = false; | |||
2860 | Info.WriteMem = true; | |||
2861 | Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); | |||
2862 | break; | |||
2863 | } | |||
2864 | ||||
2865 | switch (Inst->getIntrinsicID()) { | |||
2866 | default: | |||
2867 | return false; | |||
2868 | case Intrinsic::aarch64_neon_ld2: | |||
2869 | case Intrinsic::aarch64_neon_st2: | |||
2870 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; | |||
2871 | break; | |||
2872 | case Intrinsic::aarch64_neon_ld3: | |||
2873 | case Intrinsic::aarch64_neon_st3: | |||
2874 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; | |||
2875 | break; | |||
2876 | case Intrinsic::aarch64_neon_ld4: | |||
2877 | case Intrinsic::aarch64_neon_st4: | |||
2878 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; | |||
2879 | break; | |||
2880 | } | |||
2881 | return true; | |||
2882 | } | |||
2883 | ||||
2884 | /// See if \p I should be considered for address type promotion. We check if | |||
2885 | /// \p I is a sext with the right type that is used in memory accesses. If it | |||
2886 | /// is used in a "complex" getelementptr, we allow it to be promoted without | |||
2887 | /// finding other sext instructions that sign-extended the same initial value. | |||
2888 | /// A getelementptr is considered "complex" if it has more than 2 operands. | |||
2889 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( | |||
2890 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { | |||
2891 | bool Considerable = false; | |||
2892 | AllowPromotionWithoutCommonHeader = false; | |||
2893 | if (!isa<SExtInst>(&I)) | |||
2894 | return false; | |||
2895 | Type *ConsideredSExtType = | |||
2896 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); | |||
2897 | if (I.getType() != ConsideredSExtType) | |||
2898 | return false; | |||
2899 | // See if the sext is the one with the right type and used in at least one | |||
2900 | // GetElementPtrInst. | |||
2901 | for (const User *U : I.users()) { | |||
2902 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { | |||
2903 | Considerable = true; | |||
2904 | // A getelementptr is considered "complex" if it has more than 2 | |||
2905 | // operands. We will promote a SExt used in such a complex GEP, as we | |||
2906 | // expect some computation to be merged if it is done on 64 bits. | |||
2907 | if (GEPInst->getNumOperands() > 2) { | |||
2908 | AllowPromotionWithoutCommonHeader = true; | |||
2909 | break; | |||
2910 | } | |||
2911 | } | |||
2912 | } | |||
2913 | return Considerable; | |||
2914 | } | |||
2915 | ||||
2916 | bool AArch64TTIImpl::isLegalToVectorizeReduction( | |||
2917 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { | |||
2918 | if (!VF.isScalable()) | |||
2919 | return true; | |||
2920 | ||||
2921 | Type *Ty = RdxDesc.getRecurrenceType(); | |||
2922 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) | |||
2923 | return false; | |||
2924 | ||||
2925 | switch (RdxDesc.getRecurrenceKind()) { | |||
2926 | case RecurKind::Add: | |||
2927 | case RecurKind::FAdd: | |||
2928 | case RecurKind::And: | |||
2929 | case RecurKind::Or: | |||
2930 | case RecurKind::Xor: | |||
2931 | case RecurKind::SMin: | |||
2932 | case RecurKind::SMax: | |||
2933 | case RecurKind::UMin: | |||
2934 | case RecurKind::UMax: | |||
2935 | case RecurKind::FMin: | |||
2936 | case RecurKind::FMax: | |||
2937 | case RecurKind::SelectICmp: | |||
2938 | case RecurKind::SelectFCmp: | |||
2939 | case RecurKind::FMulAdd: | |||
2940 | return true; | |||
2941 | default: | |||
2942 | return false; | |||
2943 | } | |||
2944 | } | |||
2945 | ||||
2946 | InstructionCost | |||
2947 | AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | |||
2948 | bool IsUnsigned, | |||
2949 | TTI::TargetCostKind CostKind) { | |||
2950 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
2951 | ||||
2952 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) | |||
2953 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); | |||
2954 | ||||
2955 | assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && | |||
2956 | "Both vector needs to be equally scalable"); | |||
2957 | ||||
2958 | InstructionCost LegalizationCost = 0; | |||
2959 | if (LT.first > 1) { | |||
2960 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); | |||
2961 | unsigned MinMaxOpcode = | |||
2962 | Ty->isFPOrFPVectorTy() | |||
2963 | ? Intrinsic::maxnum | |||
2964 | : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); | |||
2965 | IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); | |||
2966 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); | |||
2967 | } | |||
2968 | ||||
2969 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; | |||
2970 | } | |||
2971 | ||||
2972 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( | |||
2973 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { | |||
2974 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
2975 | InstructionCost LegalizationCost = 0; | |||
2976 | if (LT.first > 1) { | |||
2977 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); | |||
2978 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); | |||
2979 | LegalizationCost *= LT.first - 1; | |||
2980 | } | |||
2981 | ||||
2982 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
2983 | assert(ISD && "Invalid opcode"); | |||
2984 | // Add the final reduction cost for the legal horizontal reduction | |||
2985 | switch (ISD) { | |||
2986 | case ISD::ADD: | |||
2987 | case ISD::AND: | |||
2988 | case ISD::OR: | |||
2989 | case ISD::XOR: | |||
2990 | case ISD::FADD: | |||
2991 | return LegalizationCost + 2; | |||
2992 | default: | |||
2993 | return InstructionCost::getInvalid(); | |||
2994 | } | |||
2995 | } | |||
2996 | ||||
2997 | InstructionCost | |||
2998 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
2999 | std::optional<FastMathFlags> FMF, | |||
3000 | TTI::TargetCostKind CostKind) { | |||
3001 | if (TTI::requiresOrderedReduction(FMF)) { | |||
3002 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { | |||
3003 | InstructionCost BaseCost = | |||
3004 | BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
3005 | // Add on extra cost to reflect the extra overhead on some CPUs. We still | |||
3006 | // end up vectorizing for more computationally intensive loops. | |||
3007 | return BaseCost + FixedVTy->getNumElements(); | |||
3008 | } | |||
3009 | ||||
3010 | if (Opcode != Instruction::FAdd) | |||
3011 | return InstructionCost::getInvalid(); | |||
3012 | ||||
3013 | auto *VTy = cast<ScalableVectorType>(ValTy); | |||
3014 | InstructionCost Cost = | |||
3015 | getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); | |||
3016 | Cost *= getMaxNumElements(VTy->getElementCount()); | |||
3017 | return Cost; | |||
3018 | } | |||
3019 | ||||
3020 | if (isa<ScalableVectorType>(ValTy)) | |||
3021 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); | |||
3022 | ||||
3023 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
3024 | MVT MTy = LT.second; | |||
3025 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
3026 | assert(ISD && "Invalid opcode"); | |||
3027 | ||||
3028 | // Horizontal adds can use the 'addv' instruction. We model the cost of these | |||
3029 | // instructions as twice a normal vector add, plus 1 for each legalization | |||
3030 | // step (LT.first). This is the only arithmetic vector reduction operation for | |||
3031 | // which we have an instruction. | |||
3032 | // OR, XOR and AND costs should match the codegen from: | |||
3033 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll | |||
3034 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll | |||
3035 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll | |||
3036 | static const CostTblEntry CostTblNoPairwise[]{ | |||
3037 | {ISD::ADD, MVT::v8i8, 2}, | |||
3038 | {ISD::ADD, MVT::v16i8, 2}, | |||
3039 | {ISD::ADD, MVT::v4i16, 2}, | |||
3040 | {ISD::ADD, MVT::v8i16, 2}, | |||
3041 | {ISD::ADD, MVT::v4i32, 2}, | |||
3042 | {ISD::ADD, MVT::v2i64, 2}, | |||
3043 | {ISD::OR, MVT::v8i8, 15}, | |||
3044 | {ISD::OR, MVT::v16i8, 17}, | |||
3045 | {ISD::OR, MVT::v4i16, 7}, | |||
3046 | {ISD::OR, MVT::v8i16, 9}, | |||
3047 | {ISD::OR, MVT::v2i32, 3}, | |||
3048 | {ISD::OR, MVT::v4i32, 5}, | |||
3049 | {ISD::OR, MVT::v2i64, 3}, | |||
3050 | {ISD::XOR, MVT::v8i8, 15}, | |||
3051 | {ISD::XOR, MVT::v16i8, 17}, | |||
3052 | {ISD::XOR, MVT::v4i16, 7}, | |||
3053 | {ISD::XOR, MVT::v8i16, 9}, | |||
3054 | {ISD::XOR, MVT::v2i32, 3}, | |||
3055 | {ISD::XOR, MVT::v4i32, 5}, | |||
3056 | {ISD::XOR, MVT::v2i64, 3}, | |||
3057 | {ISD::AND, MVT::v8i8, 15}, | |||
3058 | {ISD::AND, MVT::v16i8, 17}, | |||
3059 | {ISD::AND, MVT::v4i16, 7}, | |||
3060 | {ISD::AND, MVT::v8i16, 9}, | |||
3061 | {ISD::AND, MVT::v2i32, 3}, | |||
3062 | {ISD::AND, MVT::v4i32, 5}, | |||
3063 | {ISD::AND, MVT::v2i64, 3}, | |||
3064 | }; | |||
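// For example, an add reduction of v16i8 is costed at 2 from the table
// above; if the type needs one split (LT.first == 2, e.g. v32i8), the ADD
// case below adds LT.first - 1 = 1, giving 3.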
3065 | switch (ISD) { | |||
3066 | default: | |||
3067 | break; | |||
3068 | case ISD::ADD: | |||
3069 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) | |||
3070 | return (LT.first - 1) + Entry->Cost; | |||
3071 | break; | |||
3072 | case ISD::XOR: | |||
3073 | case ISD::AND: | |||
3074 | case ISD::OR: | |||
3075 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); | |||
3076 | if (!Entry) | |||
3077 | break; | |||
3078 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
3079 | if (!ValVTy->getElementType()->isIntegerTy(1) && | |||
3080 | MTy.getVectorNumElements() <= ValVTy->getNumElements() && | |||
3081 | isPowerOf2_32(ValVTy->getNumElements())) { | |||
3082 | InstructionCost ExtraCost = 0; | |||
3083 | if (LT.first != 1) { | |||
3084 | // Type needs to be split, so there is an extra cost of LT.first - 1 | |||
3085 | // arithmetic ops. | |||
3086 | auto *Ty = FixedVectorType::get(ValTy->getElementType(), | |||
3087 | MTy.getVectorNumElements()); | |||
3088 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
3089 | ExtraCost *= LT.first - 1; | |||
3090 | } | |||
3091 | return Entry->Cost + ExtraCost; | |||
3092 | } | |||
3093 | break; | |||
3094 | } | |||
3095 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
3096 | } | |||
3097 | ||||
3098 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { | |||
3099 | static const CostTblEntry ShuffleTbl[] = { | |||
3100 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, | |||
3101 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, | |||
3102 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, | |||
3103 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, | |||
3104 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, | |||
3105 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, | |||
3106 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, | |||
3107 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, | |||
3108 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, | |||
3109 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, | |||
3110 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, | |||
3111 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, | |||
3112 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, | |||
3113 | }; | |||
3114 | ||||
3115 | // The code-generator is currently not able to handle scalable vectors | |||
3116 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
3117 | // it. This change will be removed when code-generation for these types is | |||
3118 | // sufficiently reliable. | |||
3119 | if (Tp->getElementCount() == ElementCount::getScalable(1)) | |||
3120 | return InstructionCost::getInvalid(); | |||
3121 | ||||
3122 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); | |||
3123 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); | |||
3124 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
3125 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 | |||
3126 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) | |||
3127 | : LT.second; | |||
3128 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); | |||
3129 | InstructionCost LegalizationCost = 0; | |||
3130 | if (Index < 0) { | |||
3131 | LegalizationCost = | |||
3132 | getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, | |||
3133 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | |||
3134 | getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, | |||
3135 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
3136 | } | |||
3137 | ||||
3138 | // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp. | |||
3139 | // The cost is computed on the promoted type. | |||
3140 | if (LT.second.getScalarType() == MVT::i1) { | |||
3141 | LegalizationCost += | |||
3142 | getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, | |||
3143 | TTI::CastContextHint::None, CostKind) + | |||
3144 | getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, | |||
3145 | TTI::CastContextHint::None, CostKind); | |||
3146 | } | |||
3147 | const auto *Entry = | |||
3148 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); | |||
3149 | assert(Entry && "Illegal Type for Splice"); | |||
3150 | LegalizationCost += Entry->Cost; | |||
3151 | return LegalizationCost * LT.first; | |||
3152 | } | |||
3153 | ||||
3154 | InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
3155 | VectorType *Tp, | |||
3156 | ArrayRef<int> Mask, | |||
3157 | TTI::TargetCostKind CostKind, | |||
3158 | int Index, VectorType *SubTp, | |||
3159 | ArrayRef<const Value *> Args) { | |||
3160 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); | |||
3161 | // If we have a Mask, and the LT is being legalized somehow, split the Mask | |||
3162 | // into smaller vectors and sum the cost of each shuffle. | |||
3163 | if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && | |||
3164 | Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && | |||
3165 | cast<FixedVectorType>(Tp)->getNumElements() > | |||
3166 | LT.second.getVectorNumElements() && | |||
3167 | !Index && !SubTp) { | |||
3168 | unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); | |||
3169 | assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); | |||
3170 | unsigned LTNumElts = LT.second.getVectorNumElements(); | |||
3171 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; | |||
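// For example, a v32i8 shuffle legalized to v16i8 gives LTNumElts = 16 and
// NumVecs = 2, so the mask is processed in two 16-element chunks below.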
3172 | VectorType *NTp = | |||
3173 | VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); | |||
3174 | InstructionCost Cost; | |||
3175 | for (unsigned N = 0; N < NumVecs; N++) { | |||
3176 | SmallVector<int> NMask; | |||
3177 | // Split the existing mask into chunks of size LTNumElts. Track the source | |||
3178 | // sub-vectors to ensure the result has at most 2 inputs. | |||
3179 | unsigned Source1, Source2; | |||
3180 | unsigned NumSources = 0; | |||
3181 | for (unsigned E = 0; E < LTNumElts; E++) { | |||
3182 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] | |||
3183 | : UndefMaskElem; | |||
3184 | if (MaskElt < 0) { | |||
3185 | NMask.push_back(UndefMaskElem); | |||
3186 | continue; | |||
3187 | } | |||
3188 | ||||
3189 | // Calculate which source from the input this comes from and whether it | |||
3190 | // is new to us. | |||
3191 | unsigned Source = MaskElt / LTNumElts; | |||
3192 | if (NumSources == 0) { | |||
3193 | Source1 = Source; | |||
3194 | NumSources = 1; | |||
3195 | } else if (NumSources == 1 && Source != Source1) { | |||
3196 | Source2 = Source; | |||
3197 | NumSources = 2; | |||
3198 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { | |||
3199 | NumSources++; | |||
3200 | } | |||
3201 | ||||
3202 | // Add to the new mask. For the NumSources>2 case these are not correct, | |||
3203 | // but are only used for the modular lane number. | |||
3204 | if (Source == Source1) | |||
3205 | NMask.push_back(MaskElt % LTNumElts); | |||
3206 | else if (Source == Source2) | |||
3207 | NMask.push_back(MaskElt % LTNumElts + LTNumElts); | |||
3208 | else | |||
3209 | NMask.push_back(MaskElt % LTNumElts); | |||
3210 | } | |||
3211 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using | |||
3212 | // getShuffleCost. If not then cost it using the worst case. | |||
3213 | if (NumSources <= 2) | |||
3214 | Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc | |||
3215 | : TTI::SK_PermuteTwoSrc, | |||
3216 | NTp, NMask, CostKind, 0, nullptr, Args); | |||
3217 | else if (any_of(enumerate(NMask), [&](const auto &ME) { | |||
3218 | return ME.value() % LTNumElts == ME.index(); | |||
3219 | })) | |||
3220 | Cost += LTNumElts - 1; | |||
3221 | else | |||
3222 | Cost += LTNumElts; | |||
3223 | } | |||
3224 | return Cost; | |||
3225 | } | |||
3226 | ||||
3227 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
3228 | ||||
3229 | // Check for broadcast loads, which are supported by the LD1R instruction. | |||
3230 | // In terms of code-size, the shuffle vector is free when a load + dup get | |||
3231 | // folded into a LD1R. That's what we check and return here. For performance | |||
3232 | // and reciprocal throughput, a LD1R is not completely free. In this case, we | |||
3233 | // return the cost for the broadcast below (i.e. 1 for most/all types), so | |||
3234 | // that we model the load + dup sequence slightly higher because LD1R is a | |||
3235 | // high latency instruction. | |||
3236 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { | |||
3237 | bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); | |||
3238 | if (IsLoad && LT.second.isVector() && | |||
3239 | isLegalBroadcastLoad(Tp->getElementType(), | |||
3240 | LT.second.getVectorElementCount())) | |||
3241 | return 0; | |||
3242 | } | |||
3243 | ||||
3244 | // If we have 4 elements for the shuffle and a Mask, get the cost straight | |||
3245 | // from the perfect shuffle tables. | |||
3246 | if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && | |||
3247 | (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && | |||
3248 | all_of(Mask, [](int E) { return E < 8; })) | |||
3249 | return getPerfectShuffleCost(Mask); | |||
3250 | ||||
3251 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || | |||
3252 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || | |||
3253 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { | |||
3254 | static const CostTblEntry ShuffleTbl[] = { | |||
3255 | // Broadcast shuffle kinds can be performed with 'dup'. | |||
3256 | {TTI::SK_Broadcast, MVT::v8i8, 1}, | |||
3257 | {TTI::SK_Broadcast, MVT::v16i8, 1}, | |||
3258 | {TTI::SK_Broadcast, MVT::v4i16, 1}, | |||
3259 | {TTI::SK_Broadcast, MVT::v8i16, 1}, | |||
3260 | {TTI::SK_Broadcast, MVT::v2i32, 1}, | |||
3261 | {TTI::SK_Broadcast, MVT::v4i32, 1}, | |||
3262 | {TTI::SK_Broadcast, MVT::v2i64, 1}, | |||
3263 | {TTI::SK_Broadcast, MVT::v4f16, 1}, | |||
3264 | {TTI::SK_Broadcast, MVT::v8f16, 1}, | |||
3265 | {TTI::SK_Broadcast, MVT::v2f32, 1}, | |||
3266 | {TTI::SK_Broadcast, MVT::v4f32, 1}, | |||
3267 | {TTI::SK_Broadcast, MVT::v2f64, 1}, | |||
3268 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and | |||
3269 | // 'zip1/zip2' instructions. | |||
3270 | {TTI::SK_Transpose, MVT::v8i8, 1}, | |||
3271 | {TTI::SK_Transpose, MVT::v16i8, 1}, | |||
3272 | {TTI::SK_Transpose, MVT::v4i16, 1}, | |||
3273 | {TTI::SK_Transpose, MVT::v8i16, 1}, | |||
3274 | {TTI::SK_Transpose, MVT::v2i32, 1}, | |||
3275 | {TTI::SK_Transpose, MVT::v4i32, 1}, | |||
3276 | {TTI::SK_Transpose, MVT::v2i64, 1}, | |||
3277 | {TTI::SK_Transpose, MVT::v4f16, 1}, | |||
3278 | {TTI::SK_Transpose, MVT::v8f16, 1}, | |||
3279 | {TTI::SK_Transpose, MVT::v2f32, 1}, | |||
3280 | {TTI::SK_Transpose, MVT::v4f32, 1}, | |||
3281 | {TTI::SK_Transpose, MVT::v2f64, 1}, | |||
3282 | // Select shuffle kinds. | |||
3283 | // TODO: handle vXi8/vXi16. | |||
3284 | {TTI::SK_Select, MVT::v2i32, 1}, // mov. | |||
3285 | {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). | |||
3286 | {TTI::SK_Select, MVT::v2i64, 1}, // mov. | |||
3287 | {TTI::SK_Select, MVT::v2f32, 1}, // mov. | |||
3288 | {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). | |||
3289 | {TTI::SK_Select, MVT::v2f64, 1}, // mov. | |||
3290 | // PermuteSingleSrc shuffle kinds. | |||
3291 | {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. | |||
3292 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. | |||
3293 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. | |||
3294 | {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. | |||
3295 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. | |||
3296 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. | |||
3297 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. | |||
3298 | {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. | |||
3299 | {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same | |||
3300 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl | |||
3301 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl | |||
3302 | {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl | |||
3303 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl | |||
3304 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl | |||
3305 | // Reverse can be lowered with `rev`. | |||
3306 | {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 | |||
3307 | {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT | |||
3308 | {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT | |||
3309 | {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 | |||
3310 | {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT | |||
3311 | {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT | |||
3312 | {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT | |||
3313 | {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT | |||
3314 | {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT | |||
3315 | {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 | |||
3316 | {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 | |||
3317 | {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 | |||
3318 | // Splice can all be lowered as `ext`. | |||
3319 | {TTI::SK_Splice, MVT::v2i32, 1}, | |||
3320 | {TTI::SK_Splice, MVT::v4i32, 1}, | |||
3321 | {TTI::SK_Splice, MVT::v2i64, 1}, | |||
3322 | {TTI::SK_Splice, MVT::v2f32, 1}, | |||
3323 | {TTI::SK_Splice, MVT::v4f32, 1}, | |||
3324 | {TTI::SK_Splice, MVT::v2f64, 1}, | |||
3325 | {TTI::SK_Splice, MVT::v8f16, 1}, | |||
3326 | {TTI::SK_Splice, MVT::v8bf16, 1}, | |||
3327 | {TTI::SK_Splice, MVT::v8i16, 1}, | |||
3328 | {TTI::SK_Splice, MVT::v16i8, 1}, | |||
3329 | {TTI::SK_Splice, MVT::v4bf16, 1}, | |||
3330 | {TTI::SK_Splice, MVT::v4f16, 1}, | |||
3331 | {TTI::SK_Splice, MVT::v4i16, 1}, | |||
3332 | {TTI::SK_Splice, MVT::v8i8, 1}, | |||
3333 | // Broadcast shuffle kinds for scalable vectors | |||
3334 | {TTI::SK_Broadcast, MVT::nxv16i8, 1}, | |||
3335 | {TTI::SK_Broadcast, MVT::nxv8i16, 1}, | |||
3336 | {TTI::SK_Broadcast, MVT::nxv4i32, 1}, | |||
3337 | {TTI::SK_Broadcast, MVT::nxv2i64, 1}, | |||
3338 | {TTI::SK_Broadcast, MVT::nxv2f16, 1}, | |||
3339 | {TTI::SK_Broadcast, MVT::nxv4f16, 1}, | |||
3340 | {TTI::SK_Broadcast, MVT::nxv8f16, 1}, | |||
3341 | {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, | |||
3342 | {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, | |||
3343 | {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, | |||
3344 | {TTI::SK_Broadcast, MVT::nxv2f32, 1}, | |||
3345 | {TTI::SK_Broadcast, MVT::nxv4f32, 1}, | |||
3346 | {TTI::SK_Broadcast, MVT::nxv2f64, 1}, | |||
3347 | {TTI::SK_Broadcast, MVT::nxv16i1, 1}, | |||
3348 | {TTI::SK_Broadcast, MVT::nxv8i1, 1}, | |||
3349 | {TTI::SK_Broadcast, MVT::nxv4i1, 1}, | |||
3350 | {TTI::SK_Broadcast, MVT::nxv2i1, 1}, | |||
3351 | // Handle the cases for vector.reverse with scalable vectors | |||
3352 | {TTI::SK_Reverse, MVT::nxv16i8, 1}, | |||
3353 | {TTI::SK_Reverse, MVT::nxv8i16, 1}, | |||
3354 | {TTI::SK_Reverse, MVT::nxv4i32, 1}, | |||
3355 | {TTI::SK_Reverse, MVT::nxv2i64, 1}, | |||
3356 | {TTI::SK_Reverse, MVT::nxv2f16, 1}, | |||
3357 | {TTI::SK_Reverse, MVT::nxv4f16, 1}, | |||
3358 | {TTI::SK_Reverse, MVT::nxv8f16, 1}, | |||
3359 | {TTI::SK_Reverse, MVT::nxv2bf16, 1}, | |||
3360 | {TTI::SK_Reverse, MVT::nxv4bf16, 1}, | |||
3361 | {TTI::SK_Reverse, MVT::nxv8bf16, 1}, | |||
3362 | {TTI::SK_Reverse, MVT::nxv2f32, 1}, | |||
3363 | {TTI::SK_Reverse, MVT::nxv4f32, 1}, | |||
3364 | {TTI::SK_Reverse, MVT::nxv2f64, 1}, | |||
3365 | {TTI::SK_Reverse, MVT::nxv16i1, 1}, | |||
3366 | {TTI::SK_Reverse, MVT::nxv8i1, 1}, | |||
3367 | {TTI::SK_Reverse, MVT::nxv4i1, 1}, | |||
3368 | {TTI::SK_Reverse, MVT::nxv2i1, 1}, | |||
3369 | }; | |||
3370 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) | |||
3371 | return LT.first * Entry->Cost; | |||
3372 | } | |||
3373 | ||||
3374 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) | |||
3375 | return getSpliceCost(Tp, Index); | |||
3376 | ||||
3377 | // Inserting a subvector can often be done with either a D, S or H register | |||
3378 | // move, so long as the inserted vector is "aligned". | |||
3379 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && | |||
3380 | LT.second.getSizeInBits() <= 128 && SubTp) { | |||
3381 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
3382 | if (SubLT.second.isVector()) { | |||
3383 | int NumElts = LT.second.getVectorNumElements(); | |||
3384 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
3385 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
3386 | return SubLT.first; | |||
3387 | } | |||
3388 | } | |||
3389 | ||||
3390 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); | |||
3391 | } | |||
3392 | ||||
3393 | static bool containsDecreasingPointers(Loop *TheLoop, | |||
3394 | PredicatedScalarEvolution *PSE) { | |||
3395 | const ValueToValueMap &Strides = ValueToValueMap(); | |||
3396 | for (BasicBlock *BB : TheLoop->blocks()) { | |||
3397 | // Scan the instructions in the block and look for addresses that are | |||
3398 | // consecutive and decreasing. | |||
3399 | for (Instruction &I : *BB) { | |||
3400 | if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { | |||
3401 | Value *Ptr = getLoadStorePointerOperand(&I); | |||
3402 | Type *AccessTy = getLoadStoreType(&I); | |||
3403 | if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, | |||
3404 | /*ShouldCheckWrap=*/false) | |||
3405 | .value_or(0) < 0) | |||
3406 | return true; | |||
3407 | } | |||
3408 | } | |||
3409 | } | |||
3410 | return false; | |||
3411 | } | |||
3412 | ||||
3413 | bool AArch64TTIImpl::preferPredicateOverEpilogue( | |||
3414 | Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, | |||
3415 | TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, | |||
3416 | InterleavedAccessInfo *IAI) { | |||
3417 | if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled) | |||
3418 | return false; | |||
3419 | ||||
3420 | // We don't currently support vectorisation with interleaving for SVE - with | |||
3421 | // such loops we're better off not using tail-folding. This gives us a chance | |||
3422 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. | |||
3423 | if (IAI->hasGroups()) | |||
3424 | return false; | |||
3425 | ||||
3426 | TailFoldingKind Required; // Defaults to 0. | |||
3427 | if (LVL->getReductionVars().size()) | |||
3428 | Required.add(TailFoldingKind::TFReductions); | |||
3429 | if (LVL->getFixedOrderRecurrences().size()) | |||
3430 | Required.add(TailFoldingKind::TFRecurrences); | |||
3431 | ||||
3432 | // We call this to discover whether any load/store pointers in the loop have | |||
3433 | // negative strides. This will require extra work to reverse the loop | |||
3434 | // predicate, which may be expensive. | |||
3435 | if (containsDecreasingPointers(L, LVL->getPredicatedScalarEvolution())) | |||
3436 | Required.add(TailFoldingKind::TFReverse); | |||
3437 | if (!Required) | |||
3438 | Required.add(TailFoldingKind::TFSimple); | |||
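// For example, a loop containing a reduction and a negative-stride access
// requires both TFReductions and TFReverse to be enabled before
// tail-folding is preferred over an epilogue.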
3439 | ||||
3440 | return (TailFoldingKindLoc & Required) == Required; | |||
3441 | } | |||
3442 | ||||
3443 | InstructionCost | |||
3444 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | |||
3445 | int64_t BaseOffset, bool HasBaseReg, | |||
3446 | int64_t Scale, unsigned AddrSpace) const { | |||
3447 | // Scaling factors are not free at all. | |||
3448 | // Operands | Rt Latency | |||
3449 | // ------------------------------------------- | |||
3450 | // Rt, [Xn, Xm] | 4 | |||
3451 | // ------------------------------------------- | |||
3452 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 | |||
3453 | // Rt, [Xn, Wm, <extend> #imm] | | |||
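// Below, a legal scaled addressing mode such as [Xn, Xm, lsl #3]
// (Scale == 8) is costed at 1, Scale values of 0 or 1 are treated as free,
// and an addressing mode that is not legal returns -1.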
3454 | TargetLoweringBase::AddrMode AM; | |||
3455 | AM.BaseGV = BaseGV; | |||
3456 | AM.BaseOffs = BaseOffset; | |||
3457 | AM.HasBaseReg = HasBaseReg; | |||
3458 | AM.Scale = Scale; | |||
3459 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) | |||
3460 | // Scale represents reg2 * scale, thus account for 1 if | |||
3461 | // it is not equal to 0 or 1. | |||
3462 | return AM.Scale != 0 && AM.Scale != 1; | |||
3463 | return -1; | |||
3464 | } |
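// Worked example (editorial note, not part of the original source): a legal
// scaled mode such as [Xn, Xm, lsl #3] (Scale == 8) returns a cost of 1,
// reflecting the extra latency in the table above; the unscaled forms with
// Scale 0 or 1 return 0; and a mode rejected by isLegalAddressingMode yields
// -1, which callers can treat as "not worth forming".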
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains some functions that are useful for math stuff. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H |
14 | #define LLVM_SUPPORT_MATHEXTRAS_H |
15 | |
16 | #include "llvm/ADT/bit.h" |
17 | #include "llvm/Support/Compiler.h" |
18 | #include <cassert> |
19 | #include <climits> |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | #include <limits> |
23 | #include <type_traits> |
24 | |
25 | namespace llvm { |
26 | |
27 | /// Mathematical constants. |
28 | namespace numbers { |
29 | // TODO: Track C++20 std::numbers. |
30 | // TODO: Favor using the hexadecimal FP constants (requires C++17). |
31 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 |
32 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 |
33 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 |
34 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 |
35 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) |
36 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) |
37 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 |
38 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 |
39 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 |
40 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 |
41 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
42 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) |
43 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 |
44 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) |
45 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 |
46 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 |
47 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 |
48 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 |
49 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 |
50 | log2ef = 1.44269504F, // (0x1.715476P+0) |
51 | log10ef = .434294482F, // (0x1.bcb7b2P-2) |
52 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 |
53 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 |
54 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 |
55 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 |
56 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 |
57 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) |
58 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 |
59 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) |
60 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 |
61 | } // namespace numbers |
62 | |
63 | /// Count number of 0's from the least significant bit to the most |
64 | /// stopping at the first 1. |
65 | /// |
66 | /// Only unsigned integral types are allowed. |
67 | /// |
68 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
69 | template <typename T> |
70 | LLVM_DEPRECATED("Use llvm::countr_zero instead.", "llvm::countr_zero")__attribute__((deprecated("Use llvm::countr_zero instead.", "llvm::countr_zero" ))) |
71 | unsigned countTrailingZeros(T Val) { |
72 | static_assert(std::is_unsigned_v<T>, |
73 | "Only unsigned integral types are allowed."); |
74 | return llvm::countr_zero(Val); |
75 | } |
76 | |
77 | /// Count number of 0's from the most significant bit to the least |
78 | /// stopping at the first 1. |
79 | /// |
80 | /// Only unsigned integral types are allowed. |
81 | /// |
82 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
83 | template <typename T> |
84 | LLVM_DEPRECATED("Use llvm::countl_zero instead.", "llvm::countl_zero")__attribute__((deprecated("Use llvm::countl_zero instead.", "llvm::countl_zero" ))) |
85 | unsigned countLeadingZeros(T Val) { |
86 | static_assert(std::is_unsigned_v<T>, |
87 | "Only unsigned integral types are allowed."); |
88 | return llvm::countl_zero(Val); |
89 | } |
90 | |
91 | /// Create a bitmask with the N right-most bits set to 1, and all other |
92 | /// bits set to 0. Only unsigned types are allowed. |
93 | template <typename T> T maskTrailingOnes(unsigned N) { |
94 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); |
95 | const unsigned Bits = CHAR_BIT * sizeof(T);
96 | assert(N <= Bits && "Invalid bit index");
97 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); |
98 | } |
99 | |
100 | /// Create a bitmask with the N left-most bits set to 1, and all other |
101 | /// bits set to 0. Only unsigned types are allowed. |
102 | template <typename T> T maskLeadingOnes(unsigned N) { |
103 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
104 | } |
105 | |
106 | /// Create a bitmask with the N right-most bits set to 0, and all other |
107 | /// bits set to 1. Only unsigned types are allowed. |
108 | template <typename T> T maskTrailingZeros(unsigned N) { |
109 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
110 | } |
111 | |
112 | /// Create a bitmask with the N left-most bits set to 0, and all other |
113 | /// bits set to 1. Only unsigned types are allowed. |
114 | template <typename T> T maskLeadingZeros(unsigned N) { |
115 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
116 | } |
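// Illustrative usage sketch (editorial addition; exampleMaskHelpers is a
// hypothetical name, not part of this header), spelling out the four mask
// builders on uint8_t values.
inline void exampleMaskHelpers() {
  assert(maskTrailingOnes<uint8_t>(3) == 0x07);  // 0b0000'0111
  assert(maskLeadingOnes<uint8_t>(3) == 0xE0);   // 0b1110'0000
  assert(maskTrailingZeros<uint8_t>(3) == 0xF8); // 0b1111'1000
  assert(maskLeadingZeros<uint8_t>(3) == 0x1F);  // 0b0001'1111
}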
117 | |
118 | /// Macro compressed bit reversal table for 256 bits. |
119 | /// |
120 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable |
121 | static const unsigned char BitReverseTable256[256] = { |
122 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 |
123 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) |
124 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) |
125 | R6(0), R6(2), R6(1), R6(3) |
126 | #undef R2 |
127 | #undef R4 |
128 | #undef R6 |
129 | }; |
130 | |
131 | /// Reverse the bits in \p Val. |
132 | template <typename T> T reverseBits(T Val) { |
133 | #if __has_builtin(__builtin_bitreverse8)
134 | if constexpr (std::is_same_v<T, uint8_t>) |
135 | return __builtin_bitreverse8(Val); |
136 | #endif |
137 | #if __has_builtin(__builtin_bitreverse16)
138 | if constexpr (std::is_same_v<T, uint16_t>) |
139 | return __builtin_bitreverse16(Val); |
140 | #endif |
141 | #if __has_builtin(__builtin_bitreverse32)
142 | if constexpr (std::is_same_v<T, uint32_t>) |
143 | return __builtin_bitreverse32(Val); |
144 | #endif |
145 | #if __has_builtin(__builtin_bitreverse64)
146 | if constexpr (std::is_same_v<T, uint64_t>) |
147 | return __builtin_bitreverse64(Val); |
148 | #endif |
149 | |
150 | unsigned char in[sizeof(Val)]; |
151 | unsigned char out[sizeof(Val)]; |
152 | std::memcpy(in, &Val, sizeof(Val)); |
153 | for (unsigned i = 0; i < sizeof(Val); ++i) |
154 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; |
155 | std::memcpy(&Val, out, sizeof(Val)); |
156 | return Val; |
157 | } |
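// Illustrative sketch (editorial addition; exampleReverseBits is a
// hypothetical name). Each value maps to its bit-reversed counterpart whether
// the builtin or the table-based fallback path is taken.
inline void exampleReverseBits() {
  assert(reverseBits<uint8_t>(0x01) == 0x80); // 0b0000'0001 -> 0b1000'0000
  assert(reverseBits<uint8_t>(0x0F) == 0xF0);
  assert(reverseBits<uint16_t>(0x0001) == 0x8000);
}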
158 | |
159 | // NOTE: The following support functions use the _32/_64 extensions instead of |
160 | // type overloading so that signed and unsigned integers can be used without |
161 | // ambiguity. |
162 | |
163 | /// Return the high 32 bits of a 64 bit value. |
164 | constexpr inline uint32_t Hi_32(uint64_t Value) { |
165 | return static_cast<uint32_t>(Value >> 32); |
166 | } |
167 | |
168 | /// Return the low 32 bits of a 64 bit value. |
169 | constexpr inline uint32_t Lo_32(uint64_t Value) { |
170 | return static_cast<uint32_t>(Value); |
171 | } |
172 | |
173 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. |
174 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { |
175 | return ((uint64_t)High << 32) | (uint64_t)Low; |
176 | } |
177 | |
178 | /// Checks if an integer fits into the given bit width. |
179 | template <unsigned N> constexpr inline bool isInt(int64_t x) { |
180 | if constexpr (N == 8) |
181 | return static_cast<int8_t>(x) == x; |
182 | if constexpr (N == 16) |
183 | return static_cast<int16_t>(x) == x; |
184 | if constexpr (N == 32) |
185 | return static_cast<int32_t>(x) == x; |
186 | if constexpr (N < 64) |
187 | return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1));
188 | (void)x; // MSVC v19.25 warns that x is unused. |
189 | return true; |
190 | } |
191 | |
192 | /// Checks if a signed integer is an N bit number shifted left by S. |
193 | template <unsigned N, unsigned S> |
194 | constexpr inline bool isShiftedInt(int64_t x) { |
195 | static_assert( |
196 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); |
197 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); |
198 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
199 | } |
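// Illustrative sketch (editorial addition; exampleIsInt is a hypothetical
// name). An 8-bit signed value spans [-128, 127]; the shifted variant
// additionally requires the low S bits to be zero.
inline void exampleIsInt() {
  assert(isInt<8>(127) && isInt<8>(-128) && !isInt<8>(128));
  assert(isShiftedInt<8, 2>(508));  // 508 == 127 << 2
  assert(!isShiftedInt<8, 2>(510)); // low two bits are not zero
}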
200 | |
201 | /// Checks if an unsigned integer fits into the given bit width. |
202 | template <unsigned N> constexpr inline bool isUInt(uint64_t x) { |
203 | static_assert(N > 0, "isUInt<0> doesn't make sense"); |
204 | if constexpr (N == 8) |
205 | return static_cast<uint8_t>(x) == x; |
206 | if constexpr (N == 16) |
207 | return static_cast<uint16_t>(x) == x; |
208 | if constexpr (N == 32) |
209 | return static_cast<uint32_t>(x) == x; |
210 | if constexpr (N < 64) |
211 | return x < (UINT64_C(1) << (N));
212 | (void)x; // MSVC v19.25 warns that x is unused. |
213 | return true; |
214 | } |
215 | |
216 | /// Checks if an unsigned integer is an N bit number shifted left by S.
217 | template <unsigned N, unsigned S> |
218 | constexpr inline bool isShiftedUInt(uint64_t x) { |
219 | static_assert( |
220 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); |
221 | static_assert(N + S <= 64, |
222 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); |
223 | // Per the two static_asserts above, S must be strictly less than 64. So |
224 | // 1 << S is not undefined behavior. |
225 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
226 | } |
227 | |
228 | /// Gets the maximum value for a N-bit unsigned integer. |
229 | inline uint64_t maxUIntN(uint64_t N) { |
230 | assert(N > 0 && N <= 64 && "integer width out of range");
231 | |
232 | // uint64_t(1) << 64 is undefined behavior, so we can't do |
233 | // (uint64_t(1) << N) - 1 |
234 | // without checking first that N != 64. But this works and doesn't have a |
235 | // branch. |
236 | return UINT64_MAX >> (64 - N);
237 | } |
238 | |
239 | /// Gets the minimum value for a N-bit signed integer. |
240 | inline int64_t minIntN(int64_t N) { |
241 | assert(N > 0 && N <= 64 && "integer width out of range");
242 | |
243 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
244 | } |
245 | |
246 | /// Gets the maximum value for a N-bit signed integer. |
247 | inline int64_t maxIntN(int64_t N) { |
248 | assert(N > 0 && N <= 64 && "integer width out of range");
249 | |
250 | // This relies on two's complement wraparound when N == 64, so we convert to |
251 | // int64_t only at the very end to avoid UB. |
252 | return (UINT64_C(1) << (N - 1)) - 1;
253 | } |
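// Illustrative sketch (editorial addition; exampleIntNLimits is a hypothetical
// name). The formulations above produce the usual N-bit limits while avoiding
// an undefined 64-bit-wide shift.
inline void exampleIntNLimits() {
  assert(maxUIntN(8) == 255 && maxUIntN(16) == 65535);
  assert(minIntN(8) == -128 && maxIntN(8) == 127);
  assert(minIntN(16) == -32768 && maxIntN(16) == 32767);
}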
254 | |
255 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. |
256 | inline bool isUIntN(unsigned N, uint64_t x) { |
257 | return N >= 64 || x <= maxUIntN(N); |
258 | } |
259 | |
261 | /// Checks if a signed integer fits into the given (dynamic) bit width.
261 | inline bool isIntN(unsigned N, int64_t x) { |
262 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); |
263 | } |
264 | |
265 | /// Return true if the argument is a non-empty sequence of ones starting at the |
266 | /// least significant bit with the remainder zero (32 bit version). |
267 | /// Ex. isMask_32(0x0000FFFFU) == true. |
268 | constexpr inline bool isMask_32(uint32_t Value) { |
269 | return Value && ((Value + 1) & Value) == 0; |
270 | } |
271 | |
272 | /// Return true if the argument is a non-empty sequence of ones starting at the |
273 | /// least significant bit with the remainder zero (64 bit version). |
274 | constexpr inline bool isMask_64(uint64_t Value) { |
275 | return Value && ((Value + 1) & Value) == 0; |
276 | } |
277 | |
278 | /// Return true if the argument contains a non-empty sequence of ones with the |
279 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
280 | constexpr inline bool isShiftedMask_32(uint32_t Value) { |
281 | return Value && isMask_32((Value - 1) | Value); |
282 | } |
283 | |
284 | /// Return true if the argument contains a non-empty sequence of ones with the |
285 | /// remainder zero (64 bit version.) |
286 | constexpr inline bool isShiftedMask_64(uint64_t Value) { |
287 | return Value && isMask_64((Value - 1) | Value); |
288 | } |
289 | |
290 | /// Return true if the argument is a power of two > 0. |
291 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) |
292 | constexpr inline bool isPowerOf2_32(uint32_t Value) { |
293 | return llvm::has_single_bit(Value); |
294 | } |
295 | |
296 | /// Return true if the argument is a power of two > 0 (64 bit edition.) |
297 | constexpr inline bool isPowerOf2_64(uint64_t Value) { |
298 | return llvm::has_single_bit(Value); |
299 | } |
300 | |
301 | /// Count the number of ones from the most significant bit to the first |
302 | /// zero bit. |
303 | /// |
304 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. |
305 | /// Only unsigned integral types are allowed. |
306 | /// |
307 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
308 | template <typename T> |
309 | LLVM_DEPRECATED("Use llvm::countl_one instead.", "llvm::countl_one")__attribute__((deprecated("Use llvm::countl_one instead.", "llvm::countl_one" ))) |
310 | unsigned countLeadingOnes(T Value) { |
311 | static_assert(std::is_unsigned_v<T>, |
312 | "Only unsigned integral types are allowed."); |
313 | return llvm::countl_one<T>(Value); |
314 | } |
315 | |
316 | /// Count the number of ones from the least significant bit to the first |
317 | /// zero bit. |
318 | /// |
319 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. |
320 | /// Only unsigned integral types are allowed. |
321 | /// |
322 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
323 | template <typename T> |
324 | LLVM_DEPRECATED("Use llvm::countr_one instead.", "llvm::countr_one")__attribute__((deprecated("Use llvm::countr_one instead.", "llvm::countr_one" ))) |
325 | unsigned countTrailingOnes(T Value) { |
326 | static_assert(std::is_unsigned_v<T>, |
327 | "Only unsigned integral types are allowed."); |
328 | return llvm::countr_one<T>(Value); |
329 | } |
330 | |
331 | /// Count the number of set bits in a value. |
332 | /// Ex. countPopulation(0xF000F000) = 8 |
333 | /// Returns 0 if the word is zero. |
334 | template <typename T> |
335 | LLVM_DEPRECATED("Use llvm::popcount instead.", "llvm::popcount")__attribute__((deprecated("Use llvm::popcount instead.", "llvm::popcount" ))) |
336 | inline unsigned countPopulation(T Value) { |
337 | static_assert(std::is_unsigned_v<T>, |
338 | "Only unsigned integral types are allowed."); |
339 | return (unsigned)llvm::popcount(Value); |
340 | } |
341 | |
342 | /// Return true if the argument contains a non-empty sequence of ones with the |
343 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
344 | /// If true, \p MaskIdx will specify the index of the lowest set bit and \p |
345 | /// MaskLen is updated to specify the length of the mask, else neither are |
346 | /// updated. |
347 | inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx, |
348 | unsigned &MaskLen) { |
349 | if (!isShiftedMask_32(Value)) |
350 | return false; |
351 | MaskIdx = llvm::countr_zero(Value); |
352 | MaskLen = llvm::popcount(Value); |
353 | return true; |
354 | } |
355 | |
356 | /// Return true if the argument contains a non-empty sequence of ones with the |
357 | /// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index |
358 | /// of the lowest set bit and \p MaskLen is updated to specify the length of the |
359 | /// mask, else neither are updated. |
360 | inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx, |
361 | unsigned &MaskLen) { |
362 | if (!isShiftedMask_64(Value)) |
363 | return false; |
364 | MaskIdx = llvm::countr_zero(Value); |
365 | MaskLen = llvm::popcount(Value); |
366 | return true; |
367 | } |
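// Illustrative sketch (editorial addition; exampleShiftedMask is a
// hypothetical name). For a contiguous run of ones the out-parameters report
// where the run starts and how long it is.
inline void exampleShiftedMask() {
  unsigned Idx = 0, Len = 0;
  assert(isShiftedMask_32(0x0000FF00, Idx, Len) && Idx == 8 && Len == 8);
  assert(!isShiftedMask_32(0x0000FF01, Idx, Len)); // two separate runs of ones
  (void)Idx;
  (void)Len; // silence unused warnings in NDEBUG builds
}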
368 | |
369 | /// Compile time Log2. |
370 | /// Valid only for positive powers of two. |
371 | template <size_t kValue> constexpr inline size_t CTLog2() { |
372 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), |
373 | "Value is not a valid power of 2"); |
374 | return 1 + CTLog2<kValue / 2>(); |
375 | } |
376 | |
377 | template <> constexpr inline size_t CTLog2<1>() { return 0; } |
378 | |
379 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
380 | /// (32 bit edition.) |
381 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 |
382 | inline unsigned Log2_32(uint32_t Value) { |
383 | return 31 - llvm::countl_zero(Value); |
384 | } |
385 | |
386 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
387 | /// (64 bit edition.) |
388 | inline unsigned Log2_64(uint64_t Value) { |
389 | return 63 - llvm::countl_zero(Value); |
390 | } |
391 | |
392 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. |
393 | /// (32 bit edition). |
394 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 |
395 | inline unsigned Log2_32_Ceil(uint32_t Value) { |
396 | return 32 - llvm::countl_zero(Value - 1); |
397 | } |
398 | |
399 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. |
400 | /// (64 bit edition.) |
401 | inline unsigned Log2_64_Ceil(uint64_t Value) { |
402 | return 64 - llvm::countl_zero(Value - 1); |
403 | } |
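// Worked example (editorial addition; exampleLog2 is a hypothetical name).
// Log2_32_Ceil relies on unsigned wraparound of Value - 1: an input of 0
// becomes 0xFFFFFFFF, which has no leading zeros, so the documented result of
// 32 falls out naturally.
inline void exampleLog2() {
  assert(Log2_32(32) == 5 && Log2_32(6) == 2);
  assert(Log2_32_Ceil(32) == 5 && Log2_32_Ceil(33) == 6);
  assert(Log2_32_Ceil(0) == 32 && Log2_32_Ceil(1) == 0);
}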
404 | |
405 | /// This function takes a 64-bit integer and returns the bit equivalent double. |
406 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<double>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<double>" ))) |
407 | inline double BitsToDouble(uint64_t Bits) { |
408 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
409 | return llvm::bit_cast<double>(Bits); |
410 | } |
411 | |
412 | /// This function takes a 32-bit integer and returns the bit equivalent float. |
413 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<float>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<float>" ))) |
414 | inline float BitsToFloat(uint32_t Bits) { |
415 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
416 | return llvm::bit_cast<float>(Bits); |
417 | } |
418 | |
419 | /// This function takes a double and returns the bit equivalent 64-bit integer. |
420 | /// Note that copying doubles around changes the bits of NaNs on some hosts, |
421 | /// notably x86, so this routine cannot be used if these bits are needed. |
422 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>" ))) |
423 | inline uint64_t DoubleToBits(double Double) { |
424 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
425 | return llvm::bit_cast<uint64_t>(Double); |
426 | } |
427 | |
428 | /// This function takes a float and returns the bit equivalent 32-bit integer. |
429 | /// Note that copying floats around changes the bits of NaNs on some hosts, |
430 | /// notably x86, so this routine cannot be used if these bits are needed. |
431 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>")__attribute__((deprecated("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>" ))) |
432 | inline uint32_t FloatToBits(float Float) { |
433 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
434 | return llvm::bit_cast<uint32_t>(Float); |
435 | } |
436 | |
437 | /// A and B are either alignments or offsets. Return the minimum alignment that |
438 | /// may be assumed after adding the two together. |
439 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { |
440 | // The largest power of 2 that divides both A and B. |
441 | // |
442 | // Replace "-Value" by "1+~Value" in the following commented code to avoid |
443 | // MSVC warning C4146 |
444 | // return (A | B) & -(A | B); |
445 | return (A | B) & (1 + ~(A | B)); |
446 | } |
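// Worked example (editorial addition; exampleMinAlign is a hypothetical name).
// (A | B) & -(A | B) isolates the lowest set bit of A | B, which is exactly
// the largest power of two dividing both inputs.
inline void exampleMinAlign() {
  assert(MinAlign(8, 12) == 4); // 8 | 12 == 0b1100, lowest set bit is 4
  assert(MinAlign(16, 24) == 8);
  assert(MinAlign(16, 16) == 16);
}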
447 | |
448 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. |
449 | /// Returns zero on overflow. |
450 | constexpr inline uint64_t NextPowerOf2(uint64_t A) { |
451 | A |= (A >> 1); |
452 | A |= (A >> 2); |
453 | A |= (A >> 4); |
454 | A |= (A >> 8); |
455 | A |= (A >> 16); |
456 | A |= (A >> 32); |
457 | return A + 1; |
458 | } |
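// Worked trace (editorial addition; exampleNextPowerOf2 is a hypothetical
// name). The shift cascade smears the highest set bit into every lower
// position (5 = 0b101 becomes 0b111), and adding one yields the next power of
// two. A power-of-two input still advances, since the result is strictly
// greater.
inline void exampleNextPowerOf2() {
  assert(NextPowerOf2(5) == 8);
  assert(NextPowerOf2(8) == 16);
  assert(NextPowerOf2(0) == 1);
}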
459 | |
460 | /// Returns the power of two which is less than or equal to the given value. |
461 | /// Essentially, it is a floor operation across the domain of powers of two. |
462 | LLVM_DEPRECATED("use llvm::bit_floor instead", "llvm::bit_floor")__attribute__((deprecated("use llvm::bit_floor instead", "llvm::bit_floor" ))) |
463 | inline uint64_t PowerOf2Floor(uint64_t A) { |
464 | return llvm::bit_floor(A); |
465 | } |
466 | |
467 | /// Returns the power of two which is greater than or equal to the given value. |
468 | /// Essentially, it is a ceil operation across the domain of powers of two. |
469 | inline uint64_t PowerOf2Ceil(uint64_t A) { |
470 | if (!A) |
471 | return 0; |
472 | return NextPowerOf2(A - 1); |
473 | } |
474 | |
475 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
476 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. |
477 | /// |
478 | /// Examples: |
479 | /// \code |
480 | /// alignTo(5, 8) = 8 |
481 | /// alignTo(17, 8) = 24 |
482 | /// alignTo(~0LL, 8) = 0 |
483 | /// alignTo(321, 255) = 510 |
484 | /// \endcode |
485 | inline uint64_t alignTo(uint64_t Value, uint64_t Align) { |
486 | assert(Align != 0u && "Align can't be 0.");
487 | return (Value + Align - 1) / Align * Align; |
488 | } |
489 | |
490 | inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) { |
491 | assert(Align != 0 && (Align & (Align - 1)) == 0 &&
492 | "Align must be a power of 2");
493 | return (Value + Align - 1) & -Align; |
494 | } |
495 | |
496 | /// If non-zero \p Skew is specified, the return value will be a minimal integer
497 | /// that is greater than or equal to \p Value and equal to \p Align * N + \p Skew
498 | /// for some integer N. If \p Skew is larger than \p Align, its value is adjusted
499 | /// to '\p Skew mod \p Align'. \p Align must be non-zero.
500 | /// |
501 | /// Examples: |
502 | /// \code |
503 | /// alignTo(5, 8, 7) = 7 |
504 | /// alignTo(17, 8, 1) = 17 |
505 | /// alignTo(~0LL, 8, 3) = 3 |
506 | /// alignTo(321, 255, 42) = 552 |
507 | /// \endcode |
508 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) { |
509 | assert(Align != 0u && "Align can't be 0.");
510 | Skew %= Align; |
511 | return alignTo(Value - Skew, Align) + Skew; |
512 | } |
513 | |
514 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
515 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. |
516 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { |
517 | static_assert(Align != 0u, "Align must be non-zero"); |
518 | return (Value + Align - 1) / Align * Align; |
519 | } |
520 | |
521 | /// Returns the integer ceil(Numerator / Denominator). |
522 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { |
523 | return alignTo(Numerator, Denominator) / Denominator; |
524 | } |
525 | |
526 | /// Returns the integer nearest(Numerator / Denominator). |
527 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { |
528 | return (Numerator + (Denominator / 2)) / Denominator; |
529 | } |
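// Illustrative sketch (editorial addition; exampleIntegerDivision is a
// hypothetical name) for the rounding division helpers above.
inline void exampleIntegerDivision() {
  assert(divideCeil(7, 3) == 3);    // ceil(7 / 3)
  assert(divideNearest(7, 3) == 2); // 7 / 3 = 2.33..., rounds down
  assert(divideNearest(8, 3) == 3); // 8 / 3 = 2.66..., rounds up
}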
530 | |
531 | /// Returns the largest uint64_t less than or equal to \p Value that is
532 | /// \p Skew mod \p Align. \p Align must be non-zero.
533 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
534 | assert(Align != 0u && "Align can't be 0.");
535 | Skew %= Align; |
536 | return (Value - Skew) / Align * Align + Skew; |
537 | } |
538 | |
539 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
540 | /// Requires 0 < B <= 32. |
541 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { |
542 | static_assert(B > 0, "Bit width can't be 0."); |
543 | static_assert(B <= 32, "Bit width out of range."); |
544 | return int32_t(X << (32 - B)) >> (32 - B); |
545 | } |
546 | |
547 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
548 | /// Requires 0 < B <= 32. |
549 | inline int32_t SignExtend32(uint32_t X, unsigned B) { |
550 | assert(B > 0 && "Bit width can't be 0.");
551 | assert(B <= 32 && "Bit width out of range.");
552 | return int32_t(X << (32 - B)) >> (32 - B); |
553 | } |
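// Worked trace (editorial addition; exampleSignExtend is a hypothetical name).
// Shifting the B-bit field to the top of the word and arithmetic-shifting it
// back replicates the sign bit: SignExtend32<4>(0xC) computes
// 0xC << 28 = 0xC0000000, then >> 28 gives 0xFFFFFFFC == -4.
inline void exampleSignExtend() {
  assert(SignExtend32<4>(0xC) == -4);
  assert(SignExtend32(0x7F, 8) == 127); // sign bit clear, value unchanged
  assert(SignExtend64(0xFF, 8) == -1);
}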
554 | |
555 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
556 | /// Requires 0 < B <= 64. |
557 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { |
558 | static_assert(B > 0, "Bit width can't be 0."); |
559 | static_assert(B <= 64, "Bit width out of range."); |
560 | return int64_t(x << (64 - B)) >> (64 - B); |
561 | } |
562 | |
563 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
564 | /// Requires 0 < B <= 64. |
565 | inline int64_t SignExtend64(uint64_t X, unsigned B) { |
566 | assert(B > 0 && "Bit width can't be 0.");
567 | assert(B <= 64 && "Bit width out of range.");
568 | return int64_t(X << (64 - B)) >> (64 - B); |
569 | } |
570 | |
571 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute |
572 | /// value of the result. |
573 | template <typename T> |
574 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { |
575 | return X > Y ? (X - Y) : (Y - X); |
576 | } |
577 | |
578 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the |
579 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
580 | /// the result is larger than the maximum representable value of type T. |
581 | template <typename T> |
582 | std::enable_if_t<std::is_unsigned<T>::value, T> |
583 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { |
584 | bool Dummy; |
585 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
586 | // Hacker's Delight, p. 29 |
587 | T Z = X + Y; |
588 | Overflowed = (Z < X || Z < Y); |
589 | if (Overflowed) |
590 | return std::numeric_limits<T>::max(); |
591 | else |
592 | return Z; |
593 | } |
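// Illustrative sketch (editorial addition; exampleSaturatingAdd is a
// hypothetical name). An unsigned addition that wraps produces a result
// strictly smaller than both operands, which is what the Z < X || Z < Y test
// detects.
inline void exampleSaturatingAdd() {
  bool Overflowed = false;
  assert(SaturatingAdd<uint8_t>(100, 100, &Overflowed) == 200 && !Overflowed);
  assert(SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed);
  (void)Overflowed; // silence unused warning in NDEBUG builds
}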
594 | |
595 | /// Add multiple unsigned integers of type T. Clamp the result to the |
596 | /// maximum representable value of T on overflow. |
597 | template <class T, class... Ts> |
598 | std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(T X, T Y, T Z, |
599 | Ts... Args) { |
600 | bool Overflowed = false; |
601 | T XY = SaturatingAdd(X, Y, &Overflowed); |
602 | if (Overflowed) |
603 | return SaturatingAdd(std::numeric_limits<T>::max(), T(1), Args...); |
604 | return SaturatingAdd(XY, Z, Args...); |
605 | } |
606 | |
607 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the |
608 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
609 | /// the result is larger than the maximum representable value of type T. |
610 | template <typename T> |
611 | std::enable_if_t<std::is_unsigned<T>::value, T> |
612 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { |
613 | bool Dummy; |
614 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
615 | |
616 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that |
617 | // because it fails for uint16_t (where multiplication can have undefined |
618 | // behavior due to promotion to int), and requires a division in addition |
619 | // to the multiplication. |
620 | |
621 | Overflowed = false; |
622 | |
623 | // Log2(Z) would be either Log2Z or Log2Z + 1. |
624 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z |
625 | // will necessarily be less than Log2Max as desired. |
626 | int Log2Z = Log2_64(X) + Log2_64(Y); |
627 | const T Max = std::numeric_limits<T>::max(); |
628 | int Log2Max = Log2_64(Max); |
629 | if (Log2Z < Log2Max) { |
630 | return X * Y; |
631 | } |
632 | if (Log2Z > Log2Max) { |
633 | Overflowed = true; |
634 | return Max; |
635 | } |
636 | |
637 | // We're going to use the top bit, and maybe overflow one |
638 | // bit past it. Multiply all but the bottom bit then add |
639 | // that on at the end. |
640 | T Z = (X >> 1) * Y; |
641 | if (Z & ~(Max >> 1)) { |
642 | Overflowed = true; |
643 | return Max; |
644 | } |
645 | Z <<= 1; |
646 | if (X & 1) |
647 | return SaturatingAdd(Z, Y, ResultOverflowed); |
648 | |
649 | return Z; |
650 | } |
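// Worked trace (editorial addition; exampleSaturatingMultiply is a
// hypothetical name). For uint8_t, 20 * 20 has Log2Z = 4 + 4 = 8 > Log2Max = 7
// and saturates immediately, while 17 * 15 takes the borderline
// Log2Z == Log2Max path: (17 >> 1) * 15 = 120, doubled to 240, plus one more
// 15 for the odd bit, giving exactly 255 with no overflow.
inline void exampleSaturatingMultiply() {
  bool Overflowed = false;
  assert(SaturatingMultiply<uint8_t>(20, 20, &Overflowed) == 255 && Overflowed);
  assert(SaturatingMultiply<uint8_t>(17, 15, &Overflowed) == 255 && !Overflowed);
  (void)Overflowed; // silence unused warning in NDEBUG builds
}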
651 | |
652 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
653 | /// the product. Clamp the result to the maximum representable value of T on |
654 | /// overflow. ResultOverflowed indicates if the result is larger than the |
655 | /// maximum representable value of type T. |
656 | template <typename T> |
657 | std::enable_if_t<std::is_unsigned<T>::value, T> |
658 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { |
659 | bool Dummy; |
660 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
661 | |
662 | T Product = SaturatingMultiply(X, Y, &Overflowed); |
663 | if (Overflowed) |
664 | return Product; |
665 | |
666 | return SaturatingAdd(A, Product, &Overflowed); |
667 | } |
668 | |
669 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. |
670 | extern const float huge_valf; |
671 | |
672 | |
673 | /// Add two signed integers, computing the two's complement truncated result, |
674 | /// returning true if overflow occurred. |
675 | template <typename T> |
676 | std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) { |
677 | #if __has_builtin(__builtin_add_overflow)
678 | return __builtin_add_overflow(X, Y, &Result); |
679 | #else |
680 | // Perform the unsigned addition. |
681 | using U = std::make_unsigned_t<T>; |
682 | const U UX = static_cast<U>(X); |
683 | const U UY = static_cast<U>(Y); |
684 | const U UResult = UX + UY; |
685 | |
686 | // Convert to signed. |
687 | Result = static_cast<T>(UResult); |
688 | |
689 | // Adding two positive numbers should result in a positive number. |
690 | if (X > 0 && Y > 0) |
691 | return Result <= 0; |
692 | // Adding two negatives should result in a negative number. |
693 | if (X < 0 && Y < 0) |
694 | return Result >= 0; |
695 | return false; |
696 | #endif |
697 | } |
698 | |
699 | /// Subtract two signed integers, computing the two's complement truncated |
700 | /// result, returning true if an overflow occurred.
701 | template <typename T> |
702 | std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) { |
703 | #if __has_builtin(__builtin_sub_overflow)
704 | return __builtin_sub_overflow(X, Y, &Result); |
705 | #else |
706 | // Perform the unsigned addition. |
707 | using U = std::make_unsigned_t<T>; |
708 | const U UX = static_cast<U>(X); |
709 | const U UY = static_cast<U>(Y); |
710 | const U UResult = UX - UY; |
711 | |
712 | // Convert to signed. |
713 | Result = static_cast<T>(UResult); |
714 | |
715 | // Subtracting a positive number from a negative results in a negative number. |
716 | if (X <= 0 && Y > 0) |
717 | return Result >= 0; |
718 | // Subtracting a negative number from a positive results in a positive number. |
719 | if (X >= 0 && Y < 0) |
720 | return Result <= 0; |
721 | return false; |
722 | #endif |
723 | } |
724 | |
725 | /// Multiply two signed integers, computing the two's complement truncated |
726 | /// result, returning true if an overflow occurred.
727 | template <typename T> |
728 | std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) { |
729 | // Perform the unsigned multiplication on absolute values. |
730 | using U = std::make_unsigned_t<T>; |
731 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); |
732 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); |
733 | const U UResult = UX * UY; |
734 | |
735 | // Convert to signed. |
736 | const bool IsNegative = (X < 0) ^ (Y < 0); |
737 | Result = IsNegative ? (0 - UResult) : UResult; |
738 | |
739 | // If any of the args was 0, result is 0 and no overflow occurs. |
740 | if (UX == 0 || UY == 0) |
741 | return false; |
742 | |
743 | // UX and UY are in [1, 2^n], where n is the number of digits. |
744 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for |
745 | // positive) divided by an argument compares to the other. |
746 | if (IsNegative) |
747 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; |
748 | else |
749 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; |
750 | } |
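// Illustrative sketch (editorial addition; exampleMulOverflow is a
// hypothetical name). The final division-based test asks whether |X| exceeds
// the largest magnitude that can be multiplied by |Y| while staying in range.
inline void exampleMulOverflow() {
  int8_t Result = 0;
  assert(!MulOverflow<int8_t>(10, 12, Result) && Result == 120);
  assert(MulOverflow<int8_t>(16, 8, Result)); // 128 does not fit in int8_t
  (void)Result; // silence unused warning in NDEBUG builds
}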
751 | |
752 | } // End llvm namespace |
753 | |
754 | #endif |