Line data Source code
1 : //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 :
10 : #include "AArch64TargetTransformInfo.h"
11 : #include "MCTargetDesc/AArch64AddressingModes.h"
12 : #include "llvm/Analysis/LoopInfo.h"
13 : #include "llvm/Analysis/TargetTransformInfo.h"
14 : #include "llvm/CodeGen/BasicTTIImpl.h"
15 : #include "llvm/CodeGen/CostTable.h"
16 : #include "llvm/CodeGen/TargetLowering.h"
17 : #include "llvm/IR/IntrinsicInst.h"
18 : #include "llvm/Support/Debug.h"
19 : #include <algorithm>
20 : using namespace llvm;
21 :
22 : #define DEBUG_TYPE "aarch64tti"
23 :
24 : static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
25 : cl::init(true), cl::Hidden);
26 :
27 83 : bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
28 : const Function *Callee) const {
29 83 : const TargetMachine &TM = getTLI()->getTargetMachine();
30 :
31 : const FeatureBitset &CallerBits =
32 83 : TM.getSubtargetImpl(*Caller)->getFeatureBits();
33 : const FeatureBitset &CalleeBits =
34 83 : TM.getSubtargetImpl(*Callee)->getFeatureBits();
35 :
36 : // Inline a callee if its target-features are a subset of the caller's
37 : // target-features.
38 83 : return (CallerBits & CalleeBits) == CalleeBits;
39 : }
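    : // For example, if the caller is compiled with "+neon,+crc" and the
    : // callee only requires "+neon", the callee's feature bits are a subset
    : // of the caller's and inlining is allowed; a callee that needs a
    : // feature the caller lacks (e.g. "+sve") fails the subset check above.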
40 :
41 : /// Calculate the cost of materializing a 64-bit value. This helper
42 : /// method might only calculate a fraction of a larger immediate. Therefore it
43 : /// is valid to return a cost of ZERO.
44 4624 : int AArch64TTIImpl::getIntImmCost(int64_t Val) {
45 : // Check if the immediate can be encoded within an instruction.
46 4624 : if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
47 3432 : return 0;
48 :
49 1192 : if (Val < 0)
50 550 : Val = ~Val;
51 :
52 : // Calculate how many moves we will need to materialize this constant.
53 1192 : unsigned LZ = countLeadingZeros((uint64_t)Val);
54 1192 : return (64 - LZ + 15) / 16;
55 : }
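    : // For example, Val = 0x12345678 is not encodable as a logical
    : // immediate; countLeadingZeros gives LZ = 35, so the estimate is
    : // (64 - 35 + 15) / 16 = 2 moves (a MOVZ plus one MOVK). The formula
    : // counts 16-bit chunks up to the most significant set bit, so all-zero
    : // chunks in the middle of a value are still charged.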
56 :
57 : /// Calculate the cost of materializing the given constant.
58 4541 : int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
59 : assert(Ty->isIntegerTy());
60 :
61 4541 : unsigned BitSize = Ty->getPrimitiveSizeInBits();
62 4541 : if (BitSize == 0)
63 : return ~0U;
64 :
65 : // Sign-extend all constants to a multiple of 64 bits.
66 : APInt ImmVal = Imm;
67 4541 : if (BitSize & 0x3f)
68 6448 : ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
69 :
70 : // Split the constant into 64-bit chunks and calculate the cost for each
71 : // chunk.
72 4541 : int Cost = 0;
73 9165 : for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
74 9248 : APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
75 : int64_t Val = Tmp.getSExtValue();
76 4624 : Cost += getIntImmCost(Val);
77 : }
78 : // We need at least one instruction to materialize the constant.
79 4685 : return std::max(1, Cost);
80 : }
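    : // For example, an i128 constant with 0x12345678 in its low 64 bits and
    : // zero in its high 64 bits is split into two chunks costing 2 and 0
    : // respectively, for a total of 2; a constant such as 1, free in every
    : // chunk, is still reported as costing at least one instruction.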
81 :
82 9395 : int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
83 : const APInt &Imm, Type *Ty) {
84 : assert(Ty->isIntegerTy());
85 :
86 9395 : unsigned BitSize = Ty->getPrimitiveSizeInBits();
87 : // There is no cost model for constants with a bit size of 0. Return TCC_Free
88 : // here, so that constant hoisting will ignore this constant.
89 9395 : if (BitSize == 0)
90 : return TTI::TCC_Free;
91 :
92 : unsigned ImmIdx = ~0U;
93 9395 : switch (Opcode) {
94 : default:
95 : return TTI::TCC_Free;
96 2293 : case Instruction::GetElementPtr:
97 : // Always hoist the base address of a GetElementPtr.
98 2293 : if (Idx == 0)
99 6 : return 2 * TTI::TCC_Basic;
100 : return TTI::TCC_Free;
101 405 : case Instruction::Store:
102 : ImmIdx = 0;
103 405 : break;
104 2550 : case Instruction::Add:
105 : case Instruction::Sub:
106 : case Instruction::Mul:
107 : case Instruction::UDiv:
108 : case Instruction::SDiv:
109 : case Instruction::URem:
110 : case Instruction::SRem:
111 : case Instruction::And:
112 : case Instruction::Or:
113 : case Instruction::Xor:
114 : case Instruction::ICmp:
115 : ImmIdx = 1;
116 2550 : break;
117 : // Always return TCC_Free for the shift value of a shift instruction.
118 871 : case Instruction::Shl:
119 : case Instruction::LShr:
120 : case Instruction::AShr:
121 871 : if (Idx == 1)
122 : return TTI::TCC_Free;
123 : break;
124 : case Instruction::Trunc:
125 : case Instruction::ZExt:
126 : case Instruction::SExt:
127 : case Instruction::IntToPtr:
128 : case Instruction::PtrToInt:
129 : case Instruction::BitCast:
130 : case Instruction::PHI:
131 : case Instruction::Call:
132 : case Instruction::Select:
133 : case Instruction::Ret:
134 : case Instruction::Load:
135 : break;
136 : }
137 :
138 4521 : if (Idx == ImmIdx) {
139 2741 : int NumConstants = (BitSize + 63) / 64;
140 2741 : int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
141 : return (Cost <= NumConstants * TTI::TCC_Basic)
142 2741 : ? static_cast<int>(TTI::TCC_Free)
143 2741 : : Cost;
144 : }
145 1780 : return AArch64TTIImpl::getIntImmCost(Imm, Ty);
146 : }
147 :
148 1028 : int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
149 : const APInt &Imm, Type *Ty) {
150 : assert(Ty->isIntegerTy());
151 :
152 1028 : unsigned BitSize = Ty->getPrimitiveSizeInBits();
153 : // There is no cost model for constants with a bit size of 0. Return TCC_Free
154 : // here, so that constant hoisting will ignore this constant.
155 1028 : if (BitSize == 0)
156 : return TTI::TCC_Free;
157 :
158 1028 : switch (IID) {
159 : default:
160 : return TTI::TCC_Free;
161 20 : case Intrinsic::sadd_with_overflow:
162 : case Intrinsic::uadd_with_overflow:
163 : case Intrinsic::ssub_with_overflow:
164 : case Intrinsic::usub_with_overflow:
165 : case Intrinsic::smul_with_overflow:
166 : case Intrinsic::umul_with_overflow:
167 20 : if (Idx == 1) {
168 20 : int NumConstants = (BitSize + 63) / 64;
169 20 : int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
170 : return (Cost <= NumConstants * TTI::TCC_Basic)
171 20 : ? static_cast<int>(TTI::TCC_Free)
172 20 : : Cost;
173 0 : }
174 : break;
175 36 : case Intrinsic::experimental_stackmap:
176 36 : if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
177 : return TTI::TCC_Free;
178 : break;
179 209 : case Intrinsic::experimental_patchpoint_void:
180 : case Intrinsic::experimental_patchpoint_i64:
181 209 : if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
182 : return TTI::TCC_Free;
183 : break;
184 : }
185 0 : return AArch64TTIImpl::getIntImmCost(Imm, Ty);
186 : }
187 :
188 : TargetTransformInfo::PopcntSupportKind
189 3 : AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
190 : assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
191 3 : if (TyWidth == 32 || TyWidth == 64)
192 3 : return TTI::PSK_FastHardware;
193 : // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
194 : return TTI::PSK_Software;
195 : }
196 :
197 1035 : bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
198 : ArrayRef<const Value *> Args) {
199 :
200 : // A helper that returns a vector type from the given type. The number of
201 : // elements in type Ty determines the vector width.
202 : auto toVectorTy = [&](Type *ArgTy) {
203 161 : return VectorType::get(ArgTy->getScalarType(),
204 : DstTy->getVectorNumElements());
205 : };
206 :
207 : // Exit early if DstTy is not a vector type whose elements are at least
208 : // 16 bits wide.
209 2070 : if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
210 : return false;
211 :
212 : // Determine if the operation has a widening variant. We consider both the
213 : // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
214 : // instructions.
215 : //
216 : // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
217 : // verify that their extending operands are eliminated during code
218 : // generation.
219 490 : switch (Opcode) {
220 : case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
221 : case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
222 : break;
223 : default:
224 : return false;
225 : }
226 :
227 : // To be a widening instruction (either the "wide" or "long" versions), the
228 : // second operand must be a sign- or zero-extend having a single user. We
229 : // only consider extends having a single user because they may otherwise not
230 : // be eliminated.
231 325 : if (Args.size() != 2 ||
232 488 : (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
233 : !Args[1]->hasOneUse())
234 : return false;
235 : auto *Extend = cast<CastInst>(Args[1]);
236 :
237 : // Legalize the destination type and ensure it can be used in a widening
238 : // operation.
239 163 : auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
240 163 : unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
241 326 : if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
242 : return false;
243 :
244 : // Legalize the source type and ensure it can be used in a widening
245 : // operation.
246 : Type *SrcTy = toVectorTy(Extend->getSrcTy());
247 161 : auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
248 161 : unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
249 322 : if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
250 : return false;
251 :
252 : // Get the total number of vector elements in the legalized types.
253 154 : unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
254 154 : unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
255 :
256 : // Return true if the legalized types have the same number of vector elements
257 : // and the destination element type size is twice that of the source type.
258 154 : return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
259 : }
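    : // For example, with DstTy = <8 x i16> the following pattern is treated
    : // as widening, since the extend is expected to fold into an
    : // saddw/saddl-style instruction:
    : //
    : //   %e = sext <8 x i8> %b to <8 x i16>   ; single use
    : //   %r = add  <8 x i16> %a, %e           ; second operand is the extend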
260 :
261 293 : int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
262 : const Instruction *I) {
263 293 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
264 : assert(ISD && "Invalid opcode");
265 :
266 : // If the cast is observable, and it is used by a widening instruction (e.g.,
267 : // uaddl, saddw, etc.), it may be free.
268 584 : if (I && I->hasOneUse()) {
269 : auto *SingleUser = cast<Instruction>(*I->user_begin());
270 263 : SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
271 263 : if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
272 : // If the cast is the second operand, it is free. We will generate either
273 : // a "wide" or "long" version of the widening instruction.
274 84 : if (I == SingleUser->getOperand(1))
275 : return 0;
276 : // If the cast is not the second operand, it will be free if it looks the
277 : // same as the second operand. In this case, we will generate a "long"
278 : // version of the widening instruction.
279 : if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
280 59 : if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
281 : cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
282 : return 0;
283 : }
284 : }
285 :
286 211 : EVT SrcTy = TLI->getValueType(DL, Src);
287 211 : EVT DstTy = TLI->getValueType(DL, Dst);
288 :
289 211 : if (!SrcTy.isSimple() || !DstTy.isSimple())
290 3 : return BaseT::getCastInstrCost(Opcode, Dst, Src);
291 :
292 : static const TypeConversionCostTblEntry
293 : ConversionTbl[] = {
294 : { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
295 : { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
296 : { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
297 : { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
298 :
299 : // The number of shll instructions for the extension.
300 : { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
301 : { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
302 : { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
303 : { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
304 : { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
305 : { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
306 : { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
307 : { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
308 : { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
309 : { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
310 : { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
311 : { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
312 : { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
313 : { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
314 : { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
315 : { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
316 :
317 : // LowerVectorINT_TO_FP:
318 : { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
319 : { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
320 : { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
321 : { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
322 : { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
323 : { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
324 :
325 : // Complex: to v2f32
326 : { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
327 : { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
328 : { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
329 : { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
330 : { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
331 : { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
332 :
333 : // Complex: to v4f32
334 : { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
335 : { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
336 : { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
337 : { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
338 :
339 : // Complex: to v8f32
340 : { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
341 : { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
342 : { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
343 : { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
344 :
345 : // Complex: to v16f32
346 : { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
347 : { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
348 :
349 : // Complex: to v2f64
350 : { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
351 : { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
352 : { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
353 : { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
354 : { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
355 : { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
356 :
357 :
358 : // LowerVectorFP_TO_INT
359 : { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
360 : { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
361 : { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
362 : { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
363 : { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
364 : { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
365 :
366 : // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
367 : { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
368 : { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
369 : { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
370 : { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
371 : { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
372 : { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
373 :
374 : // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
375 : { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
376 : { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
377 : { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
378 : { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
379 :
380 : // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
381 : { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
382 : { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
383 : { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
384 : { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
385 : { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
386 : { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
387 : };
388 :
389 18 : if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
390 : DstTy.getSimpleVT(),
391 : SrcTy.getSimpleVT()))
392 18 : return Entry->Cost;
393 :
394 190 : return BaseT::getCastInstrCost(Opcode, Dst, Src);
395 : }
396 :
397 24 : int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
398 : VectorType *VecTy,
399 : unsigned Index) {
400 :
401 : // Make sure we were given a valid extend opcode.
402 : assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
403 : "Invalid opcode");
404 :
405 : // We are extending an element we extract from a vector, so the source type
406 : // of the extend is the element type of the vector.
407 24 : auto *Src = VecTy->getElementType();
408 :
409 : // Sign- and zero-extends are for integer types only.
410 : assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
411 :
412 : // Get the cost for the extract. We compute the cost (if any) for the extend
413 : // below.
414 24 : auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
415 :
416 : // Legalize the types.
417 24 : auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
418 24 : auto DstVT = TLI->getValueType(DL, Dst);
419 24 : auto SrcVT = TLI->getValueType(DL, Src);
420 :
421 : // If the resulting type is still a vector and the destination type is legal,
422 : // we may get the extension for free. If not, get the default cost for the
423 : // extend.
424 48 : if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
425 0 : return Cost + getCastInstrCost(Opcode, Dst, Src);
426 :
427 : // The destination type should be larger than the element type. If not, get
428 : // the default cost for the extend.
429 24 : if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
430 0 : return Cost + getCastInstrCost(Opcode, Dst, Src);
431 :
432 24 : switch (Opcode) {
433 0 : default:
434 0 : llvm_unreachable("Opcode should be either SExt or ZExt");
435 :
436 : // For sign-extends, we only need a smov, which performs the extension
437 : // automatically.
438 : case Instruction::SExt:
439 : return Cost;
440 :
441 : // For zero-extends, the extend is performed automatically by a umov unless
442 : // the destination type is i64 and the element type is i8 or i16.
443 0 : case Instruction::ZExt:
444 0 : if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
445 : return Cost;
446 : }
447 :
448 : // If we are unable to perform the extend for free, get the default cost.
449 0 : return Cost + getCastInstrCost(Opcode, Dst, Src);
450 : }
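    : // For example, with a legal vector type:
    : //   sext i16 (extractelement <8 x i16>) to i32 -> extract cost only,
    : //     since smov sign-extends as part of the move.
    : //   zext i8 (extractelement <16 x i8>) to i32  -> extract cost only,
    : //     since umov zero-extends into the 32-bit register.
    : //   zext i8 (extractelement <16 x i8>) to i64  -> extract cost plus the
    : //     default cost of the i8 -> i64 zero-extend.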
451 :
452 1531 : int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
453 : unsigned Index) {
454 : assert(Val->isVectorTy() && "This must be a vector type");
455 :
456 1531 : if (Index != -1U) {
457 : // Legalize the type.
458 1531 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
459 :
460 : // This type is legalized to a scalar type.
461 1531 : if (!LT.second.isVector())
462 500 : return 0;
463 :
464 : // The type may be split. Normalize the index to the new type.
465 : unsigned Width = LT.second.getVectorNumElements();
466 1519 : Index = Index % Width;
467 :
468 : // The element at index zero is already inside the vector.
469 1519 : if (Index == 0)
470 : return 0;
471 : }
472 :
473 : // All other insert/extracts cost this much.
474 1031 : return ST->getVectorInsertExtractBaseCost();
475 : }
476 :
477 772 : int AArch64TTIImpl::getArithmeticInstrCost(
478 : unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
479 : TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
480 : TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
481 : // Legalize the type.
482 772 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
483 :
484 : // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
485 : // add in the widening overhead specified by the sub-target. Since the
486 : // extends feeding widening instructions are performed automatically, they
487 : // aren't present in the generated code and have a zero cost. By adding a
488 : // widening overhead here, we attach the total cost of the combined operation
489 : // to the widening instruction.
490 : int Cost = 0;
491 772 : if (isWideningInstruction(Ty, Opcode, Args))
492 56 : Cost += ST->getWideningBaseCost();
493 :
494 772 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
495 :
496 772 : switch (ISD) {
497 242 : default:
498 968 : return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
499 242 : Opd1PropInfo, Opd2PropInfo);
500 16 : case ISD::SDIV:
501 32 : if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
502 : // On AArch64, scalar signed division by a power-of-two constant is
503 : // normally expanded to the sequence ADD + CMP + SELECT + SRA.
504 : // The OperandValue properties may not be the same as those of the
505 : // previous operation; conservatively assume OP_None.
506 : // operation; conservatively assume OP_None.
507 10 : Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
508 : TargetTransformInfo::OP_None,
509 5 : TargetTransformInfo::OP_None);
510 10 : Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
511 : TargetTransformInfo::OP_None,
512 5 : TargetTransformInfo::OP_None);
513 10 : Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
514 : TargetTransformInfo::OP_None,
515 5 : TargetTransformInfo::OP_None);
516 10 : Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
517 : TargetTransformInfo::OP_None,
518 5 : TargetTransformInfo::OP_None);
519 5 : return Cost;
520 : }
521 : LLVM_FALLTHROUGH;
522 : case ISD::UDIV:
523 16 : if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
524 6 : auto VT = TLI->getValueType(DL, Ty);
525 6 : if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
526 : // Vector signed division by a constant is expanded to the
527 : // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
528 : // to MULHS + SUB + SRL + ADD + SRL.
529 12 : int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
530 : Opd2Info,
531 : TargetTransformInfo::OP_None,
532 6 : TargetTransformInfo::OP_None);
533 12 : int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
534 : Opd2Info,
535 : TargetTransformInfo::OP_None,
536 6 : TargetTransformInfo::OP_None);
537 12 : int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
538 : Opd2Info,
539 : TargetTransformInfo::OP_None,
540 6 : TargetTransformInfo::OP_None);
541 6 : return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
542 : }
543 : }
544 :
545 10 : Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
546 10 : Opd1PropInfo, Opd2PropInfo);
547 10 : if (Ty->isVectorTy()) {
548 : // On AArch64, vector divisions are not supported natively and are
549 : // expanded into scalar divisions of each pair of elements.
550 8 : Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
551 4 : Opd2Info, Opd1PropInfo, Opd2PropInfo);
552 8 : Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
553 4 : Opd2Info, Opd1PropInfo, Opd2PropInfo);
554 : // TODO: if one of the arguments is scalar, then it's not necessary to
555 : // double the cost of handling the vector elements.
556 4 : Cost += Cost;
557 : }
558 : return Cost;
559 :
560 509 : case ISD::ADD:
561 : case ISD::MUL:
562 : case ISD::XOR:
563 : case ISD::OR:
564 : case ISD::AND:
565 : // These nodes are marked as 'custom' for combining purposes only.
566 : // We know that they are legal. See LowerAdd in ISelLowering.
567 509 : return (Cost + 1) * LT.first;
568 : }
569 : }
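    : // For example, a scalar i32 sdiv by a uniform power-of-two constant is
    : // costed as the sum of the Add, Sub, Select and AShr costs computed
    : // above (typically 4), while a legal, non-widening v4i32 add falls into
    : // the ADD/MUL/XOR/OR/AND case and costs (0 + 1) * LT.first = 1.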
570 :
571 77 : int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
572 : const SCEV *Ptr) {
573 : // Address computations in vectorized code with non-consecutive addresses will
574 : // likely result in more instructions compared to scalar code where the
575 : // computation can more often be merged into the index mode. The resulting
576 : // extra micro-ops can significantly decrease throughput.
577 : unsigned NumVectorInstToHideOverhead = 10;
578 : int MaxMergeDistance = 64;
579 :
580 97 : if (Ty->isVectorTy() && SE &&
581 20 : !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
582 11 : return NumVectorInstToHideOverhead;
583 :
584 : // In many cases the address computation is not merged into the instruction
585 : // addressing mode.
586 : return 1;
587 : }
588 :
589 241 : int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
590 : Type *CondTy, const Instruction *I) {
591 :
592 241 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
593 : // Some vector selects that are wider than the register width are not
594 : // lowered well.
595 241 : if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
596 : // We would need this many instructions to hide the scalarization happening.
597 : const int AmortizationCost = 20;
598 : static const TypeConversionCostTblEntry
599 : VectorSelectTbl[] = {
600 : { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
601 : { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
602 : { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
603 : { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
604 : { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
605 : { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
606 : };
607 :
608 39 : EVT SelCondTy = TLI->getValueType(DL, CondTy);
609 39 : EVT SelValTy = TLI->getValueType(DL, ValTy);
610 39 : if (SelCondTy.isSimple() && SelValTy.isSimple()) {
611 12 : if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
612 : SelCondTy.getSimpleVT(),
613 : SelValTy.getSimpleVT()))
614 12 : return Entry->Cost;
615 : }
616 : }
617 229 : return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
618 : }
619 :
620 420 : int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
621 : unsigned Alignment, unsigned AddressSpace,
622 : const Instruction *I) {
623 420 : auto LT = TLI->getTypeLegalizationCost(DL, Ty);
624 :
625 18 : if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
626 436 : LT.second.is128BitVector() && Alignment < 16) {
627 : // Unaligned stores are extremely inefficient. We don't split all
628 : // unaligned 128-bit stores because of the negative impact that has been
629 : // observed in practice on inlined block copy code.
630 : // We make such stores expensive so that we will only vectorize if there
631 : // are 6 other instructions getting vectorized.
632 : const int AmortizationCost = 6;
633 :
634 14 : return LT.first * 2 * AmortizationCost;
635 : }
636 :
637 406 : if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
638 : unsigned ProfitableNumElements;
639 102 : if (Opcode == Instruction::Store)
640 : // We use a custom trunc store lowering so v.4b should be profitable.
641 : ProfitableNumElements = 4;
642 : else
643 : // We scalarize the loads because there is no v.4b register and we
644 : // have to promote the elements to v.2.
645 : ProfitableNumElements = 8;
646 :
647 102 : if (Ty->getVectorNumElements() < ProfitableNumElements) {
648 : unsigned NumVecElts = Ty->getVectorNumElements();
649 : unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
650 : // We generate 2 instructions per vector element.
651 43 : return NumVectorizableInstsToAmortize * NumVecElts * 2;
652 : }
653 : }
654 :
655 363 : return LT.first;
656 : }
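    : // For example, on a subtarget where misaligned 128-bit stores are slow,
    : // storing a v4i32 with 4-byte alignment costs LT.first * 2 * 6 = 12,
    : // and loading a v4i8 (fewer than the 8 profitable elements) costs
    : // (4 * 2) * 4 * 2 = 64 to reflect its scalarization.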
657 :
658 10 : int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
659 : unsigned Factor,
660 : ArrayRef<unsigned> Indices,
661 : unsigned Alignment,
662 : unsigned AddressSpace,
663 : bool IsMasked) {
664 : assert(Factor >= 2 && "Invalid interleave factor");
665 : assert(isa<VectorType>(VecTy) && "Expect a vector type");
666 :
667 10 : if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
668 : unsigned NumElts = VecTy->getVectorNumElements();
669 20 : auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
670 :
671 : // ldN/stN only support legal vector types of size 64 or 128 in bits.
672 : // Accesses having vector types that are a multiple of 128 bits can be
673 : // matched to more than one ldN/stN instruction.
674 20 : if (NumElts % Factor == 0 &&
675 10 : TLI->isLegalInterleavedAccessType(SubVecTy, DL))
676 10 : return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
677 : }
678 :
679 0 : return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
680 0 : Alignment, AddressSpace, IsMasked);
681 : }
682 :
683 1 : int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
684 : int Cost = 0;
685 2 : for (auto *I : Tys) {
686 1 : if (!I->isVectorTy())
687 : continue;
688 2 : if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
689 2 : Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
690 1 : getMemoryOpCost(Instruction::Load, I, 128, 0);
691 : }
692 1 : return Cost;
693 : }
694 :
695 34 : unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
696 68 : return ST->getMaxInterleaveFactor();
697 : }
698 :
699 : // For Falkor, we want to avoid having too many strided loads in a loop since
700 : // that can exhaust the HW prefetcher resources. We adjust the unroller
701 : // MaxCount preference below to attempt to ensure unrolling doesn't create too
702 : // many strided loads.
703 : static void
704 0 : getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
705 : TargetTransformInfo::UnrollingPreferences &UP) {
706 : enum { MaxStridedLoads = 7 };
707 : auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
708 : int StridedLoads = 0;
709 : // FIXME? We could make this more precise by looking at the CFG and
710 : // e.g. not counting loads in each side of an if-then-else diamond.
711 : for (const auto BB : L->blocks()) {
712 : for (auto &I : *BB) {
713 : LoadInst *LMemI = dyn_cast<LoadInst>(&I);
714 : if (!LMemI)
715 : continue;
716 :
717 : Value *PtrValue = LMemI->getPointerOperand();
718 : if (L->isLoopInvariant(PtrValue))
719 : continue;
720 :
721 : const SCEV *LSCEV = SE.getSCEV(PtrValue);
722 : const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
723 : if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
724 : continue;
725 :
726 : // FIXME? We could take pairing of unrolled load copies into account
727 : // by looking at the AddRec, but we would probably have to limit this
728 : // to loops with no stores or other memory optimization barriers.
729 : ++StridedLoads;
730 : // We've seen enough strided loads that seeing more won't make a
731 : // difference.
732 : if (StridedLoads > MaxStridedLoads / 2)
733 : return StridedLoads;
734 : }
735 : }
736 : return StridedLoads;
737 : };
738 :
739 0 : int StridedLoads = countStridedLoads(L, SE);
740 : LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
741 : << " strided loads\n");
742 : // Pick the largest power of 2 unroll count that won't result in too many
743 : // strided loads.
744 0 : if (StridedLoads) {
745 0 : UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
746 : LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
747 : << UP.MaxCount << '\n');
748 : }
749 0 : }
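    : // For example, with MaxStridedLoads = 7: a loop with one strided load
    : // gets UP.MaxCount = 1 << Log2_32(7) = 4, a loop with three gets
    : // 1 << Log2_32(7 / 3) = 2, and a loop with four or more is limited to
    : // an unroll count of 1.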
750 :
751 22 : void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
752 : TTI::UnrollingPreferences &UP) {
753 : // Enable partial unrolling and runtime unrolling.
754 22 : BaseT::getUnrollingPreferences(L, SE, UP);
755 :
756 : // An inner loop is more likely to be hot, and its runtime check can be
757 : // hoisted out by the LICM pass, so the overhead is lower; try a larger
758 : // threshold to unroll more loops.
759 44 : if (L->getLoopDepth() > 1)
760 3 : UP.PartialThreshold *= 2;
761 :
762 : // Disable partial & runtime unrolling on -Os.
763 22 : UP.PartialOptSizeThreshold = 0;
764 :
765 22 : if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
766 : EnableFalkorHWPFUnrollFix)
767 3 : getFalkorUnrollingPreferences(L, SE, UP);
768 22 : }
769 :
770 20 : Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
771 : Type *ExpectedType) {
772 : switch (Inst->getIntrinsicID()) {
773 : default:
774 : return nullptr;
775 : case Intrinsic::aarch64_neon_st2:
776 : case Intrinsic::aarch64_neon_st3:
777 : case Intrinsic::aarch64_neon_st4: {
778 : // Create a struct type
779 : StructType *ST = dyn_cast<StructType>(ExpectedType);
780 : if (!ST)
781 : return nullptr;
782 8 : unsigned NumElts = Inst->getNumArgOperands() - 1;
783 8 : if (ST->getNumElements() != NumElts)
784 : return nullptr;
785 24 : for (unsigned i = 0, e = NumElts; i != e; ++i) {
786 32 : if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
787 : return nullptr;
788 : }
789 8 : Value *Res = UndefValue::get(ExpectedType);
790 8 : IRBuilder<> Builder(Inst);
791 24 : for (unsigned i = 0, e = NumElts; i != e; ++i) {
792 : Value *L = Inst->getArgOperand(i);
793 16 : Res = Builder.CreateInsertValue(Res, L, i);
794 : }
795 : return Res;
796 : }
797 4 : case Intrinsic::aarch64_neon_ld2:
798 : case Intrinsic::aarch64_neon_ld3:
799 : case Intrinsic::aarch64_neon_ld4:
800 4 : if (Inst->getType() == ExpectedType)
801 4 : return Inst;
802 : return nullptr;
803 : }
804 : }
805 :
806 120 : bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
807 : MemIntrinsicInfo &Info) {
808 : switch (Inst->getIntrinsicID()) {
809 : default:
810 : break;
811 30 : case Intrinsic::aarch64_neon_ld2:
812 : case Intrinsic::aarch64_neon_ld3:
813 : case Intrinsic::aarch64_neon_ld4:
814 30 : Info.ReadMem = true;
815 30 : Info.WriteMem = false;
816 30 : Info.PtrVal = Inst->getArgOperand(0);
817 30 : break;
818 40 : case Intrinsic::aarch64_neon_st2:
819 : case Intrinsic::aarch64_neon_st3:
820 : case Intrinsic::aarch64_neon_st4:
821 40 : Info.ReadMem = false;
822 40 : Info.WriteMem = true;
823 40 : Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
824 40 : break;
825 : }
826 :
827 : switch (Inst->getIntrinsicID()) {
828 : default:
829 : return false;
830 52 : case Intrinsic::aarch64_neon_ld2:
831 : case Intrinsic::aarch64_neon_st2:
832 52 : Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
833 52 : break;
834 16 : case Intrinsic::aarch64_neon_ld3:
835 : case Intrinsic::aarch64_neon_st3:
836 16 : Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
837 16 : break;
838 2 : case Intrinsic::aarch64_neon_ld4:
839 : case Intrinsic::aarch64_neon_st4:
840 2 : Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
841 2 : break;
842 : }
843 : return true;
844 : }
845 :
846 : /// See if \p I should be considered for address type promotion. We check if
847 : /// \p I is a sext with the right type that is used in memory accesses. If it
848 : /// is used in a "complex" getelementptr, we allow it to be promoted without
849 : /// finding other sext instructions that sign-extended the same initial value.
850 : /// A getelementptr is considered "complex" if it has more than 2 operands.
851 2326 : bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
852 : const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
853 : bool Considerable = false;
854 2326 : AllowPromotionWithoutCommonHeader = false;
855 2326 : if (!isa<SExtInst>(&I))
856 : return false;
857 : Type *ConsideredSExtType =
858 1190 : Type::getInt64Ty(I.getParent()->getParent()->getContext());
859 1190 : if (I.getType() != ConsideredSExtType)
860 : return false;
861 : // See if the sext is the one with the right type and used in at least one
862 : // GetElementPtrInst.
863 1014 : for (const User *U : I.users()) {
864 : if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
865 : Considerable = true;
866 : // A getelementptr is considered "complex" if it has more than 2
867 : // operands. We will promote a SExt used in such a complex GEP, as we
868 : // expect some of the computation to be merged if it is done on 64 bits.
869 164 : if (GEPInst->getNumOperands() > 2) {
870 18 : AllowPromotionWithoutCommonHeader = true;
871 18 : break;
872 : }
873 : }
874 : }
875 : return Considerable;
876 : }
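    : // For example, the sext feeding the multi-operand GEP below is used in
    : // a "complex" getelementptr (more than 2 operands), so it may be
    : // promoted without a common header:
    : //
    : //   %idx = sext i32 %i to i64
    : //   %p = getelementptr [64 x i32], [64 x i32]* %base, i64 %idx, i64 %j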
877 :
878 0 : unsigned AArch64TTIImpl::getCacheLineSize() {
879 0 : return ST->getCacheLineSize();
880 : }
881 :
882 14214 : unsigned AArch64TTIImpl::getPrefetchDistance() {
883 28428 : return ST->getPrefetchDistance();
884 : }
885 :
886 12 : unsigned AArch64TTIImpl::getMinPrefetchStride() {
887 24 : return ST->getMinPrefetchStride();
888 : }
889 :
890 51 : unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
891 51 : return ST->getMaxPrefetchIterationsAhead();
892 : }
893 :
894 25 : bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
895 : TTI::ReductionFlags Flags) const {
896 : assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
897 25 : unsigned ScalarBits = Ty->getScalarSizeInBits();
898 25 : switch (Opcode) {
899 : case Instruction::FAdd:
900 : case Instruction::FMul:
901 : case Instruction::And:
902 : case Instruction::Or:
903 : case Instruction::Xor:
904 : case Instruction::Mul:
905 : return false;
906 : case Instruction::Add:
907 23 : return ScalarBits * Ty->getVectorNumElements() >= 128;
908 2 : case Instruction::ICmp:
909 2 : return (ScalarBits < 64) &&
910 2 : (ScalarBits * Ty->getVectorNumElements() >= 128);
911 0 : case Instruction::FCmp:
912 0 : return Flags.NoNaN;
913 0 : default:
914 0 : llvm_unreachable("Unhandled reduction opcode");
915 : }
916 : return false;
917 : }
918 :
919 33 : int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
920 : bool IsPairwiseForm) {
921 :
922 33 : if (IsPairwiseForm)
923 14 : return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
924 :
925 19 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
926 19 : MVT MTy = LT.second;
927 19 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
928 : assert(ISD && "Invalid opcode");
929 :
930 : // Horizontal adds can use the 'addv' instruction. We model the cost of these
931 : // instructions as normal vector adds. This is the only arithmetic vector
932 : // reduction operation for which we have an instruction.
933 : static const CostTblEntry CostTblNoPairwise[]{
934 : {ISD::ADD, MVT::v8i8, 1},
935 : {ISD::ADD, MVT::v16i8, 1},
936 : {ISD::ADD, MVT::v4i16, 1},
937 : {ISD::ADD, MVT::v8i16, 1},
938 : {ISD::ADD, MVT::v4i32, 1},
939 : };
940 :
941 19 : if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
942 19 : return LT.first * Entry->Cost;
943 :
944 0 : return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
945 : }
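    : // For example, a non-pairwise add reduction of v4i32 matches the table
    : // above and costs LT.first * 1 = 1 (a single addv); types not listed
    : // fall back to the target-independent reduction cost.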
946 :
947 166 : int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
948 : Type *SubTp) {
949 332 : if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
950 166 : Kind == TTI::SK_PermuteSingleSrc) {
951 : static const CostTblEntry ShuffleTbl[] = {
952 : // Transpose shuffle kinds can be performed with 'trn1/trn2' and
953 : // 'zip1/zip2' instructions.
954 : { TTI::SK_Transpose, MVT::v8i8, 1 },
955 : { TTI::SK_Transpose, MVT::v16i8, 1 },
956 : { TTI::SK_Transpose, MVT::v4i16, 1 },
957 : { TTI::SK_Transpose, MVT::v8i16, 1 },
958 : { TTI::SK_Transpose, MVT::v2i32, 1 },
959 : { TTI::SK_Transpose, MVT::v4i32, 1 },
960 : { TTI::SK_Transpose, MVT::v2i64, 1 },
961 : { TTI::SK_Transpose, MVT::v2f32, 1 },
962 : { TTI::SK_Transpose, MVT::v4f32, 1 },
963 : { TTI::SK_Transpose, MVT::v2f64, 1 },
964 : // Select shuffle kinds.
965 : // TODO: handle vXi8/vXi16.
966 : { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
967 : { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
968 : { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
969 : { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
970 : { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
971 : { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
972 : // PermuteSingleSrc shuffle kinds.
973 : // TODO: handle vXi8/vXi16.
974 : { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
975 : { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
976 : { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
977 : { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
978 : { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
979 : { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
980 : };
981 75 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
982 75 : if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
983 71 : return LT.first * Entry->Cost;
984 : }
985 :
986 190 : return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
987 : }