//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst case cost, although we may discard an outlying worst
/// cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of target-dependent instruction costs (latency):
///                     divss     sqrtss     rsqrtss
///   AMD K7            11-16     19         3
///   Piledriver        9-24      13-15      5
///   Jaguar            14        16         2
///   Pentium II,III    18        30         2
///   Nehalem           7-14      7-18       3
///   Haswell           10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
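///
/// As an illustrative reading (not an entry tied to any specific CPU): a cost
/// tuple of { 1, 7, 2, 3 } in the tables below means a reciprocal throughput
/// of 1, a latency of 7 cycles, a "code size" of 2 instructions, and a
/// size-and-latency cost of 3 micro-ops.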
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

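// Illustrative use of the helper above (a sketch, not code from this file):
//   CostKindCosts C = {/*RThru*/ 1, /*Lat*/ 7, /*CodeSize*/ 2, /*SizeLat*/ 3};
//   std::optional<unsigned> LatCost = C[TargetTransformInfo::TCK_Latency];
//   // *LatCost == 7; a slot left at its ~0U default yields std::nullopt.
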
TTI::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KiB
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KiB
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  return Vector                          ? VectorClass
         : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
                                         : GPRClass;
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  if (ClassID == VectorClass && !ST->hasSSE1())
    return 0;

  if (!ST->is64Bit())
    return 8;

  if ((ClassID == GPRClass && ST->hasEGPR()) ||
      (ClassID != GPRClass && ST->hasAVX512()))
    return 32;

  return 16;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
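  // E.g. a v8i8 mul below is costed as zext(v8i8 -> v8i16) + v8i16 mul +
  // trunc(v8i16 -> v8i8).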
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
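  // LT.first is the number of legal ops the type splits into (e.g. v8i32
  // legalizes to 2 x v4i32 on SSE2, so LT.first == 2) and LT.second is the
  // legalized MVT, which is what the cost tables below are keyed on.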

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
324
325 // If both vXi64 are representable as (unsigned) i32, then we can perform
326 // the multiple with a single PMULUDQ instruction.
327 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
328 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
330 }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
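  // E.g. X * 8 becomes X << 3, and X * -8 becomes -(X << 3), which is what
  // the shift + subtract costs below model.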
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
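  // E.g. for X sdiv 16 on v4i32: T = X >>s 31; T = T >>u 28; X = X + T;
  // Res = X >>s 4 (one SRA, one SRL, one ADD, and a final SRA).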
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is equivalent to (X - (X / C) * C).
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
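  // E.g. X udiv 16 becomes X >>u 4, and X urem 16 becomes X & 15.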
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
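  // (This lookup pattern repeats below: find the entry for (ISD, legal MVT),
  // index its cost tuple by CostKind, and scale by the LT.first split count;
  // a missing tuple slot falls through to the next, more generic table.)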

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL,  MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL,  MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,  { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA,  MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL,  MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL,  MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA,  MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL,  MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA,  MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL,  MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL,  MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA,  MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL,  MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA,  MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL,  MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA,  MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,  { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,  { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,  { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL,  MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL,  MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA,  MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL,  MVT::v4i32,  { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL,  MVT::v4i32,  { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32,  { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL,  MVT::v8i32,  { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL,  MVT::v8i32,  { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA,  MVT::v8i32,  { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL,  MVT::v2i64,  { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64,  { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL,  MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL,  MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA,  MVT::v4i64,  { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32,  { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32,  { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL,  MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA,  MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL,  MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL,  MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA,  MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL,  MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL,  MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA,  MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 12, 8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4, 8, 7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 6, 6, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 7, 8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  { 7, 9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  { 4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  { 4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  { 6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  { 6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  { 7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64,  { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64,    { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,   { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,   { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we mark them
    // Custom so we can detect the cases where the shift amount is a scalar
    // splat.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
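    // (XOP's VPSHA/VPSHL instructions take signed, per-element shift amounts;
    // a negated amount shifts right, so a constant right-shift amount can be
    // negated at compile time for free.)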
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
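    // E.g. X << <i32 1, i32 2, i32 3, i32 4> becomes
    //      X * <i32 2, i32 4, i32 8, i32 16>.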
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies(3),
    // shifts(3) and adds(2).
    // SLM muldq throughput is 2 and addq throughput is 4, thus:
    // 3x2 (muldq throughput) + 3x1 (shift throughput) +
    // 2x4 (addq throughput) = 17.
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD,  MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16,  { 5, 18, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8,  { 6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8,  { 8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16,  { 5, 11, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16,  { 5, 11, 5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64,  { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64,  { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8,  { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8,  { 4, 8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32,  { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32,  { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64,  { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,   { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,   { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,   { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,   { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,   { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,   { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,   { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,   { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8,  { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8,  { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32,  { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32,  { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 12, 15, 19, 20 } },

    { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 2 } }, // vandps

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 2 } }, // vorps
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8,  { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8,  { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32,  { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32,  { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64,  { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64,  { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64,  { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8,  { 10, 21, 11, 17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 22, 22, 27, 40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16,  { 6, 9, 11, 11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16, 24, 25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32,  { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32,  { 9, 11, 12, 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64,  { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64,  { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8,  { 11, 27, 12, 18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8,  { 23, 23, 30, 43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16,  { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32,  { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32,  { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64,  { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64,  { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8,  { 21, 22, 24, 36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 44, 45, 51, 76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16,  { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32,  { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32,  { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64,  { 5, 6, 10, 14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64,  { 12, 12, 22, 30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32,   { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,   { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32,   { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1300
1301 static const CostKindTblEntry SSE42CostTable[] = {
1302 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316
1317 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1319 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1321
1322 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1323 };
1324
1325 if (ST->hasSSE42())
1326 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1327 if (auto KindCost = Entry->Cost[CostKind])
1328 return LT.first * *KindCost;
1329
1330 static const CostKindTblEntry SSE41CostTable[] = {
1331 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1332 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1333 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1334
1335 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1336 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1337 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1338 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1339
1340 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1341 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1342 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1343 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1344
1345 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1346 };
1347
1348 if (ST->hasSSE41())
1349 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSSE3CostTable[] = {
1354 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1355 };
1356
1357 if (ST->hasSSSE3())
1358 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1359 if (auto KindCost = Entry->Cost[CostKind])
1360 return LT.first * *KindCost;
1361
1362 static const CostKindTblEntry SSE2CostTable[] = {
1363 // We don't correctly identify costs of casts because they are marked as
1364 // custom.
1365 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1366 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1367 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1368 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1369
1370 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1371 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1372 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1373 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1374
1375 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1376 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1377 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1378 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1379
1380 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1382 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1384
1385 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1387 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1389
1390 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1392 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1394
1395 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1396 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1397
1398 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1399 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1400 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1401 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1402
1403 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1404
1405 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418
1419 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1421 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422
1423 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1425 };
1426
1427 if (ST->hasSSE2())
1428 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1429 if (auto KindCost = Entry->Cost[CostKind])
1430 return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
1490 // in the process we will often end up having to spill regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyway, so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
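// e.g. for sdiv on <8 x i32> (legal on AVX2, so LT.first == 1 and there
// are 8 lanes) this yields 20 * 1 * 8 * ScalarCost(i32 sdiv).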
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1517 }
1518 }
1519
1520 // Fall back to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1525InstructionCost
1526X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1528 TTI::TargetCostKind CostKind) const {
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1531 return InstructionCost::getInvalid();
1532}
1533
1534InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1537 TTI::TargetCostKind CostKind,
1538 int Index, VectorType *SubTp,
1539 ArrayRef<const Value *> Args,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553 // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
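// e.g. shuffling two v4i32 sources with mask <0,1,2,3,4,5,6,7> simply
// concatenates them, so it is costed as inserting the second source into
// the upper half of the double-width result.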
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1562 return getShuffleCost(TTI::SK_InsertSubvector,
1563 VectorType::getDoubleElementsVectorType(SrcTy),
1564 VectorType::getDoubleElementsVectorType(SrcTy), Mask,
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 Kind = TTI::SK_PermuteTwoSrc;
1570
1571 if (Kind == TTI::SK_Broadcast) {
1572 // For Broadcasts we are splatting the first element from the first input
1573 // register, so we only need to reference that input and all the output
1574 // registers are the same.
1575 LT.first = 1;
1576
1577 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1578 using namespace PatternMatch;
1579 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1580 (ST->hasAVX2() ||
1581 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1582 return TTI::TCC_Free;
1583 }
1584
1585 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1586 // permutation.
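// e.g. for v8i32 (two 128-bit lanes) the mask <1,0,3,2,5,4,7,6> stays
// in-lane, while <4,5,6,7,0,1,2,3> crosses lanes.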
1587 // Attempt to detect a shuffle mask with a single defined element.
1588 bool IsInLaneShuffle = false;
1589 bool IsSingleElementMask = false;
1590 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1591 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1592 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1593 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1594 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1595 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1596 if ((Mask.size() % NumLanes) == 0) {
1597 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1598 return P.value() == PoisonMaskElem ||
1599 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1600 (P.index() / NumEltsPerLane);
1601 });
1602 IsSingleElementMask =
1603 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1604 return M == PoisonMaskElem;
1605 }));
1606 }
1607 }
1608
1609 // Treat <X x bfloat> shuffles as <X x half>.
1610 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1611 LT.second = LT.second.changeVectorElementType(MVT::f16);
1612
1613 // Subvector extractions are free if they start at the beginning of a
1614 // vector and cheap if the subvectors are aligned.
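// e.g. extracting the low v2f64 half of a v4f64 is free, while extracting
// the aligned upper half is a single vextractf128 (SubLT.first).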
1615 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1616 int NumElts = LT.second.getVectorNumElements();
1617 if ((Index % NumElts) == 0)
1618 return TTI::TCC_Free;
1619 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1620 if (SubLT.second.isVector()) {
1621 int NumSubElts = SubLT.second.getVectorNumElements();
1622 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1623 return SubLT.first;
1624 // Handle some cases for widening legalization. For now we only handle
1625 // cases where the original subvector was naturally aligned and evenly
1626 // fit in its legalized subvector type.
1627 // FIXME: Remove some of the alignment restrictions.
1628 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1629 // vectors.
1630 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1631 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1632 (NumSubElts % OrigSubElts) == 0 &&
1633 LT.second.getVectorElementType() ==
1634 SubLT.second.getVectorElementType() &&
1635 LT.second.getVectorElementType().getSizeInBits() ==
1636 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1637 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1638 "Unexpected number of elements!");
1639 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1640 LT.second.getVectorNumElements());
1641 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1642 SubLT.second.getVectorNumElements());
1643 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
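// e.g. alignDown(3, 2) == 2, so an unaligned v2 extract at index 3 is
// costed as the aligned extract at index 2 plus a shuffle.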
1644 InstructionCost ExtractCost =
1645 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
1646 ExtractIndex, SubTy);
1647
1648 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1649 // if we have SSSE3 we can use pshufb.
1650 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1651 return ExtractCost + 1; // pshufd or pshufb
1652
1653 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1654 "Unexpected vector size");
1655
1656 return ExtractCost + 2; // worst case pshufhw + pshufd
1657 }
1658 }
1659 // If the extract subvector is not optimal, treat it as single op shuffle.
1660 Kind = TTI::SK_PermuteSingleSrc;
1661 }
1662
1663 // Subvector insertions are cheap if the subvectors are aligned.
1664 // Note that in general, the insertion starting at the beginning of a vector
1665 // isn't free, because we need to preserve the rest of the wide vector,
1666 // but if the destination vector legalizes to the same width as the subvector
1667 // then the insertion will simplify to a (free) register copy.
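// e.g. inserting a v2f64 into an aligned half of a v4f64 is a single
// vinsertf128, and when the destination legalizes to the same width as
// the subvector the insertion becomes a free register copy.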
1668 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1669 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1670 int NumElts = DstLT.second.getVectorNumElements();
1671 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1672 if (SubLT.second.isVector()) {
1673 int NumSubElts = SubLT.second.getVectorNumElements();
1674 bool MatchingTypes =
1675 NumElts == NumSubElts &&
1676 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1677 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1678 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1679 }
1680
1681 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1682 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1683 // v1f32 (legalised to f32) into a v4f32.
1684 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1685 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1686 return 1;
1687
1688 // If the insertion is the lowest subvector then it will be blended
1689 // otherwise treat it like a 2-op shuffle.
1690 Kind =
1691 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1692 }
1693
1694 // Handle some common (illegal) sub-vector types as they are often very cheap
1695 // to shuffle even on targets without PSHUFB.
1696 EVT VT = TLI->getValueType(DL, SrcTy);
1697 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1698 !ST->hasSSSE3()) {
1699 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1700 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1701 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1702 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1703 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1704 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1705
1706 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1707 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1708 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1709 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1710
1711 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1712 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1713 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1714 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1715
1716 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1717 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1719 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1720 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1721
1722 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1723 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1725 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1726 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1727 };
1728
1729 if (ST->hasSSE2())
1730 if (const auto *Entry =
1731 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1732 if (auto KindCost = Entry->Cost[CostKind])
1733 return LT.first * *KindCost;
1734 }
1735
1736 // We are going to permute multiple sources and the result will be in multiple
1737 // destinations. We provide an accurate cost only for splits where the element
1738 // type remains the same.
1739 if (LT.first != 1) {
1740 MVT LegalVT = LT.second;
1741 if (LegalVT.isVector() &&
1742 LegalVT.getVectorElementType().getSizeInBits() ==
1743 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1744 LegalVT.getVectorNumElements() <
1745 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1746 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1747 unsigned LegalVTSize = LegalVT.getStoreSize();
1748 // Number of source vectors after legalization:
1749 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1750 // Number of destination vectors after legalization:
1751 InstructionCost NumOfDests = LT.first;
1752
1753 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1754 LegalVT.getVectorNumElements());
1755
1756 if (!Mask.empty() && NumOfDests.isValid()) {
1757 // Try to perform better estimation of the permutation.
1758 // 1. Split the source/destination vectors into real registers.
1759 // 2. Do the mask analysis to identify which real registers are
1760 // permuted. If more than 1 source registers are used for the
1761 // destination register building, the cost for this destination register
1762 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1763 // source register is used, build mask and calculate the cost as a cost
1764 // of PermuteSingleSrc.
1765 // Also, for the single register permute we try to identify if the
1766 // destination register is just a copy of the source register or the
1767 // copy of the previous destination register (the cost is
1768 // TTI::TCC_Basic). If the source register is just reused, the cost for
1769 // this operation is TTI::TCC_Free.
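// e.g. a 16-element mask over v8-legal registers is analysed per 8-element
// destination register: identity sub-masks are free, a repeat of the
// previous register's sub-mask is a plain copy (TTI::TCC_Basic), and the
// rest are costed as PermuteSingleSrc/PermuteTwoSrc of the legal type.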
1770 NumOfDests =
1771 getTypeLegalizationCost(
1772 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1773 .first;
1774 unsigned E = NumOfDests.getValue();
1775 unsigned NormalizedVF =
1776 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1777 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1778 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1779 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1780 copy(Mask, NormalizedMask.begin());
1781 unsigned PrevSrcReg = 0;
1782 ArrayRef<int> PrevRegMask;
1783 InstructionCost Cost = 0;
1784 processShuffleMasks(
1785 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1786 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1787 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1788 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1789 // Check if the previous register can be just copied to the next
1790 // one.
1791 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1792 PrevRegMask != RegMask)
1793 Cost +=
1794 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1795 SingleOpTy, RegMask, CostKind, 0, nullptr);
1796 else
1797 // Just a copy of previous destination register.
1798 Cost += TTI::TCC_Basic;
1799 return;
1800 }
1801 if (SrcReg != DestReg &&
1802 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1803 // Just a copy of the source register.
1804 Cost += TTI::TCC_Basic;
1805 }
1806 PrevSrcReg = SrcReg;
1807 PrevRegMask = RegMask;
1808 },
1809 [this, SingleOpTy, CostKind,
1810 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1811 unsigned /*Unused*/, bool /*Unused*/) {
1812 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1813 SingleOpTy, RegMask, CostKind, 0, nullptr);
1814 });
1815 return Cost;
1816 }
1817
1818 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1819 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1820 SingleOpTy, {}, CostKind, 0,
1821 nullptr);
1822 }
1823
1824 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1825 SubTp);
1826 }
1827
1828 // If we're just moving a single element around (probably as an alternative to
1829 // extracting it), we can assume this is cheap.
1830 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1831 return TTI::TCC_Basic;
1832
1833 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1834 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1835 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1837 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1838 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1839 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1840 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1841 };
1842
1843 if (ST->hasVBMI())
1844 if (const auto *Entry =
1845 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1846 if (auto KindCost = Entry->Cost[CostKind])
1847 return LT.first * *KindCost;
1848
1849 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1850 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1851 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1852 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1853
1854 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1855 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1857 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1858 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1859
1860 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1861 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1863 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1864 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1865
1866 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1867 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1869 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1870 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1871
1872 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1873 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1874
1875 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1876 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1877 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1878 };
1879
1880 if (ST->hasBWI())
1881 if (const auto *Entry =
1882 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1883 if (auto KindCost = Entry->Cost[CostKind])
1884 return LT.first * *KindCost;
1885
1886 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1887 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1888 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1889 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1890 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1891 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1892 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1893 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1894 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1895 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1896 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1898 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1899 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1900 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1901
1902 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1903 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1904 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1905 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1906 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1907 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1908 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1909
1910 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1911 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1917 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1918 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1919 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1920 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1921
1922 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1923 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1924 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1925 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1926 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1927 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1929 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1930 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1933 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1934 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1935
1936 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1937 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1938 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1939 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1940 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1941 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1942 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1943 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1944 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1945 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1946 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1947 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1948
1949 // FIXME: This just applies the type legalization cost rules above
1950 // assuming these completely split.
1951 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1952 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1953 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1954 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1955 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1956 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1957
1958 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1959 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1960 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1961 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1962 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1963 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1964 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1965 };
1966
1967 if (ST->hasAVX512())
1968 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1969 if (auto KindCost = Entry->Cost[CostKind])
1970 return LT.first * *KindCost;
1971
1972 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1973 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1974 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1975 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1976
1977 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1978 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1979 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1980 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1981 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1982 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1983 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1984 };
1985
1986 if (IsInLaneShuffle && ST->hasAVX2())
1987 if (const auto *Entry =
1988 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1989 if (auto KindCost = Entry->Cost[CostKind])
1990 return LT.first * *KindCost;
1991
1992 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1993 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1994 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1995 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
1996 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
1997 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
1998 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1999 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2000 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2001 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2002 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2003
2004 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2005 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2006 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2007 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2008 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2009 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2010 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2011
2012 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2013 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2014 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2015
2016 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2017 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2018 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2019 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2020 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2021
2022 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2023 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2024 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2025 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2026 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2027 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2028 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2029
2030 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2031 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2032 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2033 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2034 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2035 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2036 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2037 };
2038
2039 if (ST->hasAVX2())
2040 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2041 if (auto KindCost = Entry->Cost[CostKind])
2042 return LT.first * *KindCost;
2043
2044 static const CostKindTblEntry XOPShuffleTbl[] = {
2045 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2046 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2047 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2048 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2049 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2050 // + vinsertf128
2051 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2052 // + vinsertf128
2053
2054 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2055 // + vinsertf128
2056
2057 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2058 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2059 // + vinsertf128
2060 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2061 };
2062
2063 if (ST->hasXOP())
2064 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2065 if (auto KindCost = Entry->Cost[CostKind])
2066 return LT.first * *KindCost;
2067
2068 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2069 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2070 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2071 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2072 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2073
2074 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2075 // + vpor + vinsertf128
2076 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2077 // + vpor + vinsertf128
2078 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2079 // + vpor + vinsertf128
2080
2081 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2082 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2083 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2084 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2085 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2086 // + 2*vpor + vinsertf128
2087 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2088 // + 2*vpor + vinsertf128
2089 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2090 // + 2*vpor + vinsertf128
2091 };
2092
2093 if (IsInLaneShuffle && ST->hasAVX())
2094 if (const auto *Entry =
2095 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2096 if (auto KindCost = Entry->Cost[CostKind])
2097 return LT.first * *KindCost;
2098
2099 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2100 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2101 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2102 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2103 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2104 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2105 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2106 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2107
2108 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2109 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2110 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2111 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2112 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2113 // + vinsertf128
2114 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2115 // + vinsertf128
2116 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2117 // + vinsertf128
2118
2119 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2120 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2121 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2122 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2123 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2124 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2125 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2126
2127 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2128 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2129 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2130 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2131 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2132 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2133 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2134
2135 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2136 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2137 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2138 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2139 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2140 // + 2*por + vinsertf128
2141 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2142 // + 2*por + vinsertf128
2143 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2144 // + 2*por + vinsertf128
2145
2146 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2147 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2148 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2149 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2150 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2151 // + 4*por + vinsertf128
2152 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2153 // + 4*por + vinsertf128
2154 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2155 // + 4*por + vinsertf128
2156 };
2157
2158 if (ST->hasAVX())
2159 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2160 if (auto KindCost = Entry->Cost[CostKind])
2161 return LT.first * *KindCost;
2162
2163 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2164 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2165 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2166 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2167 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2168 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2169 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2170 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2171 };
2172
2173 if (ST->hasSSE41())
2174 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2175 if (auto KindCost = Entry->Cost[CostKind])
2176 return LT.first * *KindCost;
2177
2178 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2179 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2180 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2181 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2182
2183 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2184 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2185 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2186
2187 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2188 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2189 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2190
2191 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2192 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2193 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2194 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2195 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2196
2197 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2198 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2199 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2200
2201 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2202 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2203 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2204 };
2205
2206 if (ST->hasSSSE3())
2207 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2208 if (auto KindCost = Entry->Cost[CostKind])
2209 return LT.first * *KindCost;
2210
2211 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2212 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2213 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2214 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2215 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2216 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2217 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2218
2219 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2220 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2221 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2222 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2223 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2224 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2225 // + 2*pshufd + 2*unpck + packus
2226
2227 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2228 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2229 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2230 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2231 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2232 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2233
2234 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2235 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2236 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2237 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2238 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2239 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2240
2241 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2242 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2243 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2244 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2245 // + pshufd/unpck
2246 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2247 // + pshufd/unpck
2248 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2249 // + 2*pshufd + 2*unpck + 2*packus
2250
2251 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2252 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2253 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2254 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2255 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2256 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2257 };
2258
2259 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2260 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2261 };
2262
2263 if (ST->hasSSE2()) {
2264 bool IsLoad =
2265 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2266 if (ST->hasSSE3() && IsLoad)
2267 if (const auto *Entry =
2268 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2269 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2270 LT.second.getVectorElementCount()) &&
2271 "Table entry missing from isLegalBroadcastLoad()");
2272 return LT.first * Entry->Cost;
2273 }
2274
2275 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2276 if (auto KindCost = Entry->Cost[CostKind])
2277 return LT.first * *KindCost;
2278 }
2279
2280 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2281 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2282 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2283 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2284 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2285 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2286 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2287 };
2288
2289 if (ST->hasSSE1()) {
2290 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2291 // SHUFPS: both pairs must come from the same source register.
2292 auto MatchSHUFPS = [](int X, int Y) {
2293 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2294 };
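// e.g. mask <0,2,4,6>: the low pair (0,2) reads source 0 and the high
// pair (4,6) reads source 1, so a single shufps suffices.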
2295 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2296 return 1;
2297 }
2298 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2299 if (auto KindCost = Entry->Cost[CostKind])
2300 return LT.first * *KindCost;
2301 }
2302
2303 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2304 SubTp);
2305}
2306
2307InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2308 Type *Src,
2309 TTI::CastContextHint CCH,
2310 TTI::TargetCostKind CostKind,
2311 const Instruction *I) const {
2312 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2313 assert(ISD && "Invalid opcode");
2314
2315 // The cost tables include both specific, custom (non-legal) src/dst type
2316 // conversions and generic, legalized types. We test for custom conversions
2317 // first, before falling back to legalization.
2318 // FIXME: Need a better design of the cost table to handle non-simple types of
2319 // potential massive combinations (elem_num x src_type x dst_type).
2320 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2321 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2322 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2323
2324 // Mask sign extend has an instruction.
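// e.g. v16i1 -> v16i8 is a single vpmovm2b.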
2325 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2326 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2327 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2328 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2329 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2330 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2331 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2332 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2342
2343 // Mask zero extend is a sext + shift.
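// e.g. v8i1 -> v8i16 is vpmovm2w followed by vpsrlw $15.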
2344 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2345 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2347 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2349 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2351 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2360 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2361
2362 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2363 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2364 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2365 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2366 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2367 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2368 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2369 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2378 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2379
2380 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2382 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2383 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2384 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2385 };
2386
2387 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2388 // Mask sign extend has an instruction.
2389 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2397
2398 // Mask zero extend is a sext + shift.
 2399 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
 2400 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
 2401 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
 2402 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
 2403 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
 2404 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
 2405 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
 2406 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2407
2408 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2410 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2411 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2412 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2413 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2414 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2415 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2416
2417 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2418 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2419
2420 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2421 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2422
2423 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2424 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2425
2426 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2428 };
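// Illustrative sketch (added for exposition; not taken from this file):
// AVX512DQ is the first feature level with direct 64-bit integer <-> FP
// conversion instructions, which is why every SINT/UINT_TO_FP and
// FP_TO_SINT/UINT row above is a single instruction, e.g. for
// v8i64 -> v8f64:
//   vcvtqq2pd %zmm0, %zmm0
// Without DQ the AVX512F table below charges 5 for the unsigned case.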
2429
2430 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2431 // 256-bit wide vectors.
2432
2433 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2434 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2435 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2436 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2437 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2438 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2439 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2440 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2441
2442 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2443 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2444 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2445 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2446 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2447 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2448 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2449 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2453 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2454 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2456 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2457 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2458 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2459 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2460 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2461 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2462 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2463 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2464 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2465 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2466 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2467 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2468 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2469 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2470 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2471 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2472 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2473 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2474 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
 2475 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2476
2477 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2478 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2480
 2481 // Sign extend is zmm vpternlogd+vpmovdb.
 2482 // Zero extend is zmm broadcast load+vpmovdb.
2483 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2484 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2485 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2486 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2487 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2488 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2489 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2490 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2491
 2492 // Sign extend is zmm vpternlogd+vpmovdw.
 2493 // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw.
2494 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2499 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2501 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2502
2503 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2504 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2505 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2506 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2507 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2508 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2509 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2510 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2511 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2512 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2513
2514 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2517 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2518
2519 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2520 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2522 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2528 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2529
2530 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2531 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2532
2533 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2534 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2535 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2536 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2537 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2538 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2539 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2540 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2541
2542 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2543 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2544 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2545 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2546 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2547 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2548 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2549 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2550 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2551 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2552
2553 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2554 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2555 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2556 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2557 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2558 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2559 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2560 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2562 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2563 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2564
2565 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2566 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2567 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2568 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2569 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2570 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2571 };
2572
 2573 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2574 // Mask sign extend has an instruction.
2575 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2590 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2591 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2592
2593 // Mask zero extend is a sext + shift.
2594 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2598 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2599 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2600 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2611
2612 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2613 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2614 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2615 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2619 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2627 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2628 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2629
2630 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2631 };
2632
2633 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2634 // Mask sign extend has an instruction.
2635 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2643
2644 // Mask zero extend is a sext + shift.
2645 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2646 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2653
2654 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2655 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2656 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2657 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2658 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2659 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2660 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2661 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2662
2663 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2664 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2665 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2666 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2667
2668 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2669 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2670 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2671 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2672
2673 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2674 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2675 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2676 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2677
2678 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2679 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2680 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2681 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2682 };
2683
2684 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2685 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2686 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2687 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2688 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2689 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2690 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2691 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2692 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2693 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2694 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2695 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2696 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2697 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2698 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2699 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2700 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2701 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2703
2704 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2705 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2706 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2708 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2710 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2712 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2713 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2714
2715 // sign extend is vpcmpeq+maskedmove+vpmovdw
2716 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2717 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2718 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2719 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2721 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2725
2726 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2727 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2728 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2729 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2732 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2733 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2734
2735 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2736 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2737 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2738 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2739
2740 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2748 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2749 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2752
2753 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2754 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2755 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2756 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2757
2758 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2759 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2760 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2761 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2762 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2763 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2764 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2765 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2766 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2767 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2769 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2771
2772 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2773 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2774 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2775
2776 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2781 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2783 };
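// Illustrative sketch (assumed lowering, added for exposition; not taken
// from this file): the vpternlogd/vpternlogq rows above model mask sign
// extension by writing -1 directly under the mask, e.g. a plausible
// lowering for v4i1 -> v4i32:
//   vpternlogd $0xff, %xmm0, %xmm0, %xmm0 {%k1} {z} ; -1 where mask set
// The matching zero extend appends vpsrld $31 to turn each -1 into +1.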
2784
2785 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2786 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2787 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2788 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2789 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2790 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2791 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2792
2793 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2803 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2804 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2807
2808 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2809
2810 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2811 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2812 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2813 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2814 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2815 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2816 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2817 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2820 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2822
2823 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2824 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2825
2826 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2827 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2828 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2829 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2832 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2833 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2838 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2839
2840 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2847
2848 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2858 };
2859
2860 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2861 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2862 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2863 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2864 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2865 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2866 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2867
2868 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2869 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2870 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2871 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2872 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2873 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2874 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2875 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2876 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2877 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2878 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2879 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2880
2881 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2882 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2883 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2884 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2885 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2886
2887 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2888 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2889 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2890 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2891 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2892 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2893 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2894 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2895
2896 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2906 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2907 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2908
2909 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2920 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2921 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2922 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2924 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2925 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2926
2927 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2931 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2932 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2933 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2934 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2935 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2936 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2937 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2938
2939 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2944 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2945 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2946 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2947 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2948 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2950 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2951 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2952
2953 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2955 };
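// Illustrative sketch (assumed lowering, added for exposition; not taken
// from this file): AVX1 has no 256-bit integer units, so the YMM integer
// extends above pay a split/merge penalty, e.g. a plausible lowering for
// sext v8i16 -> v8i32:
//   vpmovsxwd %xmm0, %xmm1          ; low four elements
//   vpshufd   $0xee, %xmm0, %xmm0   ; move high half down
//   vpmovsxwd %xmm0, %xmm0          ; high four elements
//   vinsertf128 $1, %xmm0, %ymm1, %ymm0
// in line with the uniform throughput cost of 3.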
2956
2957 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2958 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2961 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2962 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2963 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2964 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2965 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2966 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2967 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2968 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2969 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2970
2971 // These truncates end up widening elements.
 2972 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
 2973 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
 2974 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2975
2976 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2977 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2978 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2979
2980 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2991
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3005 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3006
3007 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3016 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3017
3018 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3028 };
3029
3030 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
 3031 // These are somewhat magic numbers, justified by comparing the
 3032 // output of llvm-mca for our various supported scheduler models
 3033 // and taking the worst-case scenario.
3034 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3035 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3036 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3037 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3046
3047 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3048 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3049 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3060
3061 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3062 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3063 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3064 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3071
3072 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3073 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3074 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3075 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3082
3083 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3084 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3085 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3086 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3087 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3088 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3089 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3090 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3091 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3092 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3093 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3094 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3095
3096 // These truncates are really widening elements.
3097 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3098 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3099 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3100 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3101 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3102 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3103
3104 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3105 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3106 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3107 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3108 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3109 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3110 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3111 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3112 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3113 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3114 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3115 };
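// Illustrative sketch (added for exposition; not taken from this file):
// the "somewhat magic numbers" above can be reproduced with runs along the
// lines of
//   llvm-mca -mcpu=<cpu> --timeline conv.s
// over the expanded conversion sequences, one run per supported scheduler
// model, keeping the worst-case figure.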
3116
3117 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3118 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3119 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3120 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3121 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3122 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3123 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3124 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3125 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3126 };
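// Illustrative sketch (added for exposition; not taken from this file):
// F16C only converts fp16 <-> fp32, so the fp64 rows above are modelled as
// a chain through fp32, e.g. for v4f16 -> v4f64:
//   vcvtph2ps %xmm0, %xmm0   ; f16 -> f32
//   vcvtps2pd %xmm0, %ymm0   ; f32 -> f64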
3127
3128 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3129 EVT SrcTy = TLI->getValueType(DL, Src);
3130 EVT DstTy = TLI->getValueType(DL, Dst);
3131
3132 // The function getSimpleVT only handles simple value types.
3133 if (SrcTy.isSimple() && DstTy.isSimple()) {
3134 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3135 MVT SimpleDstTy = DstTy.getSimpleVT();
3136
3137 if (ST->useAVX512Regs()) {
3138 if (ST->hasBWI())
3139 if (const auto *Entry = ConvertCostTableLookup(
3140 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3141 if (auto KindCost = Entry->Cost[CostKind])
3142 return *KindCost;
3143
3144 if (ST->hasDQI())
3145 if (const auto *Entry = ConvertCostTableLookup(
3146 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3147 if (auto KindCost = Entry->Cost[CostKind])
3148 return *KindCost;
3149
3150 if (ST->hasAVX512())
3151 if (const auto *Entry = ConvertCostTableLookup(
3152 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3153 if (auto KindCost = Entry->Cost[CostKind])
3154 return *KindCost;
3155 }
3156
3157 if (ST->hasBWI())
3158 if (const auto *Entry = ConvertCostTableLookup(
3159 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3160 if (auto KindCost = Entry->Cost[CostKind])
3161 return *KindCost;
3162
3163 if (ST->hasDQI())
3164 if (const auto *Entry = ConvertCostTableLookup(
3165 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3166 if (auto KindCost = Entry->Cost[CostKind])
3167 return *KindCost;
3168
3169 if (ST->hasAVX512())
3170 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3171 SimpleDstTy, SimpleSrcTy))
3172 if (auto KindCost = Entry->Cost[CostKind])
3173 return *KindCost;
3174
3175 if (ST->hasAVX2()) {
3176 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3177 SimpleDstTy, SimpleSrcTy))
3178 if (auto KindCost = Entry->Cost[CostKind])
3179 return *KindCost;
3180 }
3181
3182 if (ST->hasAVX()) {
3183 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3184 SimpleDstTy, SimpleSrcTy))
3185 if (auto KindCost = Entry->Cost[CostKind])
3186 return *KindCost;
3187 }
3188
3189 if (ST->hasF16C()) {
3190 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3191 SimpleDstTy, SimpleSrcTy))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return *KindCost;
3194 }
3195
3196 if (ST->hasSSE41()) {
3197 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3198 SimpleDstTy, SimpleSrcTy))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return *KindCost;
3201 }
3202
3203 if (ST->hasSSE2()) {
3204 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3205 SimpleDstTy, SimpleSrcTy))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return *KindCost;
3208 }
3209
3210 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3211 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3212 // fp16 conversions not covered by any table entries require a libcall.
3213 // Return a large (arbitrary) number to model this.
3214 return InstructionCost(64);
3215 }
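// Illustrative note (added for exposition; not taken from this file):
// without F16C or AVX512 these conversions lower to compiler-rt libcalls
// such as __extendhfsf2/__truncsfhf2, so the flat cost of 64 mostly serves
// to keep the vectorizers away from them.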
3216 }
3217
3218 // Fall back to legalized types.
3219 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3220 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3221
 3222 // If we're truncating to the same legalized type - just assume it's free.
3223 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3224 return TTI::TCC_Free;
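// Illustrative example (added for exposition; not taken from this file):
// the early out fires whenever both sides legalize to the same MVT, e.g.
//   %r = trunc i4 %x to i2
// where i4 and i2 both promote to MVT::i8, so the truncate vanishes during
// legalization and is reported as free.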
3225
3226 if (ST->useAVX512Regs()) {
3227 if (ST->hasBWI())
3228 if (const auto *Entry = ConvertCostTableLookup(
3229 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3230 if (auto KindCost = Entry->Cost[CostKind])
3231 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3232
3233 if (ST->hasDQI())
3234 if (const auto *Entry = ConvertCostTableLookup(
3235 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3236 if (auto KindCost = Entry->Cost[CostKind])
3237 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3238
3239 if (ST->hasAVX512())
3240 if (const auto *Entry = ConvertCostTableLookup(
3241 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3242 if (auto KindCost = Entry->Cost[CostKind])
3243 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3244 }
3245
3246 if (ST->hasBWI())
3247 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3248 LTDest.second, LTSrc.second))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3251
3252 if (ST->hasDQI())
3253 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3254 LTDest.second, LTSrc.second))
3255 if (auto KindCost = Entry->Cost[CostKind])
3256 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3257
3258 if (ST->hasAVX512())
3259 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3260 LTDest.second, LTSrc.second))
3261 if (auto KindCost = Entry->Cost[CostKind])
3262 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3263
3264 if (ST->hasAVX2())
3265 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3266 LTDest.second, LTSrc.second))
3267 if (auto KindCost = Entry->Cost[CostKind])
3268 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3269
3270 if (ST->hasAVX())
3271 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3272 LTDest.second, LTSrc.second))
3273 if (auto KindCost = Entry->Cost[CostKind])
3274 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3275
3276 if (ST->hasF16C()) {
3277 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3278 LTDest.second, LTSrc.second))
3279 if (auto KindCost = Entry->Cost[CostKind])
3280 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3281 }
3282
3283 if (ST->hasSSE41())
3284 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3285 LTDest.second, LTSrc.second))
3286 if (auto KindCost = Entry->Cost[CostKind])
3287 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3288
3289 if (ST->hasSSE2())
3290 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3291 LTDest.second, LTSrc.second))
3292 if (auto KindCost = Entry->Cost[CostKind])
3293 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3294
 3295 // Fallback for i8/i16 sitofp/uitofp cases: extend the source to i32
 3296 // first and then convert from there.
3297 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3298 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3299 Type *ExtSrc = Src->getWithNewBitWidth(32);
3300 unsigned ExtOpc =
3301 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3302
3303 // For scalar loads the extend would be free.
3304 InstructionCost ExtCost = 0;
3305 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3306 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3307
 3308 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
 3309 TTI::CastContextHint::None, CostKind);
 3310 }
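// Illustrative example (added for exposition; not taken from this file):
// 'sitofp <4 x i16> %x to <4 x float>' with no direct table entry is priced
// as cost(sext to <4 x i32>) + cost(sitofp <4 x i32> -> <4 x float>),
// mirroring how the backend widens the operand before cvtdq2ps.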
3311
 3312 // Fallback for fptosi/fptoui i8/i16 cases: convert to i32 first and
 3313 // truncate the result.
3314 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3315 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3316 Type *TruncDst = Dst->getWithNewBitWidth(32);
3317 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
 3318 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
 3319 TTI::CastContextHint::None, CostKind);
 3320 }
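// Illustrative example (added for exposition; not taken from this file):
// symmetrically, 'fptosi <4 x float> %x to <4 x i8>' is priced as a
// cvttps2dq-style conversion to <4 x i32> plus a truncate back to <4 x i8>.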
3321
3322 // TODO: Allow non-throughput costs that aren't binary.
 3323 auto AdjustCost = [&CostKind](InstructionCost Cost,
 3324 InstructionCost N = 1) -> InstructionCost {
 3325 if (CostKind != TTI::TCK_RecipThroughput)
 3326 return Cost == 0 ? 0 : N;
 3327 return Cost * N;
 3328 };
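// Worked example (added for exposition; not taken from this file): with
// Cost == 3 and N == 2 the lambda returns 6 for TCK_RecipThroughput but
// clamps to 2 for the other cost kinds, while a free cast (Cost == 0)
// stays free for every kind.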
3329 return AdjustCost(
3330 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3331}
3332
 3333 InstructionCost X86TTIImpl::getCmpSelInstrCost(
 3334 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
 3335 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
 3336 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3337 // Early out if this type isn't scalar/vector integer/float.
3338 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3339 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3340 Op1Info, Op2Info, I);
3341
3342 // Legalize the type.
3343 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3344
3345 MVT MTy = LT.second;
3346
3347 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3348 assert(ISD && "Invalid opcode");
3349
3350 InstructionCost ExtraCost = 0;
3351 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3352 // Some vector comparison predicates cost extra instructions.
3353 // TODO: Adjust ExtraCost based on CostKind?
3354 // TODO: Should we invert this and assume worst case cmp costs
3355 // and reduce for particular predicates?
3356 if (MTy.isVector() &&
3357 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3358 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3359 ST->hasBWI())) {
 3360 // Fall back to I's predicate if a specific one wasn't specified.
3361 CmpInst::Predicate Pred = VecPred;
 3362 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
 3363 Pred == CmpInst::BAD_FCMP_PREDICATE))
 3364 Pred = cast<CmpInst>(I)->getPredicate();
3365
3366 bool CmpWithConstant = false;
3367 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3368 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3369
 3370 switch (Pred) {
 3371 case CmpInst::Predicate::ICMP_NE:
 3372 // xor(cmpeq(x,y),-1)
3373 ExtraCost = CmpWithConstant ? 0 : 1;
 3374 break;
 3375 case CmpInst::Predicate::ICMP_SGE:
 3376 case CmpInst::Predicate::ICMP_SLE:
 3377 // xor(cmpgt(x,y),-1)
3378 ExtraCost = CmpWithConstant ? 0 : 1;
 3379 break;
 3380 case CmpInst::Predicate::ICMP_ULT:
 3381 case CmpInst::Predicate::ICMP_UGT:
 3382 // cmpgt(xor(x,signbit),xor(y,signbit))
3383 // xor(cmpeq(pmaxu(x,y),x),-1)
3384 ExtraCost = CmpWithConstant ? 1 : 2;
 3385 break;
 3386 case CmpInst::Predicate::ICMP_ULE:
 3387 case CmpInst::Predicate::ICMP_UGE:
 3388 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3389 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3390 // cmpeq(psubus(x,y),0)
3391 // cmpeq(pminu(x,y),x)
3392 ExtraCost = 1;
3393 } else {
3394 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3395 ExtraCost = CmpWithConstant ? 2 : 3;
3396 }
 3397 break;
 3398 case CmpInst::Predicate::FCMP_ONE:
 3399 case CmpInst::Predicate::FCMP_UEQ:
 3400 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3401 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3402 if (CondTy && !ST->hasAVX())
 3403 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
 3404 CmpInst::Predicate::FCMP_UNO, CostKind,
 3405 Op1Info, Op2Info) +
 3406 getCmpSelInstrCost(Opcode, ValTy, CondTy,
 3407 CmpInst::Predicate::FCMP_OEQ, CostKind,
 3408 Op1Info, Op2Info) +
3409 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3410
 3411 break;
 3412 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
 3413 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
 3414 // Assume worst case scenario and add the maximum extra cost.
3415 ExtraCost = 3;
3416 break;
3417 default:
3418 break;
3419 }
3420 }
3421 }
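// Illustrative example (added for exposition; not taken from this file):
// pre-AVX512, 'icmp ugt <4 x i32>' has no unsigned compare instruction, so
// it is modelled via the signed-domain trick: flip the sign bit of both
// operands with pxor and then use the signed pcmpgtd, giving ExtraCost = 2
// on top of the base SETCC cost from the tables below.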
3422
3423 static const CostKindTblEntry SLMCostTbl[] = {
3424 // slm pcmpeq/pcmpgt throughput is 2
3425 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3426 // slm pblendvb/blendvpd/blendvps throughput is 4
3427 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3428 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3429 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3430 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3431 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3432 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3433 };
3434
3435 static const CostKindTblEntry AVX512BWCostTbl[] = {
3436 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3437 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3438 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3439 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3440
3441 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3442 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3443 };
3444
3445 static const CostKindTblEntry AVX512CostTbl[] = {
3446 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3447 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3448 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3449 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3450
3451 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3452 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3453 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3454 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3455 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3456 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3457 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3458
3459 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3460 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3461 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3462 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3463 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3464 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3465 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3466 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3467 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3468 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3469 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3470 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3471 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3472 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3473
3474 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3475 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3476 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3477 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3478 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3479 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3480 };
3481
3482 static const CostKindTblEntry AVX2CostTbl[] = {
3483 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3484 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3485 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3486 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3487 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3488 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3489
3490 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3491 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3492 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3493 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3494
3495 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3496 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3497 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3498 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3499 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3500 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3501 };
3502
3503 static const CostKindTblEntry XOPCostTbl[] = {
3504 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3505 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3506 };
3507
3508 static const CostKindTblEntry AVX1CostTbl[] = {
3509 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3510 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3511 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3512 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3513 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3514 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3515
3516 // AVX1 does not support 8-wide integer compare.
3517 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3518 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3519 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3520 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3521
3522 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3523 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3524 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3525 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3526 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3527 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3528 };
3529
3530 static const CostKindTblEntry SSE42CostTbl[] = {
3531 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3532 };
3533
3534 static const CostKindTblEntry SSE41CostTbl[] = {
3535 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3536 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3537
3538 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3539 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3540 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3541 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3542 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3543 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3544 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3545 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3546 };
3547
3548 static const CostKindTblEntry SSE2CostTbl[] = {
3549 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3550 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3551
3552 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3553 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3554 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3555 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3556
3557 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3558 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3559 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3560 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3561 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3562 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3563 };
3564
3565 static const CostKindTblEntry SSE1CostTbl[] = {
3566 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3567 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3568
3569 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3570 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3571 };
3572
3573 if (ST->useSLMArithCosts())
3574 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3575 if (auto KindCost = Entry->Cost[CostKind])
3576 return LT.first * (ExtraCost + *KindCost);
3577
3578 if (ST->hasBWI())
3579 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3580 if (auto KindCost = Entry->Cost[CostKind])
3581 return LT.first * (ExtraCost + *KindCost);
3582
3583 if (ST->hasAVX512())
3584 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3585 if (auto KindCost = Entry->Cost[CostKind])
3586 return LT.first * (ExtraCost + *KindCost);
3587
3588 if (ST->hasAVX2())
3589 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3590 if (auto KindCost = Entry->Cost[CostKind])
3591 return LT.first * (ExtraCost + *KindCost);
3592
3593 if (ST->hasXOP())
3594 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3595 if (auto KindCost = Entry->Cost[CostKind])
3596 return LT.first * (ExtraCost + *KindCost);
3597
3598 if (ST->hasAVX())
3599 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3600 if (auto KindCost = Entry->Cost[CostKind])
3601 return LT.first * (ExtraCost + *KindCost);
3602
3603 if (ST->hasSSE42())
3604 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3605 if (auto KindCost = Entry->Cost[CostKind])
3606 return LT.first * (ExtraCost + *KindCost);
3607
3608 if (ST->hasSSE41())
3609 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3610 if (auto KindCost = Entry->Cost[CostKind])
3611 return LT.first * (ExtraCost + *KindCost);
3612
3613 if (ST->hasSSE2())
3614 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3615 if (auto KindCost = Entry->Cost[CostKind])
3616 return LT.first * (ExtraCost + *KindCost);
3617
3618 if (ST->hasSSE1())
3619 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3620 if (auto KindCost = Entry->Cost[CostKind])
3621 return LT.first * (ExtraCost + *KindCost);
3622
3623 // Assume a 3cy latency for fp select ops.
3624 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3625 if (ValTy->getScalarType()->isFloatingPointTy())
3626 return 3;
3627
3628 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3629 Op1Info, Op2Info, I);
3630}
3631
3632InstructionCost
3633X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3634 TTI::TargetCostKind CostKind) const {
3637 // Costs should match the codegen from:
3638 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3639 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3640 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3641 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3642 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3643
3644 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3645 // specialized in these tables yet.
3646 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3647 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3648 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3649 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3650 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3651 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3652 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3653 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3654 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3655 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3656 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3657 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3658 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3659 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3660 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3661 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3662 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3663 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3664 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3665 };
3666 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3667 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3668 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3669 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3670 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3671 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3672 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3673 };
3674 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3675 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3676 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3677 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3678 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3679 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3680 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3681 };
3682 static const CostKindTblEntry AVX512CDCostTbl[] = {
3683 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3684 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3685 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3686 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3687 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3688 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3689 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3690 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3691 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3692 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3693 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3694 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3695
3696 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3697 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3698 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3699 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3700 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3701 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3702 };
3703 static const CostKindTblEntry AVX512BWCostTbl[] = {
3704 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3705 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3706 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3707 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3708 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3709 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3710 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3711 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3712 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3713 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3714 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3715 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3716 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3717 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3718 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3719 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3720 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3721 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3722 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3723 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3724 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3725 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3726 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3727 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3728 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3729 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3730 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3731 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3732 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3733 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3734 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3735 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3736 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3737 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3738 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3739 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3740 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3741 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3742 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3743 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3744 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3745 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3746 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3747 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3748 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3749 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3750 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3751 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3752 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3753 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3754 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3755 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3756 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3757 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3758 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3759 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3760 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3761 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3762 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3763 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3764 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3765 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3766 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3767 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3768 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3769 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3770 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3771 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3772 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3773 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3774 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3775 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3776 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3777 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3778 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3779 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3780 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3781 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3782 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3783 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3784 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3785 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3786 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3787 };
3788 static const CostKindTblEntry AVX512CostTbl[] = {
3789 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3790 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3791 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3792 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3793 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3794 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3795 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3796 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3797 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3798 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3799 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3800 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3801 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3802 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3803 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3804 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3805 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3806 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3807 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3808 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3809 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3810 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3811 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3812 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3813 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3814 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3815 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3816 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3817 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3818 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3819 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3820 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3821 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3822 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3823 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3824 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3825 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3826 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3827 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3828 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3829 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3830 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3831 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3832 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3833 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3834 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3835 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3836 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3837 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3838 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3839 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3840 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3841 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3842 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3843 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3844 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3845 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3846 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3847 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3848 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3849 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3850 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3851 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3852 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3853 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3854 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3855 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3856 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3857 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3858 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3859 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3860 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3861 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3862 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3863 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3864 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3865 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3866 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3867 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3868 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3869 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3870 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3871 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3872 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3873 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3874 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3875 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3876 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3877 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3878 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3879 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3880 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3881 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3882 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3883 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3884 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3885 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3886 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3887 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3888 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3889 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3890 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3891 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3892 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3893 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3894 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3895 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3896 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3897 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3898 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3899 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3900 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3901 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3902 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3903 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3904 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3905 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3906 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3907 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3908 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3909 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3910 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3911 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3912 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3913 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3914 };
3915 static const CostKindTblEntry XOPCostTbl[] = {
3916 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3917 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3918 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3919 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3920 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3921 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3922 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3923 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3924 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3925 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3926 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3927 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3928 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
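// A sketch of why variable ROTR is dearer (illustrative IR, assumed types):
//   %r = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %y)
// becomes VPROTD(X, SUB(0, Y)); the extra subtraction is why the variable
// ROTR rows below generally cost more than the matching ROTL rows.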
3929 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3930 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3931 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3932 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3933 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3934 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3935 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3936 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3937 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3938 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3939 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3940 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3941 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3942 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3943 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3944 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3945 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3946 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3947 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3948 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3949 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3950 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3951 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3952 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3953 };
3954 static const CostKindTblEntry AVX2CostTbl[] = {
3955 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3956 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3957 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3958 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3959 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3960 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3961 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3962 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3963 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3964 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3965 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3966 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3967 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3968 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3969 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3970 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3971 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3972 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3973 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3974 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3975 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3976 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3977 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3978 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3979 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3980 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3981 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3982 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3983 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3984 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3985 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3986 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3987 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3988 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3989 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3990 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3991 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3992 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3993 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3994 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3995 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3996 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3997 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3998 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3999 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4000 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4001 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4002 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4003 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4004 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4005 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4006 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4007 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4008 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4009 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4010 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4011 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4012 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4013 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4014 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4015 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4016 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4017 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4018 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4019 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4020 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4021 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4022 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4023 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4024 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4025 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4026 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4027 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4028 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4029 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4030 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4031 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4032 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4033 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4034 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4035 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4036 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4037 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4038 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4039 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4040 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4041 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4042 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4043 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4044 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4045 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4046 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4047 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4048 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4049 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4050 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4051 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4052 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4053 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4054 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4055 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4056 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4057 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4058 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4059 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4060 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4061 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4062 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4063 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4064 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4065 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4066 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4067 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4068 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4069 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4070 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4071 };
4072 static const CostKindTblEntry AVX1CostTbl[] = {
4073 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4074 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4075 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4076 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4077 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4079 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4081 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4083 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4084 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4085 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4086 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4087 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4088 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4089 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4090 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4091 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4093 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4095 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4097 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4099 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4101 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4103 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4105 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4107 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4109 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4111 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4112 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4113 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4115 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4116 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4120 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4121 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4122 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4123 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4125 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4127 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4129 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4131 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4132 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4133 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4134 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4135 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4136 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4137 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4138 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4139 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4140 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4141 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4142 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4143 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4144 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4145 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4150 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4155 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4157 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4158 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4159 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4160 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4161 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4162 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4163 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4164 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4165 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4166 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4167 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4170 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4172 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4173 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4174 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4175 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4176 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4177 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4178 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4179 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4180 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4181 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4182 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4183 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4184 };
4185 static const CostKindTblEntry GFNICostTbl[] = {
4186 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4187 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4188 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4189 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4190 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4191 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4192 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4193 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4194 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4195 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4196 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4197 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4198 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4199 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4200 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4201 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4202 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4203 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4204 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4205 };
4206 static const CostKindTblEntry GLMCostTbl[] = {
4207 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4208 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4209 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4210 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4211 };
4212 static const CostKindTblEntry SLMCostTbl[] = {
4213 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4214 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4215 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4216 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4217 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4218 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4219 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4220 };
4221 static const CostKindTblEntry SSE42CostTbl[] = {
4222 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4223 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4224 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4225 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4226 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4227 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4228 };
4229 static const CostKindTblEntry SSE41CostTbl[] = {
4230 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4231 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4232 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4233 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4234 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4235 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4236 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4237 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4238 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4239 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4240 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4241 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4242 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4243 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4244 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4245 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4246 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4247 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4248 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4249 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4250 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4251 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4252 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4253 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4254 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4255 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4256 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4257 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4258 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4259 };
4260 static const CostKindTblEntry SSSE3CostTbl[] = {
4261 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4262 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4263 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4264 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4265 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4266 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4267 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4268 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4269 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4270 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4271 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4272 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4273 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4274 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4275 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4276 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4277 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4278 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4279 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4280 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4281 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4282 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4283 };
4284 static const CostKindTblEntry SSE2CostTbl[] = {
4285 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4286 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4287 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4288 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4289 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4290 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4291 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4292 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4293 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4294 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4295 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4296 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4297 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4298 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4299 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4300 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4301 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4302 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4303 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4304 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4305 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4306 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4307 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4308 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4309 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4310 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4311 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4312 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4313 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4314 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4315 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4316 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4317 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4318 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4319 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4320 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4321 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4322 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4323 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4324 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4325 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4326 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4327 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4328 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4329 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4330 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4331 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4332 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4333 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4334 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4335 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4336 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4337 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4338 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4339 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4340 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4341 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4342 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4343 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4344 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4345 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4346 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4347 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4348 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4349 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4350 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4351 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4352 };
4353 static const CostKindTblEntry SSE1CostTbl[] = {
4354 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4355 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4356 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4357 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4358 };
4359 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4360 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4361 };
4362 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4363 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4364 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4365 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4366 };
4367 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4368 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4369 };
4370 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4371 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4372 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4373 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4374 };
4375 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4376 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4377 };
4378 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4379 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4380 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4381 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4382 };
4383 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4384 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4385 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4386 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4387 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4388 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4389 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4390 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4391 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4392 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4393 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4394 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4395 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4396 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4397 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4398 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4399 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4400 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4401 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4402 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4403 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4404 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4405 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4406 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4407 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4408 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4409 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4410 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4411 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4412 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4413 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4414 };
4415 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4416 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4417 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4418 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4419 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4420 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4421 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4422 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4423 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4424 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4425 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4426 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4427 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4428 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4429 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4430 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4431 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4432 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4433 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4434 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4435 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4436 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4437 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4438 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4439 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4440 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4441 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4442 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4443 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4444 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4445 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4446 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4447 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4448 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4449 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4450 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4451 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4452 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4453 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4454 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4455 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4456 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4457 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4458 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4459 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4460 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4461 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4462 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4463 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4464 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4465 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4466 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4467 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4468 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4469 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4470 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4471 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4472 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4473 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4474 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4475 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4476 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4477 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4478 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4479 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4480 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4481 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4482 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4483 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4484 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4485 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4486 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4487 };
4488
4489 Type *RetTy = ICA.getReturnType();
4490 Type *OpTy = RetTy;
4491 Intrinsic::ID IID = ICA.getID();
4492 unsigned ISD = ISD::DELETED_NODE;
4493 switch (IID) {
4494 default:
4495 break;
4496 case Intrinsic::abs:
4497 ISD = ISD::ABS;
4498 break;
4499 case Intrinsic::bitreverse:
4500 ISD = ISD::BITREVERSE;
4501 break;
4502 case Intrinsic::bswap:
4503 ISD = ISD::BSWAP;
4504 break;
4505 case Intrinsic::ctlz:
4506 ISD = ISD::CTLZ;
4507 break;
4508 case Intrinsic::ctpop:
4509 ISD = ISD::CTPOP;
4510 break;
4511 case Intrinsic::cttz:
4512 ISD = ISD::CTTZ;
4513 break;
4514 case Intrinsic::fshl:
4515 ISD = ISD::FSHL;
4516 if (!ICA.isTypeBasedOnly()) {
4517 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4518 if (Args[0] == Args[1]) {
4519 ISD = ISD::ROTL;
4520 // Handle uniform constant rotation amounts.
4521 // TODO: Handle funnel-shift cases.
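// An assumed example: fshl with equal first operands and a constant
// amount, e.g. @llvm.fshl.i32(i32 %x, i32 %x, i32 5), is a rotate-left by
// an immediate, so it is costed as X86ISD::VROTLI rather than ISD::ROTL.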
4522 const APInt *Amt;
4523 if (Args[2] &&
4524 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4525 ISD = X86ISD::VROTLI;
4526 }
4527 }
4528 break;
4529 case Intrinsic::fshr:
4530 // FSHR has same costs so don't duplicate.
4531 ISD = ISD::FSHL;
4532 if (!ICA.isTypeBasedOnly()) {
4533 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4534 if (Args[0] == Args[1]) {
4535 ISD = ISD::ROTR;
4536 // Handle uniform constant rotation amount.
4537 // TODO: Handle funnel-shift cases.
4538 const APInt *Amt;
4539 if (Args[2] &&
4540 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4541 ISD = X86ISD::VROTLI;
4542 }
4543 }
4544 break;
4545 case Intrinsic::lrint:
4546 case Intrinsic::llrint: {
4547 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4548 // have the same costs as the CVTTP2SI (fptosi) instructions
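// For instance (illustrative, assumed types): @llvm.lrint.v4i32.v4f32
// lowers to cvtps2dq, the rounding counterpart of the truncating
// cvttps2dq used for fptosi, so the fptosi cast cost is a close match.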
4549 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4550 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4551 TTI::CastContextHint::None, CostKind);
4552 }
4553 case Intrinsic::maxnum:
4554 case Intrinsic::minnum:
4555 // FMINNUM has same costs so don't duplicate.
4556 ISD = ISD::FMAXNUM;
4557 break;
4558 case Intrinsic::sadd_sat:
4559 ISD = ISD::SADDSAT;
4560 break;
4561 case Intrinsic::smax:
4562 ISD = ISD::SMAX;
4563 break;
4564 case Intrinsic::smin:
4565 ISD = ISD::SMIN;
4566 break;
4567 case Intrinsic::ssub_sat:
4568 ISD = ISD::SSUBSAT;
4569 break;
4570 case Intrinsic::uadd_sat:
4571 ISD = ISD::UADDSAT;
4572 break;
4573 case Intrinsic::umax:
4574 ISD = ISD::UMAX;
4575 break;
4576 case Intrinsic::umin:
4577 ISD = ISD::UMIN;
4578 break;
4579 case Intrinsic::usub_sat:
4580 ISD = ISD::USUBSAT;
4581 break;
4582 case Intrinsic::sqrt:
4583 ISD = ISD::FSQRT;
4584 break;
4585 case Intrinsic::sadd_with_overflow:
4586 case Intrinsic::ssub_with_overflow:
4587 // SSUBO has same costs so don't duplicate.
4588 ISD = ISD::SADDO;
4589 OpTy = RetTy->getContainedType(0);
4590 break;
4591 case Intrinsic::uadd_with_overflow:
4592 case Intrinsic::usub_with_overflow:
4593 // USUBO has same costs so don't duplicate.
4594 ISD = ISD::UADDO;
4595 OpTy = RetTy->getContainedType(0);
4596 break;
4597 case Intrinsic::smul_with_overflow:
4598 ISD = ISD::SMULO;
4599 OpTy = RetTy->getContainedType(0);
4600 break;
4601 case Intrinsic::umul_with_overflow:
4602 ISD = ISD::UMULO;
4603 OpTy = RetTy->getContainedType(0);
4604 break;
4605 }
4606
4607 if (ISD != ISD::DELETED_NODE) {
4608 auto adjustTableCost = [&](int ISD, unsigned Cost,
4609 std::pair<InstructionCost, MVT> LT,
4610 FastMathFlags FMF) {
4611 InstructionCost LegalizationCost = LT.first;
4612 MVT MTy = LT.second;
4613
4614 // If there are no NaNs to deal with, then these are reduced to a
4615 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4616 // assume is used in the non-fast case.
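// e.g. (illustrative asm): with nnan, maxnum is a single
// vmaxps %ymm1, %ymm0, %ymm0; without nnan the tables above assume
// MAXPS + CMPUNORDPS + BLENDVPS.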
4617 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4618 if (FMF.noNaNs())
4619 return LegalizationCost * 1;
4620 }
4621
4622 // For cases where some ops can be folded into a load/store, assume free.
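// e.g. (illustrative asm): a scalar bswap whose only use is a store can
// emit a single movbe %eax, (%rdi), so it is treated as TCC_Free here.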
4623 if (MTy.isScalarInteger()) {
4624 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4625 if (const Instruction *II = ICA.getInst()) {
4626 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4627 return TTI::TCC_Free;
4628 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4629 if (LI->hasOneUse())
4630 return TTI::TCC_Free;
4631 }
4632 }
4633 }
4634 }
4635
4636 return LegalizationCost * (int)Cost;
4637 };
4638
4639 // Legalize the type.
4640 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4641 MVT MTy = LT.second;
4642
4643 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4644 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4645 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4646 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4647 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4648 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4649 if (Cst->isAllOnesValue())
4650 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4651 }
4652
4653 // FSQRT is a single instruction.
4654 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4655 return LT.first;
4656
4657 if (ST->useGLMDivSqrtCosts())
4658 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4659 if (auto KindCost = Entry->Cost[CostKind])
4660 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4661
4662 if (ST->useSLMArithCosts())
4663 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4664 if (auto KindCost = Entry->Cost[CostKind])
4665 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4666
4667 if (ST->hasVBMI2())
4668 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4669 if (auto KindCost = Entry->Cost[CostKind])
4670 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4671
4672 if (ST->hasBITALG())
4673 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4674 if (auto KindCost = Entry->Cost[CostKind])
4675 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4676
4677 if (ST->hasVPOPCNTDQ())
4678 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4679 if (auto KindCost = Entry->Cost[CostKind])
4680 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4681
4682 if (ST->hasGFNI())
4683 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4684 if (auto KindCost = Entry->Cost[CostKind])
4685 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4686
4687 if (ST->hasCDI())
4688 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4689 if (auto KindCost = Entry->Cost[CostKind])
4690 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4691
4692 if (ST->hasBWI())
4693 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4694 if (auto KindCost = Entry->Cost[CostKind])
4695 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4696
4697 if (ST->hasAVX512())
4698 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4699 if (auto KindCost = Entry->Cost[CostKind])
4700 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4701
4702 if (ST->hasXOP())
4703 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4704 if (auto KindCost = Entry->Cost[CostKind])
4705 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4706
4707 if (ST->hasAVX2())
4708 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4709 if (auto KindCost = Entry->Cost[CostKind])
4710 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4711
4712 if (ST->hasAVX())
4713 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4714 if (auto KindCost = Entry->Cost[CostKind])
4715 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4716
4717 if (ST->hasSSE42())
4718 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4719 if (auto KindCost = Entry->Cost[CostKind])
4720 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4721
4722 if (ST->hasSSE41())
4723 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4724 if (auto KindCost = Entry->Cost[CostKind])
4725 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4726
4727 if (ST->hasSSSE3())
4728 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (ST->hasSSE2())
4733 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (ST->hasSSE1())
4738 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (ST->hasBMI()) {
4743 if (ST->is64Bit())
4744 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4745 if (auto KindCost = Entry->Cost[CostKind])
4746 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4747
4748 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4749 if (auto KindCost = Entry->Cost[CostKind])
4750 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4751 }
4752
4753 if (ST->hasLZCNT()) {
4754 if (ST->is64Bit())
4755 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4756 if (auto KindCost = Entry->Cost[CostKind])
4757 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4758
4759 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4760 if (auto KindCost = Entry->Cost[CostKind])
4761 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4762 }
4763
4764 if (ST->hasPOPCNT()) {
4765 if (ST->is64Bit())
4766 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4767 if (auto KindCost = Entry->Cost[CostKind])
4768 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4769
4770 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4771 if (auto KindCost = Entry->Cost[CostKind])
4772 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4773 }
4774
4775 if (ST->is64Bit())
4776 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4777 if (auto KindCost = Entry->Cost[CostKind])
4778 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4779
4780 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4781 if (auto KindCost = Entry->Cost[CostKind])
4782 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4783
4784 // Without arg data, we need to compute the expanded costs of custom lowered
4785 // intrinsics to prevent use of the (very low) default costs.
4786 if (ICA.isTypeBasedOnly() &&
4787 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
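// The expansion costed below follows the usual funnel-shift lowering
// sketch: fshl(x, y, z) --> or(shl(x, z & (BW-1)), lshr(y, BW - (z & (BW-1)))),
// with an icmp+select to guard the zero-amount case; hence the sum of
// Or, Sub, Shl, LShr, And, ICmp and Select costs.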
4788 Type *CondTy = RetTy->getWithNewBitWidth(1);
4789 InstructionCost Cost = 0;
4790 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4791 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4792 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4793 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4794 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4795 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4796 CmpInst::ICMP_EQ, CostKind);
4797 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4798 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4799 return Cost;
4800 }
4801 }
4802
4803 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4804}
4805
4806 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4807 TTI::TargetCostKind CostKind,
4808 unsigned Index, const Value *Op0,
4809 const Value *Op1) const {
4810 static const CostTblEntry SLMCostTbl[] = {
4811 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4812 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4813 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4814 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4815 };
4816
4817 assert(Val->isVectorTy() && "This must be a vector type");
4818 Type *ScalarType = Val->getScalarType();
4819 InstructionCost RegisterFileMoveCost = 0;
4820
4821 // Non-immediate extraction/insertion can be handled as a sequence of
4822 // aliased loads+stores via the stack.
4823 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4824 Opcode == Instruction::InsertElement)) {
4825 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4826 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4827
4828 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4829 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4830 Align VecAlign = DL.getPrefTypeAlign(Val);
4831 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4832
4833 // Extract - store vector to stack, load scalar.
4834 if (Opcode == Instruction::ExtractElement) {
4835 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4836 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4837 CostKind);
4838 }
4839 // Insert - store vector to stack, store scalar, load vector.
4840 if (Opcode == Instruction::InsertElement) {
4841 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4842 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4843 CostKind) +
4844 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4845 }
4846 }
4847
4848 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4849 Opcode == Instruction::InsertElement)) {
4850 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4851 if (Opcode == Instruction::ExtractElement &&
4852 ScalarType->getScalarSizeInBits() == 1 &&
4853 cast<FixedVectorType>(Val)->getNumElements() > 1)
4854 return 1;
4855
4856 // Legalize the type.
4857 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4858
4859 // This type is legalized to a scalar type.
4860 if (!LT.second.isVector())
4861 return TTI::TCC_Free;
4862
4863 // The type may be split. Normalize the index to the new type.
4864 unsigned SizeInBits = LT.second.getSizeInBits();
4865 unsigned NumElts = LT.second.getVectorNumElements();
4866 unsigned SubNumElts = NumElts;
4867 Index = Index % NumElts;
4868
4869 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4870 // For inserts, we also need to insert the subvector back.
4871 if (SizeInBits > 128) {
4872 assert((SizeInBits % 128) == 0 && "Illegal vector");
4873 unsigned NumSubVecs = SizeInBits / 128;
4874 SubNumElts = NumElts / NumSubVecs;
4875 if (SubNumElts <= Index) {
4876 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4877 Index %= SubNumElts;
4878 }
4879 }
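// Worked example: a 256-bit v8i32 has NumSubVecs = 2 and SubNumElts = 4,
// so Index 5 lands in the upper subvector and is renormalized to Index 1,
// paying one subvector extract (plus an insert for insertelement).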
4880
4881 MVT MScalarTy = LT.second.getScalarType();
4882 auto IsCheapPInsrPExtrInsertPS = [&]() {
4883 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4884 // Inserting f32 into index0 is just movss.
4885 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4886 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4887 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4888 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4889 Opcode == Instruction::InsertElement) ||
4890 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4891 Opcode == Instruction::InsertElement);
4892 };
4893
4894 if (Index == 0) {
4895 // Floating point scalars are already located in index #0.
4896 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4897 // true for all.
4898 if (ScalarType->isFloatingPointTy() &&
4899 (Opcode != Instruction::InsertElement || !Op0 ||
4900 isa<UndefValue>(Op0)))
4901 return RegisterFileMoveCost;
4902
4903 if (Opcode == Instruction::InsertElement &&
4904 isa_and_nonnull<UndefValue>(Op0)) {
4905 // Consider the gather cost to be cheap.
4906 if (isa_and_nonnull<LoadInst>(Op1))
4907 return RegisterFileMoveCost;
4908 if (!IsCheapPInsrPExtrInsertPS()) {
4909 // mov constant-to-GPR + movd/movq GPR -> XMM.
4910 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4911 return 2 + RegisterFileMoveCost;
4912 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4913 return 1 + RegisterFileMoveCost;
4914 }
4915 }
4916
4917 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4918 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4919 return 1 + RegisterFileMoveCost;
4920 }
4921
4922 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4923 assert(ISD && "Unexpected vector opcode");
4924 if (ST->useSLMArithCosts())
4925 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4926 return Entry->Cost + RegisterFileMoveCost;
4927
4928 // Consider cheap cases.
4929 if (IsCheapPInsrPExtrInsertPS())
4930 return 1 + RegisterFileMoveCost;
4931
4932 // For extractions we just need to shuffle the element to index 0, which
4933 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4934 // the elements to their destination. In both cases we must handle the
4935 // subvector move(s).
4936 // If the vector type is already less than 128-bits then don't reduce it.
4937 // TODO: Under what circumstances should we shuffle using the full width?
4938 InstructionCost ShuffleCost = 1;
4939 if (Opcode == Instruction::InsertElement) {
4940 auto *SubTy = cast<VectorType>(Val);
4941 EVT VT = TLI->getValueType(DL, Val);
4942 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4943 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4944 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4945 CostKind, 0, SubTy);
4946 }
4947 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4948 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4949 }
4950
4951 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4952 RegisterFileMoveCost;
4953}
4954
4955 InstructionCost X86TTIImpl::getScalarizationOverhead(
4956 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4957 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4958 ArrayRef<Value *> VL) const {
4959 assert(DemandedElts.getBitWidth() ==
4960 cast<FixedVectorType>(Ty)->getNumElements() &&
4961 "Vector size mismatch");
4962
4963 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4964 MVT MScalarTy = LT.second.getScalarType();
4965 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4966 InstructionCost Cost = 0;
4967
4968 constexpr unsigned LaneBitWidth = 128;
4969 assert((LegalVectorBitWidth < LaneBitWidth ||
4970 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4971 "Illegal vector");
4972
4973 const int NumLegalVectors = LT.first.getValue();
4974 assert(NumLegalVectors >= 0 && "Negative cost!");
4975
4976 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4977 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
4978 // a special heuristic regarding poison input which is passed here in
4979 // ForPoisonSrc.
4980 if (Insert && !ForPoisonSrc) {
4981 // This is nearly identical to BaseT::getScalarizationOverhead(), except
4982 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
4983 // Constant::getNullValue()), which makes the X86TTIImpl
4984 // getVectorInstrCost() return 0 instead of 1.
4985 for (unsigned I : seq(DemandedElts.getBitWidth())) {
4986 if (!DemandedElts[I])
4987 continue;
4988 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
4989 nullptr,
4990 VL.empty() ? nullptr : VL[I]);
4991 }
4992 return Cost;
4993 }
4994
4995 if (Insert) {
4996 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4997 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4998 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4999 // For types we can insert directly, insertion into 128-bit sub vectors is
5000 // cheap, followed by a cheap chain of concatenations.
5001 if (LegalVectorBitWidth <= LaneBitWidth) {
5002 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5003 /*Extract*/ false, CostKind);
5004 } else {
5005 // In each 128-lane, if at least one index is demanded but not all
5006 // indices are demanded and this 128-lane is not the first 128-lane of
5007 // the legalized vector, then this 128-lane needs an extracti128; if in
5008 // each 128-lane, there is at least one demanded index, this 128-lane
5009 // needs an inserti128.
5010
5011 // The following cases will help you build a better understanding:
5012 // Assume we insert several elements into a v8i32 vector in avx2,
5013 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
5014 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
5015 // inserti128.
5016 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
5017 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5018 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5019 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5020 unsigned NumLegalElts =
5021 LT.second.getVectorNumElements() * NumLegalVectors;
5022 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5023 "Vector has been legalized to smaller element count");
5024 assert((NumLegalElts % NumLanesTotal) == 0 &&
5025 "Unexpected elts per lane");
5026 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5027
5028 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5029 auto *LaneTy =
5030 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5031
5032 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5033 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5034 NumEltsPerLane, NumEltsPerLane * I);
5035 if (LaneEltMask.isZero())
5036 continue;
5037 // FIXME: we don't need to extract if all non-demanded elements
5038 // are legalization-inserted padding.
5039 if (!LaneEltMask.isAllOnes())
5040 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5041 CostKind, I * NumEltsPerLane, LaneTy);
5042 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5043 /*Extract*/ false, CostKind);
5044 }
5045
5046 APInt AffectedLanes =
5047 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5048 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5049 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5050 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5051 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5052 unsigned I = NumLegalLanes * LegalVec + Lane;
5053 // No need to insert unaffected lane; or lane 0 of each legal vector
5054 // iff ALL lanes of that vector were affected and will be inserted.
5055 if (!AffectedLanes[I] ||
5056 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5057 continue;
5058 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
5059 CostKind, I * NumEltsPerLane, LaneTy);
5060 }
5061 }
5062 }
5063 } else if (LT.second.isVector()) {
5064 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5065 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5066 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5067 // considered cheap.
5068 if (Ty->isIntOrIntVectorTy())
5069 Cost += DemandedElts.popcount();
5070
5071 // Get the smaller of the legalized or original pow2-extended number of
5072 // vector elements, which represents the number of unpacks we'll end up
5073 // performing.
5074 unsigned NumElts = LT.second.getVectorNumElements();
5075 unsigned Pow2Elts =
5076 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5077 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5078 }
5079 }
5080
5081 if (Extract) {
5082 // vXi1 can be efficiently extracted with MOVMSK.
5083 // TODO: AVX512 predicate mask handling.
5084 // NOTE: This doesn't work well for roundtrip scalarization.
5085 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5086 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5087 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5088 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
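// e.g. a v64i1 extract-only scalarization costs ceil(64/16) = 4 PMOVMSKBs
// pre-AVX2, or ceil(64/32) = 2 with AVX2.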
5089 return MOVMSKCost;
5090 }
5091
5092 if (LT.second.isVector()) {
5093 unsigned NumLegalElts =
5094 LT.second.getVectorNumElements() * NumLegalVectors;
5095 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5096 "Vector has been legalized to smaller element count");
5097
5098 // If we're extracting elements from a 128-bit subvector lane,
5099 // we only need to extract each lane once, not for every element.
5100 if (LegalVectorBitWidth > LaneBitWidth) {
5101 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5102 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5103 assert((NumLegalElts % NumLanesTotal) == 0 &&
5104 "Unexpected elts per lane");
5105 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5106
5107 // Add cost for each demanded 128-bit subvector extraction.
5108 // Luckily this is a lot easier than for insertion.
5109 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5110 auto *LaneTy =
5111 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5112
5113 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5114 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5115 NumEltsPerLane, I * NumEltsPerLane);
5116 if (LaneEltMask.isZero())
5117 continue;
5118 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
5119 I * NumEltsPerLane, LaneTy);
5120 Cost += BaseT::getScalarizationOverhead(
5121 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5122 }
5123
5124 return Cost;
5125 }
5126 }
5127
5128 // Fallback to default extraction.
5129 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5130 Extract, CostKind);
5131 }
5132
5133 return Cost;
5134}
5135
5136 InstructionCost
5137X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5138 int VF, const APInt &DemandedDstElts,
5139 TTI::TargetCostKind CostKind) const {
5140 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5141 // We don't differentiate element types here, only element bit width.
5142 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5143
5144 auto bailout = [&]() {
5145 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5146 DemandedDstElts, CostKind);
5147 };
5148
5149 // For now, only deal with AVX512 cases.
5150 if (!ST->hasAVX512())
5151 return bailout();
5152
5153 // Do we have a native shuffle for this element type, or should we promote?
5154 unsigned PromEltTyBits = EltTyBits;
5155 switch (EltTyBits) {
5156 case 32:
5157 case 64:
5158 break; // AVX512F.
5159 case 16:
5160 if (!ST->hasBWI())
5161 PromEltTyBits = 32; // promote to i32, AVX512F.
5162 break; // AVX512BW
5163 case 8:
5164 if (!ST->hasVBMI())
5165 PromEltTyBits = 32; // promote to i32, AVX512F.
5166 break; // AVX512VBMI
5167 case 1:
5168 // There is no support for shuffling i1 elements. We *must* promote.
5169 if (ST->hasBWI()) {
5170 if (ST->hasVBMI())
5171 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5172 else
5173 PromEltTyBits = 16; // promote to i16, AVX512BW.
5174 break;
5175 }
5176 PromEltTyBits = 32; // promote to i32, AVX512F.
5177 break;
5178 default:
5179 return bailout();
5180 }
5181 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5182
5183 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5184 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5185
5186 int NumDstElements = VF * ReplicationFactor;
5187 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5188 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5189
5190 // Legalize the types.
5191 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5192 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5193 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5194 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5195 // They should have legalized into vector types.
5196 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5197 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5198 return bailout();
5199
5200 if (PromEltTyBits != EltTyBits) {
5201 // If we have to perform the shuffle with wider elt type than our data type,
5202 // then we will first need to anyext (we don't care about the new bits)
5203 // the source elements, and then truncate Dst elements.
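// e.g. per the switch above, replicating i8 elements without AVX512VBMI
// means anyext'ing the source to i32 elements, replicating as i32
// (AVX512F), then truncating the result back to i8.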
5204 InstructionCost PromotionCost;
5205 PromotionCost += getCastInstrCost(
5206 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5207 TTI::CastContextHint::None, CostKind);
5208 PromotionCost +=
5209 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5210 /*Src=*/PromDstVecTy,
5211 TTI::CastContextHint::None, CostKind);
5212 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5213 ReplicationFactor, VF,
5214 DemandedDstElts, CostKind);
5215 }
5216
5217 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5218 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5219 "We expect that the legalization doesn't affect the element width, "
5220 "and doesn't coalesce/split elements.");
5221
5222 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5223 unsigned NumDstVectors =
5224 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5225
5226 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5227
5228 // Not all the produced Dst elements may be demanded. In our case,
5229 // given that a single Dst vector is formed by a single shuffle,
5230 // if all elements that will form a single Dst vector aren't demanded,
5231 // then we won't need to do that shuffle, so adjust the cost accordingly.
5232 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5233 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5234 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5235
5236 InstructionCost SingleShuffleCost =
5237 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5238 /*Mask=*/{}, CostKind,
5239 /*Index=*/0, /*SubTp=*/nullptr);
5240 return NumDstVectorsDemanded * SingleShuffleCost;
5241}
5242
5243 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5244 Align Alignment,
5245 unsigned AddressSpace,
5246 TTI::TargetCostKind CostKind,
5247 TTI::OperandValueInfo OpInfo,
5248 const Instruction *I) const {
5249 // TODO: Handle other cost kinds.
5250 if (CostKind != TTI::TCK_RecipThroughput) {
5251 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5252 // Store instruction with index and scale costs 2 Uops.
5253 // Check the preceding GEP to identify non-const indices.
5254 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5255 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5256 return TTI::TCC_Basic * 2;
5257 }
5258 }
5259 return TTI::TCC_Basic;
5260 }
5261
5262 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5263 "Invalid Opcode");
5264 // Type legalization can't handle structs
5265 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5266 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5267 CostKind, OpInfo, I);
5268
5269 // Legalize the type.
5270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5271
5272 auto *VTy = dyn_cast<FixedVectorType>(Src);
5273
5274 InstructionCost Cost = 0;
5275
5276 // Add a cost for constant load to vector.
5277 if (Opcode == Instruction::Store && OpInfo.isConstant())
5278 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5279 /*AddressSpace=*/0, CostKind, OpInfo);
5280
5281 // Handle the simple case of non-vectors.
5282 // NOTE: this assumes that legalization never creates a vector from scalars!
5283 if (!VTy || !LT.second.isVector()) {
5284 // Each load/store unit costs 1.
5285 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5286 }
5287
5288 bool IsLoad = Opcode == Instruction::Load;
5289
5290 Type *EltTy = VTy->getElementType();
5291
5292 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5293
5294 // Source of truth: how many elements were there in the original IR vector?
5295 const unsigned SrcNumElt = VTy->getNumElements();
5296
5297 // How far have we gotten?
5298 int NumEltRemaining = SrcNumElt;
5299 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5300 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5301
5302 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5303
5304 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5305 const unsigned XMMBits = 128;
5306 if (XMMBits % EltTyBits != 0)
5307 // Vector size must be a multiple of the element size. I.e. no padding.
5308 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5309 CostKind, OpInfo, I);
5310 const int NumEltPerXMM = XMMBits / EltTyBits;
5311
5312 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5313
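// Illustrative walk (unaligned v7f32 load on AVX, legalized to v8f32):
// the 32-byte op is rejected (only 7 elements remain), so the loop costs
// a 16-byte op (4 elts), an 8-byte op (2 elts) and a 4-byte op (1 elt),
// plus the subvector/element insertion costs handled below.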
5314 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5315 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5316 // How many elements would a single op deal with at once?
5317 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5318 // Vector size must be a multiple of the element size. I.e. no padding.
5319 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5320 CostKind, OpInfo, I);
5321 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5322
5323 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5324 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5325 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5326 "Unless we haven't halved the op size yet, "
5327 "we have less than two ops' worth of work left.");
5328
5329 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5330 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5331 : XMMVecTy;
5332
5333 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5334 "After halving sizes, the vector elt count is no longer a multiple "
5335 "of number of elements per operation?");
5336 auto *CoalescedVecTy =
5337 CurrNumEltPerOp == 1
5338 ? CurrVecTy
5339 : FixedVectorType::get(
5340 IntegerType::get(Src->getContext(),
5341 EltTyBits * CurrNumEltPerOp),
5342 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5343 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5344 DL.getTypeSizeInBits(CurrVecTy) &&
5345 "coalescing elements doesn't change the vector width.");
5346
5347 while (NumEltRemaining > 0) {
5348 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5349
5350 // Can we use this vector size, as per the remaining element count?
5351 // Iff the vector is naturally aligned, we can do a wide load regardless.
5352 if (NumEltRemaining < CurrNumEltPerOp &&
5353 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5354 break; // Try smaller vector size.
5355
5356 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5357 // as a proxy for a double-pumped AVX memory interface such as on
5358 // Sandybridge.
5359 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5360 // will be scalarized.
5361 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5362 Cost += 2;
5363 else if (CurrOpSizeBytes < 4)
5364 Cost += 2;
5365 else
5366 Cost += 1;
5367
5368 // If we're loading a uniform value, then we don't need to split the load;
5369 // a single (widest) load can be reused by all the splits.
5370 if (IsLoad && OpInfo.isUniform())
5371 return Cost;
5372
5373 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5374
5375 // If we have fully processed the previous reg, we need to replenish it.
5376 if (SubVecEltsLeft == 0) {
5377 SubVecEltsLeft += CurrVecTy->getNumElements();
5378 // And that's free only for the 0'th subvector of a legalized vector.
5379 if (!Is0thSubVec)
5380 Cost +=
5381 getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5382 : TTI::ShuffleKind::SK_ExtractSubvector,
5383 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5384 }
5385
5386 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5387 // for smaller widths (32/16/8) we have to insert/extract them separately.
5388 // Again, it's free for the 0'th subreg (if the op is 32/64 bit wide,
5389 // but let's pretend that it is also true for 16/8 bit wide ops...)
5390 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5391 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5392 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5393 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5394 APInt DemandedElts =
5395 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5396 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5397 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5398 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5399 !IsLoad, CostKind);
5400 }
5401
5402 SubVecEltsLeft -= CurrNumEltPerOp;
5403 NumEltRemaining -= CurrNumEltPerOp;
5404 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5405 }
5406 }
5407
5408 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5409
5410 return Cost;
5411}
5412
5413 InstructionCost
5414 X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
5415 TTI::TargetCostKind CostKind) const {
5416 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5417 : Instruction::Store;
5418 Type *SrcTy = MICA.getDataType();
5419 Align Alignment = MICA.getAlignment();
5420 unsigned AddressSpace = MICA.getAddressSpace();
5421
5422 bool IsLoad = (Instruction::Load == Opcode);
5423 bool IsStore = (Instruction::Store == Opcode);
5424
5425 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5426 if (!SrcVTy)
5427 // To calculate the scalar cost, take the regular memory-op cost without a mask.
5428 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5429
5430 unsigned NumElem = SrcVTy->getNumElements();
5431 auto *MaskTy =
5432 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5433 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5434 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5435 // Scalarization
5436 APInt DemandedElts = APInt::getAllOnes(NumElem);
5437 InstructionCost MaskSplitCost = getScalarizationOverhead(
5438 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5439 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5440 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5441 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5442 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5443 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5444 InstructionCost ValueSplitCost = getScalarizationOverhead(
5445 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5446 InstructionCost MemopCost =
5447 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5448 Alignment, AddressSpace, CostKind);
5449 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5450 }
5451
5452 // Legalize the type.
5453 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5454 auto VT = TLI->getValueType(DL, SrcVTy);
5455 InstructionCost Cost = 0;
5456 MVT Ty = LT.second;
5457 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5458 // APX masked load/store for scalar is cheap.
5459 return Cost + LT.first;
5460
5461 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5462 LT.second.getVectorNumElements() == NumElem)
5463 // Promotion requires extend/truncate for data and a shuffle for mask.
5464 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5465 0, nullptr) +
5466 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5467 0, nullptr);
5468
5469 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5470 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5471 (unsigned)LT.first.getValue() *
5472 Ty.getVectorNumElements());
5473 // Expanding requires filling the mask with zeroes.
5474 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5475 CostKind, 0, MaskTy);
5476 }
5477
5478 // Pre-AVX512 - each maskmov load costs ~2 and each maskmov store costs ~8.
5479 if (!ST->hasAVX512())
5480 return Cost + LT.first * (IsLoad ? 2 : 8);
5481
5482 // AVX-512 masked load/store is cheaper
5483 return Cost + LT.first;
5484}
5485
5486 InstructionCost X86TTIImpl::getPointersChainCost(
5487 ArrayRef<const Value *> Ptrs, const Value *Base,
5488 const TTI::PointersChainInfo &Info, Type *AccessTy,
5489 TTI::TargetCostKind CostKind) const {
5490 if (Info.isSameBase() && Info.isKnownStride()) {
5491 // If all the pointers have known stride all the differences are translated
5492 // into constants. X86 memory addressing allows encoding it into
5493 // displacement. So we just need to take the base GEP cost.
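// e.g. accesses at p, p+4, p+8, ... all fold into [base + disp] operand
// forms, so only the base GEP itself needs to be costed.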
5494 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5495 SmallVector<const Value *> Indices(BaseGEP->indices());
5496 return getGEPCost(BaseGEP->getSourceElementType(),
5497 BaseGEP->getPointerOperand(), Indices, nullptr,
5498 CostKind);
5499 }
5500 return TTI::TCC_Free;
5501 }
5502 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5503}
5504
5505 InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy,
5506 ScalarEvolution *SE,
5507 const SCEV *Ptr,
5508 TTI::TargetCostKind CostKind) const {
5509 // Address computations in vectorized code with non-consecutive addresses will
5510 // likely result in more instructions compared to scalar code where the
5511 // computation can more often be merged into the index mode. The resulting
5512 // extra micro-ops can significantly decrease throughput.
5513 const unsigned NumVectorInstToHideOverhead = 10;
5514
5515 // Cost modeling of Strided Access Computation is hidden by the indexing
5516 // modes of X86 regardless of the stride value. We don't believe that there
5517 // is a difference between constant strided access in general and a constant
5518 // stride whose value is less than or equal to 64.
5519 // Even in the case of (loop invariant) stride whose value is not known at
5520 // compile time, the address computation will not incur more than one extra
5521 // ADD instruction.
5522 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5523 // TODO: AVX2 is the current cut-off because we don't have correct
5524 // interleaving costs for prior ISA's.
5525 if (!BaseT::isStridedAccess(Ptr))
5526 return NumVectorInstToHideOverhead;
5527 if (!BaseT::getConstantStrideStep(SE, Ptr))
5528 return 1;
5529 }
5530
5531 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5532}
5533
5534 InstructionCost
5535 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5536 std::optional<FastMathFlags> FMF,
5537 TTI::TargetCostKind CostKind) const {
5538 if (TTI::requiresOrderedReduction(FMF))
5539 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5540
5541 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5542 // throughput and use it as the cost.
5543
5544 static const CostTblEntry SLMCostTbl[] = {
5545 { ISD::FADD, MVT::v2f64, 3 },
5546 { ISD::ADD, MVT::v2i64, 5 },
5547 };
5548
5549 static const CostTblEntry SSE2CostTbl[] = {
5550 { ISD::FADD, MVT::v2f64, 2 },
5551 { ISD::FADD, MVT::v2f32, 2 },
5552 { ISD::FADD, MVT::v4f32, 4 },
5553 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5554 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5555 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5556 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5557 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5558 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5559 { ISD::ADD, MVT::v2i8, 2 },
5560 { ISD::ADD, MVT::v4i8, 2 },
5561 { ISD::ADD, MVT::v8i8, 2 },
5562 { ISD::ADD, MVT::v16i8, 3 },
5563 };
5564
5565 static const CostTblEntry AVX1CostTbl[] = {
5566 { ISD::FADD, MVT::v4f64, 3 },
5567 { ISD::FADD, MVT::v4f32, 3 },
5568 { ISD::FADD, MVT::v8f32, 4 },
5569 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5570 { ISD::ADD, MVT::v4i64, 3 },
5571 { ISD::ADD, MVT::v8i32, 5 },
5572 { ISD::ADD, MVT::v16i16, 5 },
5573 { ISD::ADD, MVT::v32i8, 4 },
5574 };
5575
5576 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5577 assert(ISD && "Invalid opcode");
5578
5579 // Before legalizing the type, give a chance to look up illegal narrow types
5580 // in the table.
5581 // FIXME: Is there a better way to do this?
5582 EVT VT = TLI->getValueType(DL, ValTy);
5583 if (VT.isSimple()) {
5584 MVT MTy = VT.getSimpleVT();
5585 if (ST->useSLMArithCosts())
5586 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5587 return Entry->Cost;
5588
5589 if (ST->hasAVX())
5590 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5591 return Entry->Cost;
5592
5593 if (ST->hasSSE2())
5594 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5595 return Entry->Cost;
5596 }
5597
5598 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5599
5600 MVT MTy = LT.second;
5601
5602 auto *ValVTy = cast<FixedVectorType>(ValTy);
5603
5604 // Special case: vXi8 mul reductions are performed as vXi16.
5605 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5606 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5607 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5608 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5609 TTI::CastContextHint::None,
5610 CostKind) +
5611 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5612 }
5613
5614 InstructionCost ArithmeticCost = 0;
5615 if (LT.first != 1 && MTy.isVector() &&
5616 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5617 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5618 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5619 MTy.getVectorNumElements());
5620 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5621 ArithmeticCost *= LT.first - 1;
5622 }
5623
5624 if (ST->useSLMArithCosts())
5625 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5626 return ArithmeticCost + Entry->Cost;
5627
5628 if (ST->hasAVX())
5629 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5630 return ArithmeticCost + Entry->Cost;
5631
5632 if (ST->hasSSE2())
5633 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5634 return ArithmeticCost + Entry->Cost;
5635
5636 // FIXME: These assume a naive kshift+binop lowering, which is probably
5637 // conservative in most cases.
5638 static const CostTblEntry AVX512BoolReduction[] = {
5639 { ISD::AND, MVT::v2i1, 3 },
5640 { ISD::AND, MVT::v4i1, 5 },
5641 { ISD::AND, MVT::v8i1, 7 },
5642 { ISD::AND, MVT::v16i1, 9 },
5643 { ISD::AND, MVT::v32i1, 11 },
5644 { ISD::AND, MVT::v64i1, 13 },
5645 { ISD::OR, MVT::v2i1, 3 },
5646 { ISD::OR, MVT::v4i1, 5 },
5647 { ISD::OR, MVT::v8i1, 7 },
5648 { ISD::OR, MVT::v16i1, 9 },
5649 { ISD::OR, MVT::v32i1, 11 },
5650 { ISD::OR, MVT::v64i1, 13 },
5651 };
5652
5653 static const CostTblEntry AVX2BoolReduction[] = {
5654 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5655 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5656 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5657 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5658 };
5659
5660 static const CostTblEntry AVX1BoolReduction[] = {
5661 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5662 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5663 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5664 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5665 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5666 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5667 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5668 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5669 };
5670
5671 static const CostTblEntry SSE2BoolReduction[] = {
5672 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5673 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5674 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5675 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5676 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5677 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5678 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5679 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5680 };
5681
5682 // Handle bool allof/anyof patterns.
5683 if (ValVTy->getElementType()->isIntegerTy(1)) {
5684 InstructionCost ArithmeticCost = 0;
5685 if (LT.first != 1 && MTy.isVector() &&
5686 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5687 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5688 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5689 MTy.getVectorNumElements());
5690 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5691 ArithmeticCost *= LT.first - 1;
5692 }
5693
5694 if (ST->hasAVX512())
5695 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5696 return ArithmeticCost + Entry->Cost;
5697 if (ST->hasAVX2())
5698 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5699 return ArithmeticCost + Entry->Cost;
5700 if (ST->hasAVX())
5701 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5702 return ArithmeticCost + Entry->Cost;
5703 if (ST->hasSSE2())
5704 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5705 return ArithmeticCost + Entry->Cost;
5706
5707 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5708 }
5709
5710 unsigned NumVecElts = ValVTy->getNumElements();
5711 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5712
5713 // Special case power of 2 reductions where the scalar type isn't changed
5714 // by type legalization.
5715 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5716 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5717
5718 InstructionCost ReductionCost = 0;
5719
5720 auto *Ty = ValVTy;
5721 if (LT.first != 1 && MTy.isVector() &&
5722 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5723 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5724 Ty = FixedVectorType::get(ValVTy->getElementType(),
5725 MTy.getVectorNumElements());
5726 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5727 ReductionCost *= LT.first - 1;
5728 NumVecElts = MTy.getVectorNumElements();
5729 }
5730
5731 // Now handle reduction with the legal type, taking into account size changes
5732 // at each level.
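// e.g. (illustrative) a v8f32 fadd reduction on AVX: one extract of the
// high 128-bit half + fadd (256 -> 128), two shuffle + fadd steps
// (128 -> 64 -> 32), then a final extractelement of lane 0.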
5733 while (NumVecElts > 1) {
5734 // Determine the size of the remaining vector we need to reduce.
5735 unsigned Size = NumVecElts * ScalarSize;
5736 NumVecElts /= 2;
5737 // If we're reducing from 256/512 bits, use an extract_subvector.
5738 if (Size > 128) {
5739 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5740 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5741 CostKind, NumVecElts, SubTy);
5742 Ty = SubTy;
5743 } else if (Size == 128) {
5744 // Reducing from 128 bits is a permute of v2f64/v2i64.
5745 FixedVectorType *ShufTy;
5746 if (ValVTy->isFloatingPointTy())
5747 ShufTy =
5748 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5749 else
5750 ShufTy =
5751 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5752 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5753 {}, CostKind, 0, nullptr);
5754 } else if (Size == 64) {
5755 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5756 FixedVectorType *ShufTy;
5757 if (ValVTy->isFloatingPointTy())
5758 ShufTy =
5759 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5760 else
5761 ShufTy =
5762 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5763 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5764 {}, CostKind, 0, nullptr);
5765 } else {
5766 // Reducing from smaller size is a shift by immediate.
5767 auto *ShiftTy = FixedVectorType::get(
5768 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5769 ReductionCost += getArithmeticInstrCost(
5770 Instruction::LShr, ShiftTy, CostKind,
5771 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5772 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5773 }
5774
5775 // Add the arithmetic op for this level.
5776 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5777 }
5778
5779 // Add the final extract element to the cost.
5780 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5781 CostKind, 0, nullptr, nullptr);
5782}
5783
5784 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5785 TTI::TargetCostKind CostKind,
5786 FastMathFlags FMF) const {
5787 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5788 return getIntrinsicInstrCost(ICA, CostKind);
5789}
5790
5791 InstructionCost
5792 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5793 FastMathFlags FMF,
5794 TTI::TargetCostKind CostKind) const {
5795 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5796
5797 MVT MTy = LT.second;
5798
5799 int ISD;
5800 if (ValTy->isIntOrIntVectorTy()) {
5801 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5802 : ISD::SMIN;
5803 } else {
5804 assert(ValTy->isFPOrFPVectorTy() &&
5805 "Expected floating point or integer vector type.");
5806 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5807 ? ISD::FMINNUM
5808 : ISD::FMINIMUM;
5809 }
5810
5811 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5812 // throughput and use it as the cost.
5813
5814 static const CostTblEntry SSE2CostTbl[] = {
5815 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5816 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5817 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5818 };
5819
5820 static const CostTblEntry SSE41CostTbl[] = {
5821 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5822 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5823 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5824 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5825 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5826 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5827 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5828 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5829 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5830 {ISD::SMIN, MVT::v16i8, 6},
5831 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5832 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5833 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5834 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5835 };
5836
5837 static const CostTblEntry AVX1CostTbl[] = {
5838 {ISD::SMIN, MVT::v16i16, 6},
5839 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5840 {ISD::SMIN, MVT::v32i8, 8},
5841 {ISD::UMIN, MVT::v32i8, 8},
5842 };
5843
5844 static const CostTblEntry AVX512BWCostTbl[] = {
5845 {ISD::SMIN, MVT::v32i16, 8},
5846 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5847 {ISD::SMIN, MVT::v64i8, 10},
5848 {ISD::UMIN, MVT::v64i8, 10},
5849 };
5850
5851 // Before legalizing the type, give a chance to look up illegal narrow types
5852 // in the table.
5853 // FIXME: Is there a better way to do this?
5854 EVT VT = TLI->getValueType(DL, ValTy);
5855 if (VT.isSimple()) {
5856 MVT MTy = VT.getSimpleVT();
5857 if (ST->hasBWI())
5858 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5859 return Entry->Cost;
5860
5861 if (ST->hasAVX())
5862 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5863 return Entry->Cost;
5864
5865 if (ST->hasSSE41())
5866 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5867 return Entry->Cost;
5868
5869 if (ST->hasSSE2())
5870 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5871 return Entry->Cost;
5872 }
5873
5874 auto *ValVTy = cast<FixedVectorType>(ValTy);
5875 unsigned NumVecElts = ValVTy->getNumElements();
5876
5877 auto *Ty = ValVTy;
5878 InstructionCost MinMaxCost = 0;
5879 if (LT.first != 1 && MTy.isVector() &&
5880 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5881 // Type needs to be split. We need LT.first - 1 operations.
5882 Ty = FixedVectorType::get(ValVTy->getElementType(),
5883 MTy.getVectorNumElements());
5884 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5885 MinMaxCost *= LT.first - 1;
5886 NumVecElts = MTy.getVectorNumElements();
5887 }
5888
5889 if (ST->hasBWI())
5890 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5891 return MinMaxCost + Entry->Cost;
5892
5893 if (ST->hasAVX())
5894 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5895 return MinMaxCost + Entry->Cost;
5896
5897 if (ST->hasSSE41())
5898 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5899 return MinMaxCost + Entry->Cost;
5900
5901 if (ST->hasSSE2())
5902 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5903 return MinMaxCost + Entry->Cost;
5904
5905 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5906
5907 // Special case power of 2 reductions where the scalar type isn't changed
5908 // by type legalization.
5909 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5910 ScalarSize != MTy.getScalarSizeInBits())
5911 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5912
5913 // Now handle reduction with the legal type, taking into account size changes
5914 // at each level.
5915 while (NumVecElts > 1) {
5916 // Determine the size of the remaining vector we need to reduce.
5917 unsigned Size = NumVecElts * ScalarSize;
5918 NumVecElts /= 2;
5919 // If we're reducing from 256/512 bits, use an extract_subvector.
5920 if (Size > 128) {
5921 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5922 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5923 CostKind, NumVecElts, SubTy);
5924 Ty = SubTy;
5925 } else if (Size == 128) {
5926 // Reducing from 128 bits is a permute of v2f64/v2i64.
5927 VectorType *ShufTy;
5928 if (ValTy->isFloatingPointTy())
5929 ShufTy =
5930 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5931 else
5932 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5933 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5934 CostKind, 0, nullptr);
5935 } else if (Size == 64) {
5936 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5937 FixedVectorType *ShufTy;
5938 if (ValTy->isFloatingPointTy())
5939 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5940 else
5941 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5942 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5943 CostKind, 0, nullptr);
5944 } else {
5945 // Reducing from smaller size is a shift by immediate.
5946 auto *ShiftTy = FixedVectorType::get(
5947 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5948 MinMaxCost += getArithmeticInstrCost(
5949 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5950 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5951 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5952 }
5953
5954 // Add the arithmetic op for this level.
5955 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5956 }
5957
5958 // Add the final extract element to the cost.
5959 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5960 CostKind, 0, nullptr, nullptr);
5961}
5962
5963/// Calculate the cost of materializing a 64-bit value. This helper
5964/// method might only calculate a fraction of a larger immediate. Therefore it
5965/// is valid to return a cost of ZERO.
5966 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const {
5967 if (Val == 0)
5968 return TTI::TCC_Free;
5969
5970 if (isInt<32>(Val))
5971 return TTI::TCC_Basic;
5972
5973 return 2 * TTI::TCC_Basic;
5974}
5975
5976 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5977 TTI::TargetCostKind CostKind) const {
5978 assert(Ty->isIntegerTy());
5979
5980 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5981 if (BitSize == 0)
5982 return ~0U;
5983
5984 // Never hoist constants larger than 128 bits, because this might lead to
5985 // incorrect code generation or assertions in codegen.
5986 // FIXME: Create a cost model for types larger than i128 once the codegen
5987 // issues have been fixed.
5988 if (BitSize > 128)
5989 return TTI::TCC_Free;
5990
5991 if (Imm == 0)
5992 return TTI::TCC_Free;
5993
5994 // Sign-extend all constants to a multiple of 64-bit.
5995 APInt ImmVal = Imm;
5996 if (BitSize % 64 != 0)
5997 ImmVal = Imm.sext(alignTo(BitSize, 64));
5998
5999 // Split the constant into 64-bit chunks and calculate the cost for each
6000 // chunk.
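// Worked example: an i128 immediate is costed as two 64-bit chunks, so
// e.g. (1 << 64) costs TCC_Free (zero low chunk) + TCC_Basic (small high
// chunk) = 1, while an arbitrary 128-bit pattern costs 2 * (2 * TCC_Basic).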
6001 InstructionCost Cost = 0;
6002 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6003 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6004 int64_t Val = Tmp.getSExtValue();
6005 Cost += getIntImmCost(Val);
6006 }
6007 // We need at least one instruction to materialize the constant.
6008 return std::max<InstructionCost>(1, Cost);
6009}
6010
6011 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
6012 const APInt &Imm, Type *Ty,
6013 TTI::TargetCostKind CostKind,
6014 Instruction *Inst) const {
6015 assert(Ty->isIntegerTy());
6016
6017 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6018 unsigned ImmBitWidth = Imm.getBitWidth();
6019
6020 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6021 // here, so that constant hoisting will ignore this constant.
6022 if (BitSize == 0)
6023 return TTI::TCC_Free;
6024
6025 unsigned ImmIdx = ~0U;
6026 switch (Opcode) {
6027 default:
6028 return TTI::TCC_Free;
6029 case Instruction::GetElementPtr:
6030 // Always hoist the base address of a GetElementPtr. This prevents the
6031 // creation of new constants for every base constant that gets constant
6032 // folded with the offset.
6033 if (Idx == 0)
6034 return 2 * TTI::TCC_Basic;
6035 return TTI::TCC_Free;
6036 case Instruction::Store:
6037 ImmIdx = 0;
6038 break;
6039 case Instruction::ICmp:
6040 // This is an imperfect hack to prevent constant hoisting of
6041 // compares that might be trying to check if a 64-bit value fits in
6042 // 32-bits. The backend can optimize these cases using a right shift by 32.
6043 // There are other predicates and immediates the backend can use shifts for.
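// e.g. (icmp ult i64 %x, (1 << 32)) can be lowered as (%x >> 32) == 0,
// so hoisting the constant out would only pessimize such checks.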
6044 if (Idx == 1 && ImmBitWidth == 64) {
6045 uint64_t ImmVal = Imm.getZExtValue();
6046 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6047 return TTI::TCC_Free;
6048
6049 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6050 if (Cmp->isEquality()) {
6051 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6052 if (Known.countMinTrailingZeros() >= 32)
6053 return TTI::TCC_Free;
6054 }
6055 }
6056 }
6057 ImmIdx = 1;
6058 break;
6059 case Instruction::And:
6060 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6061 // by using a 32-bit operation with implicit zero extension. Detect such
6062 // immediates here as the normal path expects bit 31 to be sign extended.
6063 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6064 return TTI::TCC_Free;
6065 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
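// The 255/65535 values below stand in for the control constant that must
// still be materialized: BZHI (BMI2) needs only a bit index that fits in
// a byte, while BEXTR takes a 16-bit start/length control word.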
6066 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6067 Imm.isMask())
6068 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6069 ImmIdx = 1;
6070 break;
6071 case Instruction::Add:
6072 case Instruction::Sub:
6073 // For add/sub, we can use the opposite instruction for INT32_MIN.
6074 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6075 return TTI::TCC_Free;
6076 ImmIdx = 1;
6077 break;
6078 case Instruction::UDiv:
6079 case Instruction::SDiv:
6080 case Instruction::URem:
6081 case Instruction::SRem:
6082 // Division by constant is typically expanded later into a different
6083 // instruction sequence. This completely changes the constants.
6084 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6085 return TTI::TCC_Free;
6086 case Instruction::Mul:
6087 case Instruction::Or:
6088 case Instruction::Xor:
6089 ImmIdx = 1;
6090 break;
6091 // Always return TCC_Free for the shift value of a shift instruction.
6092 case Instruction::Shl:
6093 case Instruction::LShr:
6094 case Instruction::AShr:
6095 if (Idx == 1)
6096 return TTI::TCC_Free;
6097 break;
6098 case Instruction::Trunc:
6099 case Instruction::ZExt:
6100 case Instruction::SExt:
6101 case Instruction::IntToPtr:
6102 case Instruction::PtrToInt:
6103 case Instruction::BitCast:
6104 case Instruction::PHI:
6105 case Instruction::Call:
6106 case Instruction::Select:
6107 case Instruction::Ret:
6108 case Instruction::Load:
6109 break;
6110 }
6111
6112 if (Idx == ImmIdx) {
6113 uint64_t NumConstants = divideCeil(BitSize, 64);
6114 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6115 return (Cost <= NumConstants * TTI::TCC_Basic)
6116 ? static_cast<int>(TTI::TCC_Free)
6117 : Cost;
6118 }
6119
6120 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6121}
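// Worked example of the ImmIdx rule above (numbers are illustrative): for a
// 64-bit 'and x, 0xffff', NumConstants = divideCeil(64, 64) = 1 and the
// materialization cost of 0xffff is 1 <= 1 * TCC_Basic, so the immediate is
// reported free and constant hoisting leaves it in place.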
6122
6123InstructionCost
6124X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6125 const APInt &Imm, Type *Ty,
6126 TTI::TargetCostKind CostKind) const {
6127 assert(Ty->isIntegerTy());
6128
6129 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6130 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6131 // here, so that constant hoisting will ignore this constant.
6132 if (BitSize == 0)
6133 return TTI::TCC_Free;
6134
6135 switch (IID) {
6136 default:
6137 return TTI::TCC_Free;
6138 case Intrinsic::sadd_with_overflow:
6139 case Intrinsic::uadd_with_overflow:
6140 case Intrinsic::ssub_with_overflow:
6141 case Intrinsic::usub_with_overflow:
6142 case Intrinsic::smul_with_overflow:
6143 case Intrinsic::umul_with_overflow:
6144 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6145 return TTI::TCC_Free;
6146 break;
6147 case Intrinsic::experimental_stackmap:
6148 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6149 return TTI::TCC_Free;
6150 break;
6151 case Intrinsic::experimental_patchpoint_void:
6152 case Intrinsic::experimental_patchpoint:
6153 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6154 return TTI::TCC_Free;
6155 break;
6156 }
6157 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6158}
6159
6160InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6161 TTI::TargetCostKind CostKind,
6162 const Instruction *I) const {
6163 if (CostKind != TTI::TCK_RecipThroughput)
6164 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6165 // Branches are assumed to be predicted.
6166 return TTI::TCC_Free;
6167}
6168
6169int X86TTIImpl::getGatherOverhead() const {
6170 // Some CPUs have more overhead for gather. The specified overhead is relative
6171 // to the Load operation. "2" is the number provided by Intel architects. This
6172 // parameter is used for cost estimation of Gather Op and comparison with
6173 // other alternatives.
6174 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6175 // enable gather with a -march.
6176 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6177 return 2;
6178
6179 return 1024;
6180}
6181
6182int X86TTIImpl::getScatterOverhead() const {
6183 if (ST->hasAVX512())
6184 return 2;
6185
6186 return 1024;
6187}
6188
6189// Return the average cost of a Gather / Scatter instruction; may be improved later.
6190InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6191 TTI::TargetCostKind CostKind,
6192 Type *SrcVTy, const Value *Ptr,
6193 Align Alignment,
6194 unsigned AddressSpace) const {
6195
6196 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6197 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6198
6199 // Try to reduce index size from 64 bit (default for GEP)
6200 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6201 // operation will use 16 x 64 indices which do not fit in a zmm register and
6202 // need to be split. Also check that the base pointer is the same for all lanes,
6203 // and that there's at most one variable index.
6204 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6205 unsigned IndexSize = DL.getPointerSizeInBits();
6206 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6207 if (IndexSize < 64 || !GEP)
6208 return IndexSize;
6209
6210 unsigned NumOfVarIndices = 0;
6211 const Value *Ptrs = GEP->getPointerOperand();
6212 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6213 return IndexSize;
6214 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6215 if (isa<Constant>(GEP->getOperand(I)))
6216 continue;
6217 Type *IndxTy = GEP->getOperand(I)->getType();
6218 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6219 IndxTy = IndexVTy->getElementType();
6220 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6221 !isa<SExtInst>(GEP->getOperand(I))) ||
6222 ++NumOfVarIndices > 1)
6223 return IndexSize; // 64
6224 }
6225 return (unsigned)32;
6226 };
6227
6228 // Trying to reduce IndexSize to 32 bits for vector 16.
6229 // By default the IndexSize is equal to pointer size.
6230 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6231 ? getIndexSizeInBits(Ptr, DL)
6232 : DL.getPointerSizeInBits();
6233
6234 auto *IndexVTy = FixedVectorType::get(
6235 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6236 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6237 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6238 InstructionCost::CostType SplitFactor =
6239 std::max(IdxsLT.first, SrcLT.first).getValue();
6240 if (SplitFactor > 1) {
6241 // Handle splitting of vector of pointers
6242 auto *SplitSrcTy =
6243 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6244 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6245 Alignment, AddressSpace);
6246 }
6247
6248 // If we didn't split, this will be a single gather/scatter instruction.
6249 if (CostKind == TTI::TCK_CodeSize)
6250 return 1;
6251
6252 // The gather / scatter cost is given by Intel architects. It is a rough
6253 // number since we are looking at one instruction at a time.
6254 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6255 : getScatterOverhead();
6256 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6257 Alignment, AddressSpace, CostKind);
6258}
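// Worked example of the formula above (illustrative numbers): a v8f32 gather
// on an AVX2 CPU with fast gather gets GSOverhead = 2, so the estimate is
//   2 + 8 * getMemoryOpCost(Load, f32, Alignment, AS, CostKind),
// i.e. a gather is modeled as VF scalar loads plus a fixed overhead, while a
// CPU without fast gather pays the prohibitive 1024 overhead instead.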
6259
6260/// Calculate the cost of Gather / Scatter operation
6261InstructionCost X86TTIImpl::getGatherScatterOpCost(
6262 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6263 Align Alignment, TTI::TargetCostKind CostKind,
6264 const Instruction *I = nullptr) const {
6265 if ((Opcode == Instruction::Load &&
6266 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6267 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6268 Align(Alignment)))) ||
6269 (Opcode == Instruction::Store &&
6270 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6271 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6272 Align(Alignment)))))
6273 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6274 Alignment, CostKind, I);
6275
6276 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6277 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6278 if (!PtrTy && Ptr->getType()->isVectorTy())
6279 PtrTy = dyn_cast<PointerType>(
6280 cast<VectorType>(Ptr->getType())->getElementType());
6281 assert(PtrTy && "Unexpected type for Ptr argument");
6282 unsigned AddressSpace = PtrTy->getAddressSpace();
6283 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6284 AddressSpace);
6285}
6286
6287bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6288 const TargetTransformInfo::LSRCost &C2) const {
6289 // X86 specific here are "instruction number 1st priority".
6290 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6291 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6292 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6293 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6294}
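// The std::tie comparison above is plain lexicographic ordering: Insns is
// compared first and later fields only break ties. A self-contained sketch of
// the same idiom (ExampleCost and exampleLess are hypothetical, not LSRCost):
#include <tuple> // would normally sit at the top of the file
struct ExampleCost {
  unsigned Insns, NumRegs;
};
static bool exampleLess(const ExampleCost &A, const ExampleCost &B) {
  // Compares Insns first; NumRegs is consulted only on a tie.
  return std::tie(A.Insns, A.NumRegs) < std::tie(B.Insns, B.NumRegs);
}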
6295
6296bool X86TTIImpl::canMacroFuseCmp() const {
6297 return ST->hasMacroFusion() || ST->hasBranchFusion();
6298}
6299
6300static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6301 if (!ST->hasAVX())
6302 return false;
6303
6304 if (ScalarTy->isPointerTy())
6305 return true;
6306
6307 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6308 return true;
6309
6310 if (ScalarTy->isHalfTy() && ST->hasBWI())
6311 return true;
6312
6313 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6314 return true;
6315
6316 if (!ScalarTy->isIntegerTy())
6317 return false;
6318
6319 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6320 return IntWidth == 32 || IntWidth == 64 ||
6321 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6322}
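// Examples of the predicate above (each line assumes only the named feature
// level): f32/f64 and i32/i64 elements are legal masked load/store types on
// AVX, f16 and i8/i16 additionally require AVX512BW, and bf16 requires the
// BF16 subtarget feature.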
6323
6324bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
6325 unsigned AddressSpace) const {
6326 Type *ScalarTy = DataTy->getScalarType();
6327
6328 // The backend can't handle a single element vector w/o CFCMOV.
6329 if (isa<VectorType>(DataTy) &&
6330 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6331 return ST->hasCF() &&
6332 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6333
6334 return isLegalMaskedLoadStore(ScalarTy, ST);
6335}
6336
6337bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
6338 unsigned AddressSpace) const {
6339 Type *ScalarTy = DataTy->getScalarType();
6340
6341 // The backend can't handle a single element vector w/o CFCMOV.
6342 if (isa<VectorType>(DataTy) &&
6343 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6344 return ST->hasCF() &&
6345 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6346
6347 return isLegalMaskedLoadStore(ScalarTy, ST);
6348}
6349
6350bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6351 unsigned DataSize = DL.getTypeStoreSize(DataType);
6352 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6353 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6354 // (the equivalent stores only require AVX).
6355 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6356 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6357
6358 return false;
6359}
6360
6361bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6362 unsigned DataSize = DL.getTypeStoreSize(DataType);
6363
6364 // SSE4A supports nontemporal stores of float and double at arbitrary
6365 // alignment.
6366 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6367 return true;
6368
6369 // Besides the SSE4A subtarget exception above, only aligned stores are
6370 // available nontemporally on any other subtarget. And only stores with a size
6371 // of 4..32 bytes (powers of 2, only) are permitted.
6372 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6373 !isPowerOf2_32(DataSize))
6374 return false;
6375
6376 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6377 // loads require AVX2).
6378 if (DataSize == 32)
6379 return ST->hasAVX();
6380 if (DataSize == 16)
6381 return ST->hasSSE1();
6382 return true;
6383}
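// Worked examples of the rules above: a <8 x float> store (32 bytes) aligned
// to 32 bytes needs AVX; a <4 x i32> store (16 bytes) aligned to 16 bytes
// needs SSE1; an 8-byte store needs no extra feature; and on SSE4A a scalar
// float/double store is nontemporal-legal at any alignment (MOVNTSS/MOVNTSD).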
6384
6385bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6386 ElementCount NumElements) const {
6387 // movddup
6388 return ST->hasSSE3() && !NumElements.isScalable() &&
6389 NumElements.getFixedValue() == 2 &&
6390 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6391}
6392
6393bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6394 if (!isa<VectorType>(DataTy))
6395 return false;
6396
6397 if (!ST->hasAVX512())
6398 return false;
6399
6400 // The backend can't handle a single element vector.
6401 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6402 return false;
6403
6404 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6405
6406 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6407 return true;
6408
6409 if (!ScalarTy->isIntegerTy())
6410 return false;
6411
6412 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6413 return IntWidth == 32 || IntWidth == 64 ||
6414 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6415}
6416
6417bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy,
6418 Align Alignment) const {
6419 return isLegalMaskedExpandLoad(DataTy, Alignment);
6420}
6421
6422bool X86TTIImpl::supportsGather() const {
6423 // Some CPUs have better gather performance than others.
6424 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6425 // enable gather with a -march.
6426 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6427}
6428
6429bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
6430 Align Alignment) const {
6431 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6432 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6433 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6434 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6435 // Check whether the gather/scatter instruction is better in the VariableMask
6436 // case.
6437 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6438 return NumElts == 1 ||
6439 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6440}
6441
6442bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy,
6443 Align Alignment) const {
6444 Type *ScalarTy = DataTy->getScalarType();
6445 if (ScalarTy->isPointerTy())
6446 return true;
6447
6448 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6449 return true;
6450
6451 if (!ScalarTy->isIntegerTy())
6452 return false;
6453
6454 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6455 return IntWidth == 32 || IntWidth == 64;
6456}
6457
6458bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6459 if (!supportsGather() || !ST->preferGather())
6460 return false;
6461 return isLegalMaskedGatherScatter(DataTy, Alignment);
6462}
6463
6464bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6465 unsigned Opcode1,
6466 const SmallBitVector &OpcodeMask) const {
6467 // ADDSUBPS 4xf32 SSE3
6468 // VADDSUBPS 4xf32 AVX
6469 // VADDSUBPS 8xf32 AVX2
6470 // ADDSUBPD 2xf64 SSE3
6471 // VADDSUBPD 2xf64 AVX
6472 // VADDSUBPD 4xf64 AVX2
6473
6474 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6475 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6476 if (!isPowerOf2_32(NumElements))
6477 return false;
6478 // Check the opcode pattern. We apply the mask on the opcode arguments and
6479 // then check if it is what we expect.
6480 for (int Lane : seq<int>(0, NumElements)) {
6481 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6482 // We expect FSub for even lanes and FAdd for odd lanes.
6483 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6484 return false;
6485 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6486 return false;
6487 }
6488 // Now check that the pattern is supported by the target ISA.
6489 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6490 if (ElemTy->isFloatTy())
6491 return ST->hasSSE3() && NumElements % 4 == 0;
6492 if (ElemTy->isDoubleTy())
6493 return ST->hasSSE3() && NumElements % 2 == 0;
6494 return false;
6495}
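// A minimal sketch of a mask this predicate accepts for a 4-lane f32 addsub
// (Opcode0 = FSub, Opcode1 = FAdd); the helper is illustrative only and
// assumes llvm/ADT/SmallBitVector.h:
static llvm::SmallBitVector exampleAddSubMask() {
  llvm::SmallBitVector OpcodeMask(4); // even lanes left 0 -> Opcode0 (FSub)
  OpcodeMask.set(1);                  // odd lanes -> Opcode1 (FAdd)
  OpcodeMask.set(3);
  return OpcodeMask;                  // matches ADDSUBPS <4 x float> on SSE3
}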
6496
6497bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6498 // AVX2 doesn't support scatter
6499 if (!ST->hasAVX512() || !ST->preferScatter())
6500 return false;
6501 return isLegalMaskedGatherScatter(DataType, Alignment);
6502}
6503
6504bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6505 EVT VT = TLI->getValueType(DL, DataType);
6506 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6507}
6508
6509bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const {
6510 // FDIV is always expensive, even if it has a very low uop count.
6511 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6512 if (I->getOpcode() == Instruction::FDiv)
6513 return true;
6514
6515 return BaseT::isExpensiveToSpeculativelyExecute(I);
6516}
6517
6518bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6519
6520bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6521 const Function *Callee) const {
6522 const TargetMachine &TM = getTLI()->getTargetMachine();
6523
6524 // Work this as a subsetting of subtarget features.
6525 const FeatureBitset &CallerBits =
6526 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6527 const FeatureBitset &CalleeBits =
6528 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6529
6530 // Check whether features are the same (apart from the ignore list).
6531 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6532 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6533 if (RealCallerBits == RealCalleeBits)
6534 return true;
6535
6536 // If the features are a subset, we need to additionally check for calls
6537 // that may become ABI-incompatible as a result of inlining.
6538 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6539 return false;
6540
6541 for (const Instruction &I : instructions(Callee)) {
6542 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6543 // Having more target features is fine for inline ASM and intrinsics.
6544 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6545 continue;
6546
6547 SmallVector<Type *, 8> Types;
6548 for (Value *Arg : CB->args())
6549 Types.push_back(Arg->getType());
6550 if (!CB->getType()->isVoidTy())
6551 Types.push_back(CB->getType());
6552
6553 // Simple types are always ABI compatible.
6554 auto IsSimpleTy = [](Type *Ty) {
6555 return !Ty->isVectorTy() && !Ty->isAggregateType();
6556 };
6557 if (all_of(Types, IsSimpleTy))
6558 continue;
6559
6560 // Do a precise compatibility check.
6561 if (!areTypesABICompatible(Caller, Callee, Types))
6562 return false;
6563 }
6564 }
6565 return true;
6566}
6567
6568bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6569 const Function *Callee,
6570 ArrayRef<Type *> Types) const {
6571 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6572 return false;
6573
6574 // If we get here, we know the target features match. If one function
6575 // considers 512-bit vectors legal and the other does not, consider them
6576 // incompatible.
6577 const TargetMachine &TM = getTLI()->getTargetMachine();
6578
6579 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6580 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6581 return true;
6582
6583 // Consider the arguments compatible if they aren't vectors or aggregates.
6584 // FIXME: Look at the size of vectors.
6585 // FIXME: Look at the element types of aggregates to see if there are vectors.
6586 return llvm::none_of(Types,
6587 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6588}
6589
6590X86TTIImpl::TTI::MemCmpExpansionOptions
6591X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6592 TTI::MemCmpExpansionOptions Options;
6593 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6594 Options.NumLoadsPerBlock = 2;
6595 // All GPR and vector loads can be unaligned.
6596 Options.AllowOverlappingLoads = true;
6597 if (IsZeroCmp) {
6598 // Only enable vector loads for equality comparison. Right now the vector
6599 // version is not as fast for a three-way compare (see #33329).
6600 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6601 if (PreferredWidth >= 512 && ST->hasAVX512())
6602 Options.LoadSizes.push_back(64);
6603 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6604 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6605 }
6606 if (ST->is64Bit()) {
6607 Options.LoadSizes.push_back(8);
6608 }
6609 Options.LoadSizes.push_back(4);
6610 Options.LoadSizes.push_back(2);
6611 Options.LoadSizes.push_back(1);
6612 return Options;
6613}
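// For example, on an AVX-512 subtarget with a preferred vector width of 512
// and IsZeroCmp set, LoadSizes becomes {64, 32, 16, 8, 4, 2, 1}; since
// overlapping loads are allowed, a 96-byte equality memcmp can be expanded
// into two overlapping 64-byte vector loads per operand.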
6614
6615bool X86TTIImpl::prefersVectorizedAddressing() const {
6616 return supportsGather();
6617}
6618
6619bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6620 return false;
6621}
6622
6623bool X86TTIImpl::enableInterleavedAccessVectorization() const {
6624 // TODO: We expect this to be beneficial regardless of arch,
6625 // but there are currently some unexplained performance artifacts on Atom.
6626 // As a temporary solution, disable on Atom.
6627 return !(ST->isAtom());
6628}
6629
6630// Get estimation for interleaved load/store operations and strided load.
6631// \p Indices contains indices for strided load.
6632// \p Factor - the factor of interleaving.
6633// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6634InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6635 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6636 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6637 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6638 bool UseMaskForGaps) const {
6639 // VecTy for interleave memop is <VF*Factor x Elt>.
6640 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6641 // VecTy = <12 x i32>.
6642
6643 // Calculate the number of memory operations (NumOfMemOps), required
6644 // for load/store the VecTy.
6645 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6646 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6647 unsigned LegalVTSize = LegalVT.getStoreSize();
6648 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6649
6650 // Get the cost of one memory operation.
6651 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6652 LegalVT.getVectorNumElements());
6653 InstructionCost MemOpCost;
6654 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6655 if (UseMaskedMemOp) {
6656 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6657 : Intrinsic::masked_store;
6658 MemOpCost = getMaskedMemoryOpCost(
6659 {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6660 } else
6661 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6662 CostKind);
6663
6664 unsigned VF = VecTy->getNumElements() / Factor;
6665 MVT VT =
6666 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6667
6668 InstructionCost MaskCost;
6669 if (UseMaskedMemOp) {
6670 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6671 for (unsigned Index : Indices) {
6672 assert(Index < Factor && "Invalid index for interleaved memory op");
6673 for (unsigned Elm = 0; Elm < VF; Elm++)
6674 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6675 }
6676
6677 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6678
6679 MaskCost = getReplicationShuffleCost(
6680 I1Type, Factor, VF,
6681 UseMaskForGaps ? DemandedLoadStoreElts
6682 : APInt::getAllOnes(VecTy->getNumElements()),
6683 CostKind);
6684
6685 // The Gaps mask is invariant and created outside the loop, therefore the
6686 // cost of creating it is not accounted for here. However if we have both
6687 // a MaskForGaps and some other mask that guards the execution of the
6688 // memory access, we need to account for the cost of And-ing the two masks
6689 // inside the loop.
6690 if (UseMaskForGaps) {
6691 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6692 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6693 }
6694 }
6695
6696 if (Opcode == Instruction::Load) {
6697 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6698 // contain the cost of the optimized shuffle sequence that the
6699 // X86InterleavedAccess pass will generate.
6700 // The cost of loads and stores are computed separately from the table.
6701
6702 // X86InterleavedAccess support only the following interleaved-access group.
6703 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6704 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6705 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6706 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6707 };
6708
6709 if (const auto *Entry =
6710 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6711 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6712 // If an entry does not exist, fall back to the default implementation.
6713
6714 // Kind of shuffle depends on number of loaded values.
6715 // If we load the entire data in one register, we can use a 1-src shuffle.
6716 // Otherwise, we'll merge 2 sources in each operation.
6717 TTI::ShuffleKind ShuffleKind =
6718 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6719
6720 InstructionCost ShuffleCost = getShuffleCost(
6721 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6722
6723 unsigned NumOfLoadsInInterleaveGrp =
6724 Indices.size() ? Indices.size() : Factor;
6725 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6726 VecTy->getNumElements() / Factor);
6727 InstructionCost NumOfResults =
6728 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6729
6730 // About half of the loads may be folded into shuffles when we have only
6731 // one result. If we have more than one result, or the loads are masked,
6732 // we do not fold loads at all.
6733 unsigned NumOfUnfoldedLoads =
6734 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6735
6736 // Get a number of shuffle operations per result.
6737 unsigned NumOfShufflesPerResult =
6738 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6739
6740 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6741 // When we have more than one destination, we need additional instructions
6742 // to keep sources.
6743 InstructionCost NumOfMoves = 0;
6744 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6745 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6746
6747 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6748 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6749 NumOfMoves;
6750
6751 return Cost;
6752 }
6753
6754 // Store.
6755 assert(Opcode == Instruction::Store &&
6756 "Expected Store Instruction at this point");
6757 // X86InterleavedAccess support only the following interleaved-access group.
6758 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6759 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6760 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6761 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6762
6763 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6764 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6765 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6766 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6767 };
6768
6769 if (const auto *Entry =
6770 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6771 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6772 // If an entry does not exist, fall back to the default implementation.
6773
6774 // There are no strided stores at the moment, and a store can't be folded
6775 // into a shuffle.
6776 unsigned NumOfSources = Factor; // The number of values to be merged.
6777 InstructionCost ShuffleCost =
6778 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6779 CostKind, 0, nullptr);
6780 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6781
6782 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6783 // We need additional instructions to keep sources.
6784 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6785 InstructionCost Cost =
6786 MaskCost +
6787 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6788 NumOfMoves;
6789 return Cost;
6790}
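// Worked example of the store path above: Factor = 3 with VT = v16i8 hits
// the {3, MVT::v16i8, 12} entry, giving MaskCost + NumOfMemOps * MemOpCost
// + 12. Shapes not in the table instead pay NumOfShufflesPerStore =
// Factor - 1 two-source shuffles per memory operation plus the move
// overhead computed above.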
6791
6792InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6793 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6794 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6795 bool UseMaskForCond, bool UseMaskForGaps) const {
6796 auto *VecTy = cast<FixedVectorType>(BaseTy);
6797
6798 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6799 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6800 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6801 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6802 return true;
6803 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6804 return ST->hasBWI();
6805 if (EltTy->isBFloatTy())
6806 return ST->hasBF16();
6807 return false;
6808 };
6809 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6810 return getInterleavedMemoryOpCostAVX512(
6811 Opcode, VecTy, Factor, Indices, Alignment,
6812 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6813
6814 if (UseMaskForCond || UseMaskForGaps)
6815 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6816 Alignment, AddressSpace, CostKind,
6817 UseMaskForCond, UseMaskForGaps);
6818
6819 // Get estimation for interleaved load/store operations for SSE-AVX2.
6820 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6821 // computing the cost using a generic formula as a function of generic
6822 // shuffles. We therefore use a lookup table instead, filled according to
6823 // the instruction sequences that codegen currently generates.
6824
6825 // VecTy for interleave memop is <VF*Factor x Elt>.
6826 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6827 // VecTy = <12 x i32>.
6828 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6829
6830 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6831 // the VF=2, while v2i128 is an unsupported MVT vector type
6832 // (see MachineValueType.h::getVectorVT()).
6833 if (!LegalVT.isVector())
6834 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6835 Alignment, AddressSpace, CostKind);
6836
6837 unsigned VF = VecTy->getNumElements() / Factor;
6838 Type *ScalarTy = VecTy->getElementType();
6839 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6840 if (!ScalarTy->isIntegerTy())
6841 ScalarTy =
6842 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6843
6844 // Get the cost of all the memory operations.
6845 // FIXME: discount dead loads.
6846 InstructionCost MemOpCosts =
6847 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6848
6849 auto *VT = FixedVectorType::get(ScalarTy, VF);
6850 EVT ETy = TLI->getValueType(DL, VT);
6851 if (!ETy.isSimple())
6852 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6853 Alignment, AddressSpace, CostKind);
6854
6855 // TODO: Complete for other data-types and strides.
6856 // Each combination of Stride, element bit width and VF results in a different
6857 // sequence; The cost tables are therefore accessed with:
6858 // Factor (stride) and VectorType=VFxiN.
6859 // The Cost accounts only for the shuffle sequence;
6860 // The cost of the loads/stores is accounted for separately.
6861 //
6862 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6863 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6864 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6865 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6866 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6867 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6868
6869 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6870 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6871 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6872
6873 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6874 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6875 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6876
6877 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6878 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6879 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6880 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6881
6882 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6883 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6884 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6885 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6886 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6887
6888 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6889 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6890 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6891 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6892 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6893
6894 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6895 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6896 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6897 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6898 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6899
6900 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6901 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6902 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6903 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6904
6905 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6906 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6907 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6908 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6909 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6910
6911 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6912 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6913 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6914 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6915 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6916
6917 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6918 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6919 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6920 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6921 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6922
6923 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6924 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6925 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6926 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6927
6928 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6929 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6930 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6931 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6932 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6933
6934 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6935 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6936 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6937 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6938 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6939
6940 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6941 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6942 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6943 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6944
6945 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6946 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6947 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6948
6949 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6950 };
6951
6952 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6953 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6954 };
6955
6956 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6957 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6958 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6959
6960 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6961 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6962
6963 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6964 };
6965
6966 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6967 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6968 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6969
6970 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6971 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6972 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6973
6974 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6975 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6976 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6977 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6978
6979 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6980 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6981 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6982 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6983 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6984
6985 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6986 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6987 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6988 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6989 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6990
6991 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6992 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6993 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6994 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6995 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6996
6997 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6998 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6999 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7000 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7001 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7002
7003 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7004 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7005 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7006 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7007
7008 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7009 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7010 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7011 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7012 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7013
7014 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7015 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7016 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7017 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7018 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7019
7020 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7021 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7022 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7023 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7024 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7025
7026 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7027 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7028 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7029 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7030
7031 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7032 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7033 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7034 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7035 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7036
7037 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7038 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7039 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7040 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7041 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7042
7043 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7044 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7045 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7046 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7047
7048 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7049 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7050 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7051 };
7052
7053 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7054 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7055 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7056 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7057
7058 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7059 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7060
7061 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7062 };
7063
7064 if (Opcode == Instruction::Load) {
7065 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7066 MemOpCosts](const CostTblEntry *Entry) {
7067 // NOTE: this is just an approximation!
7068 // It can over- or under-estimate the cost!
7069 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7070 };
7071
7072 if (ST->hasAVX2())
7073 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7074 ETy.getSimpleVT()))
7075 return GetDiscountedCost(Entry);
7076
7077 if (ST->hasSSSE3())
7078 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7079 ETy.getSimpleVT()))
7080 return GetDiscountedCost(Entry);
7081
7082 if (ST->hasSSE2())
7083 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7084 ETy.getSimpleVT()))
7085 return GetDiscountedCost(Entry);
7086 } else {
7087 assert(Opcode == Instruction::Store &&
7088 "Expected Store Instruction at this point");
7089 assert((!Indices.size() || Indices.size() == Factor) &&
7090 "Interleaved store only supports fully-interleaved groups.");
7091 if (ST->hasAVX2())
7092 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7093 ETy.getSimpleVT()))
7094 return MemOpCosts + Entry->Cost;
7095
7096 if (ST->hasSSE2())
7097 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7098 ETy.getSimpleVT()))
7099 return MemOpCosts + Entry->Cost;
7100 }
7101
7102 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7103 Alignment, AddressSpace, CostKind,
7104 UseMaskForCond, UseMaskForGaps);
7105}
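// Worked example of GetDiscountedCost above (illustrative): an AVX2 load
// group with Factor = 3 and VT = v8i32 has table cost 7; if only 2 of the 3
// members are used, the estimate is MemOpCosts + divideCeil(2 * 7, 3) =
// MemOpCosts + 5.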
7106
7107InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7108 StackOffset BaseOffset,
7109 bool HasBaseReg, int64_t Scale,
7110 unsigned AddrSpace) const {
7111 // Scaling factors are not free at all.
7112 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7113 // will take 2 allocations in the out of order engine instead of 1
7114 // for plain addressing mode, i.e. inst (reg1).
7115 // E.g.,
7116 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7117 // Requires two allocations (one for the load, one for the computation)
7118 // whereas:
7119 // vaddps (%rsi), %ymm0, %ymm1
7120 // Requires just 1 allocation, i.e., freeing allocations for other operations
7121 // and having fewer micro-operations to execute.
7122 //
7123 // For some X86 architectures, this is even worse because for instance for
7124 // stores, the complex addressing mode forces the instruction to use the
7125 // "load" ports instead of the dedicated "store" port.
7126 // E.g., on Haswell:
7127 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7128 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7129 TargetLoweringBase::AddrMode AM;
7130 AM.BaseGV = BaseGV;
7131 AM.BaseOffs = BaseOffset.getFixed();
7132 AM.HasBaseReg = HasBaseReg;
7133 AM.Scale = Scale;
7134 AM.ScalableOffset = BaseOffset.getScalable();
7135 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7136 // Scale represents reg2 * scale, thus account for 1
7137 // as soon as we use a second register.
7138 return AM.Scale != 0;
7139 return InstructionCost::getInvalid();
7140}
7141
7142unsigned X86TTIImpl::getBranchMispredictPenalty() const {
7143 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7144 return 14;
7145}
7146
7147bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7148 unsigned Bits = Ty->getScalarSizeInBits();
7149
7150 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7151 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7152 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7153 return false;
7154
7155 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7156 // shifts just as cheap as scalar ones.
7157 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7158 return false;
7159
7160 // AVX512BW has shifts such as vpsllvw.
7161 if (ST->hasBWI() && Bits == 16)
7162 return false;
7163
7164 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7165 // fully general vector.
7166 return true;
7167}
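// Examples of the rule above: v8i32 on AVX2 returns false (VPSLLVD makes a
// variable shift as cheap as a uniform one), v16i16 on AVX512BW returns
// false (VPSLLVW), while v16i8 returns true everywhere except XOP, since x86
// has no variable v16i8 shift and a scalar splat amount is much cheaper.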
7168
7169unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7170 Type *ScalarValTy) const {
7171 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7172 return 4;
7173 }
7174 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7175}
7176
7177bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7178 SmallVectorImpl<Use *> &Ops) const {
7179 using namespace llvm::PatternMatch;
7180
7181 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7182 if (!VTy)
7183 return false;
7184
7185 if (I->getOpcode() == Instruction::Mul &&
7186 VTy->getElementType()->isIntegerTy(64)) {
7187 for (auto &Op : I->operands()) {
7188 // Make sure we are not already sinking this operand
7189 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7190 continue;
7191
7192 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7193 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7194 if (ST->hasSSE41() &&
7195 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7196 m_SpecificInt(32)))) {
7197 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7198 Ops.push_back(&Op);
7199 } else if (ST->hasSSE2() &&
7200 match(Op.get(),
7201 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7202 Ops.push_back(&Op);
7203 }
7204 }
7205
7206 return !Ops.empty();
7207 }
7208
7209 // A uniform shift amount in a vector shift or funnel shift may be much
7210 // cheaper than a generic variable vector shift, so make that pattern visible
7211 // to SDAG by sinking the shuffle instruction next to the shift.
7212 int ShiftAmountOpNum = -1;
7213 if (I->isShift())
7214 ShiftAmountOpNum = 1;
7215 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7216 if (II->getIntrinsicID() == Intrinsic::fshl ||
7217 II->getIntrinsicID() == Intrinsic::fshr)
7218 ShiftAmountOpNum = 2;
7219 }
7220
7221 if (ShiftAmountOpNum == -1)
7222 return false;
7223
7224 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7225 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7226 isVectorShiftByScalarCheap(I->getType())) {
7227 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7228 return true;
7229 }
7230
7231 return false;
7232}
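// The PMULDQ pattern matched above corresponds to IR like the following
// illustrative fragment, where the shl/ashr pair is a sext_inreg from i32:
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>
//   %m = mul <2 x i64> %a, %y          ; selects to PMULDQ on SSE4.1
// Sinking %s and %a next to %m keeps the pattern visible to SDAG, which
// otherwise only sees one basic block at a time.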
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
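A minimal sketch of how such a bit vector serves as an opcode mask; the lane indices are illustrative:

  SmallBitVector OpcodeMask(4);          // four lanes, all initially false
  OpcodeMask.set(1);                     // lane 1 takes the alternate opcode
  bool Alt = OpcodeMask.test(1);         // true
  unsigned NumLanes = OpcodeMask.size(); // 4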
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
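A sketch of the two factory functions; the byte counts are illustrative:

  StackOffset FixedOff = StackOffset::getFixed(16);       // 16 bytes
  StackOffset ScalableOff = StackOffset::getScalable(32); // 32 * vscale bytes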
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of an instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:381
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same element type.
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
Calculate the cost of Gather / Scatter operation.
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
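A worked example, assuming the function is reached as llvm::APIntOps::ScaleBitMask: widening a 4-bit demanded-elements mask to 8 bits splats each source bit across two result bits.

  APInt Narrow(4, 0b1010);                        // elements 1 and 3 demanded
  APInt Wide = APIntOps::ScaleBitMask(Narrow, 8); // 0b11001100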
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition ISDOpcodes.h:569
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination type.
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition ISDOpcodes.h:360
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
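A minimal sketch of the PatternMatch idiom these matchers support; the helper and the shift amount are hypothetical, not from this file:

  // Hypothetical helper: recognize a single-use (X << 3) and bind X.
  static bool matchShlByThree(Value *V, Value *&X) {
    using namespace llvm::PatternMatch;
    return match(V, m_OneUse(m_Shl(m_Value(X), m_SpecificInt(3))));
  }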
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
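A sketch of the lookup idiom used throughout this file; the helper name and table row are illustrative, not copied from the real tables:

  // Hypothetical helper: consult a one-row example table.
  static InstructionCost lookupExampleCost(MVT VT) {
    static const CostTblEntry ExampleSSE2Tbl[] = {
        {ISD::MUL, MVT::v8i16, 1}, // e.g. a single pmullw
    };
    if (const auto *Entry = CostTableLookup(ExampleSSE2Tbl, ISD::MUL, VT))
      return Entry->Cost;
    return InstructionCost::getInvalid(); // no table match
  }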
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence.
Definition STLExtras.h:2472
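A small usage sketch; the Costs vector is illustrative:

  SmallVector<unsigned, 4> Costs = {1, 2, 3};
  for (auto [Idx, C] : enumerate(Costs))
    dbgs() << "lane " << Idx << " costs " << C << "\n";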
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
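Worked values for the arithmetic helpers above (all from MathExtras.h / Alignment.h):

  assert(divideCeil(10u, 4u) == 3u);    // ceil(10 / 4)
  assert(PowerOf2Ceil(17) == 32);
  assert(alignDown(13u, 8u) == 8u);     // previous multiple of 8
  assert(alignTo(13, Align(8)) == 16);  // next multiple of 8
  assert(isPowerOf2_32(64u));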
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
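A sketch of the casting trio on a Type*; Ty is an illustrative value of type Type*:

  bool IsFixedVec = isa<FixedVectorType>(Ty);     // query only
  auto *MaybeVTy = dyn_cast<FixedVectorType>(Ty); // nullptr when not a match
  auto *VTy = cast<FixedVectorType>(Ty);          // asserts when not a match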
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
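A sketch of the conversion-table idiom used when costing casts; the helper name and table row are illustrative:

  // Hypothetical helper: look up an example (Dst, Src) conversion cost.
  static InstructionCost lookupExampleCastCost(MVT Dst, MVT Src) {
    static const TypeConversionCostTblEntry ExampleTbl[] = {
        {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1},
    };
    if (const auto *Entry =
            ConvertCostTableLookup(ExampleTbl, ISD::ZERO_EXTEND, Dst, Src))
      return Entry->Cost;
    return InstructionCost::getInvalid();
  }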
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
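A sketch of how a per-kind entry is queried once a table row is found; KC, CostKind and LT are illustrative locals:

  // KC would come from one of the static cost tables in this file.
  if (std::optional<unsigned> KindCost = KC[CostKind])
    return LT.first * *KindCost; // scale by the legalization split factor
  // std::nullopt means the table has no data for this cost kind.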
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
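A minimal sketch of the EVT-to-MVT round trip; Ctx is an illustrative LLVMContext:

  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4); // v4f32
  if (VT.isSimple() && VT.isVector()) {
    MVT SimpleVT = VT.getSimpleVT();           // safe only when isSimple()
    (void)SimpleVT;
  }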
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55