LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
60#include <optional>
61
62using namespace llvm;
63
64#define DEBUG_TYPE "x86tti"
65
66//===----------------------------------------------------------------------===//
67//
68// X86 cost model.
69//
70//===----------------------------------------------------------------------===//
71
72// Helper struct to store/access costs for each cost kind.
73// TODO: Move this to allow other targets to use it?
// NOTE(review): the struct header, the accessor's signature and the case
// labels of its switch are absent from this listing (they were dropped during
// extraction); the comments below describe only what is still visible.
//
// One cost slot per cost kind. Every slot defaults to ~0U, which the accessor
// below treats as "no cost recorded".
75 unsigned RecipThroughputCost = ~0U;
76 unsigned LatencyCost = ~0U;
77 unsigned CodeSizeCost = ~0U;
78 unsigned SizeAndLatencyCost = ~0U;
79
// Accessor: selects a cost based on Kind and returns it, or std::nullopt when
// the selected value is still the ~0U sentinel.
80 std::optional<unsigned>
82 unsigned Cost = ~0U;
83 switch (Kind) {
// Presumably each (missing) case copies the field matching Kind into Cost --
// TODO confirm against the full source.
86 break;
89 break;
92 break;
95 break;
96 }
// A value that is still the sentinel means no cost was provided for this kind.
97 if (Cost == ~0U)
98 return std::nullopt;
99 return Cost;
100 }
101};
104
// Reports the level of population-count support: TTI::PSK_FastHardware when
// the subtarget has POPCNT, TTI::PSK_Software otherwise.
// TyWidth is only validated by the assertion (it must be a power of two); it
// does not influence the returned value.
// NOTE(review): the return-type line of this definition is absent from this
// listing.
106X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
107 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
108 // TODO: Currently the __builtin_popcount() implementation using SSE3
109 // instructions is inefficient. Once the problem is fixed, we should
110 // call ST->hasSSE3() instead of ST->hasPOPCNT().
111 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
112}
113
// Returns a fixed cache size for the requested cache level: 32 KiB for the
// first level handled by the switch and 256 KiB for the second. The values are
// hard-coded and do not consult the subtarget; the bullet lists name the
// microarchitectures each value is attributed to.
// NOTE(review): the parameter list and both case labels are absent from this
// listing; presumably the two cases are the L1 and L2 data caches -- TODO
// confirm against the full source.
114std::optional<unsigned> X86TTIImpl::getCacheSize(
116 switch (Level) {
118 // - Penryn
119 // - Nehalem
120 // - Westmere
121 // - Sandy Bridge
122 // - Ivy Bridge
123 // - Haswell
124 // - Broadwell
125 // - Skylake
126 // - Kabylake
127 return 32 * 1024; // 32 KiB
129 // - Penryn
130 // - Nehalem
131 // - Westmere
132 // - Sandy Bridge
133 // - Ivy Bridge
134 // - Haswell
135 // - Broadwell
136 // - Skylake
137 // - Kabylake
138 return 256 * 1024; // 256 KiB
139 }
140
// Reached only if Level matched none of the cases above.
141 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
142}
143
// Returns the cache associativity for the requested cache level. Every level
// handled by the switch shares the same answer: the first case falls through
// to the second, which returns 8. The value is hard-coded and does not consult
// the subtarget; the bullet list names the microarchitectures it is
// attributed to.
// NOTE(review): the parameter list and the case labels are absent from this
// listing.
144std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
146 // - Penryn
147 // - Nehalem
148 // - Westmere
149 // - Sandy Bridge
150 // - Ivy Bridge
151 // - Haswell
152 // - Broadwell
153 // - Skylake
154 // - Kabylake
155 switch (Level) {
157 [[fallthrough]];
159 return 8;
160 }
161
// Reached only if Level matched none of the cases above.
162 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
163}
164
166
// NOTE(review): the signature of this function is absent from this listing;
// only its body is visible.
// Picks a register class: VectorClass when Vector is set; otherwise
// ScalarFPClass when Ty is non-null and a floating-point type; GPRClass in
// every other case (including a null Ty).
168 return Vector ? VectorClass
169 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
170 : GPRClass;
171}
172
// Returns the number of registers reported for register class ClassID on the
// current subtarget. The checks are ordered, first match wins:
//   - 0  for the vector class when SSE1 is unavailable,
//   - 8  for any class when not in 64-bit mode,
//   - 32 in 64-bit mode for the GPR class with EGPR, or for any non-GPR class
//        with AVX-512,
//   - 16 otherwise.
173unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
174 if (ClassID == VectorClass && !ST->hasSSE1())
175 return 0;
176
177 if (!ST->is64Bit())
178 return 8;
179
180 if ((ClassID == GPRClass && ST->hasEGPR()) ||
181 (ClassID != GPRClass && ST->hasAVX512()))
182 return 32;
183
184 return 16;
185}
186
// NOTE(review): the signature of this function is absent from this listing;
// only its body is visible.
// Decides whether a conditional load/store is supported for type Ty:
//   - false when the subtarget lacks the CF feature,
//   - true when no type is supplied (Ty is null),
//   - otherwise true only for an integer type, or a one-element fixed vector,
//     whose scalar width is 16, 32 or 64 bits.
188 if (!ST->hasCF())
189 return false;
190 if (!Ty)
191 return true;
192 // Conditional faulting is supported by CFCMOV, which only accepts
193 // 16/32/64-bit operands.
194 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
195 // profitable.
196 auto *VTy = dyn_cast<FixedVectorType>(Ty);
// Reject everything that is neither a scalar integer nor a one-element fixed
// vector.
197 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
198 return false;
199 auto *ScalarTy = Ty->getScalarType();
// NOTE(review): the guard above does not inspect the element type of a
// one-element vector, while the cast below requires an integer scalar type --
// confirm that a non-integer <1 x T> can never reach this point.
200 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
201 default:
202 return false;
203 case 16:
204 case 32:
205 case 64:
206 return true;
207 }
208}
209
// NOTE(review): the signature of this function and the case labels of its
// switch are absent from this listing; the grouping described below is
// inferred from the visible return statements -- TODO confirm.
// Returns a register width in bits depending on the register kind K:
//   - first case: 64 in 64-bit mode, 32 otherwise (presumably scalar registers),
//   - second case: the widest of 512/256/128 that is both supported by the
//     subtarget (AVX-512 / AVX / SSE1) and allowed by its preferred vector
//     width, or 0 if none qualifies (presumably fixed-width vectors),
//   - third case: a scalable size of 0 (presumably scalable vectors).
212 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
213 switch (K) {
215 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
// Checked widest-first so the largest permitted width wins.
217 if (ST->hasAVX512() && PreferVectorWidth >= 512)
218 return TypeSize::getFixed(512);
219 if (ST->hasAVX() && PreferVectorWidth >= 256)
220 return TypeSize::getFixed(256);
221 if (ST->hasSSE1() && PreferVectorWidth >= 128)
222 return TypeSize::getFixed(128);
223 return TypeSize::getFixed(0);
225 return TypeSize::getScalable(0);
226 }
227
// Reached only if K matched none of the cases above.
228 llvm_unreachable("Unsupported register kind");
229}
230
235
// NOTE(review): the first line of this signature (function name and the
// declaration of VF) is absent from this listing.
// Returns the interleave factor to use for a loop with vectorization factor VF:
//   - 1 when VF is scalar (the loop will not be vectorized),
//   - 1 on Atom subtargets,
//   - 4 when AVX is available,
//   - 2 otherwise.
// HasUnorderedReductions is not consulted by the visible body.
237 bool HasUnorderedReductions) const {
238 // If the loop will not be vectorized, don't interleave the loop.
239 // Let regular unroll to unroll the loop, which saves the overflow
240 // check and memory check cost.
241 if (VF.isScalar())
242 return 1;
243
244 if (ST->isAtom())
245 return 1;
246
247 // Sandybridge and Haswell have multiple execution ports and pipelined
248 // vector units.
249 if (ST->hasAVX())
250 return 4;
251
252 return 2;
253}
254
256 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
258 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
259
260 // vXi8 multiplications are always promoted to vXi16.
261 // Sub-128-bit types can be extended/packed more efficiently.
262 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
263 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
264 Type *WideVecTy =
266 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
268 CostKind) +
269 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
271 CostKind) +
272 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
273 }
274
275 // Legalize the type.
276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
277
278 int ISD = TLI->InstructionOpcodeToISD(Opcode);
279 assert(ISD && "Invalid opcode");
280
281 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
282 (LT.second.getScalarType() == MVT::i32 ||
283 LT.second.getScalarType() == MVT::i64)) {
284 // Check if the operands can be represented as a smaller datatype.
285 bool Op1Signed = false, Op2Signed = false;
286 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
287 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
288 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
289 bool SignedMode = Op1Signed || Op2Signed;
290
291 // If both vXi32 are representable as i15 and at least one is constant,
292 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
293 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
294 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
295 LT.second.getScalarType() == MVT::i32) {
296 bool Op1Constant =
297 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
298 bool Op2Constant =
299 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
300 bool Op1Sext = isa<SExtInst>(Args[0]) &&
301 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
302 bool Op2Sext = isa<SExtInst>(Args[1]) &&
303 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
304
305 bool IsZeroExtended = !Op1Signed || !Op2Signed;
306 bool IsConstant = Op1Constant || Op2Constant;
307 bool IsSext = Op1Sext || Op2Sext;
308 if (IsConstant || IsZeroExtended || IsSext)
309 LT.second =
310 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
311 }
312
313 // Check if the vXi32 operands can be shrunk into a smaller datatype.
314 // This should match the codegen from reduceVMULWidth.
315 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
316 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
317 if (OpMinSize <= 7)
318 return LT.first * 3; // pmullw/sext
319 if (!SignedMode && OpMinSize <= 8)
320 return LT.first * 3; // pmullw/zext
321 if (OpMinSize <= 15)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 if (!SignedMode && OpMinSize <= 16)
324 return LT.first * 5; // pmullw/pmulhw/pshuf
325 }
326
327 // If both vXi64 are representable as (unsigned) i32, then we can perform
328 // the multiply with a single PMULUDQ instruction.
329 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
330 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
331 ISD = X86ISD::PMULUDQ;
332 }
333
334 // Vector multiply by pow2 will be simplified to shifts.
335 // Vector multiply by -pow2 will be simplified to shifts/negates.
336 if (ISD == ISD::MUL && Op2Info.isConstant() &&
337 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
339 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
340 Op1Info.getNoProps(), Op2Info.getNoProps());
341 if (Op2Info.isNegatedPowerOf2())
342 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
343 return Cost;
344 }
345
346 // On X86, vector signed division by constants power-of-two are
347 // normally expanded to the sequence SRA + SRL + ADD + SRA.
348 // The OperandValue properties may not be the same as that of the previous
349 // operation; conservatively assume OP_None.
350 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
351 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
353 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
358 Op1Info.getNoProps(), Op2Info.getNoProps());
359
360 if (ISD == ISD::SREM) {
361 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
362 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
365 Op2Info.getNoProps());
366 }
367
368 return Cost;
369 }
370
371 // Vector unsigned division/remainder will be simplified to shifts/masks.
372 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
373 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
374 if (ISD == ISD::UDIV)
375 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
376 Op1Info.getNoProps(), Op2Info.getNoProps());
377 // UREM
378 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
379 Op1Info.getNoProps(), Op2Info.getNoProps());
380 }
381
382 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
383 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
391 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
392 };
393
394 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
395 if (const auto *Entry =
396 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
397 if (auto KindCost = Entry->Cost[CostKind])
398 return LT.first * *KindCost;
399
400 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
401 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
402 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
403 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
404 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
405 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
406 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
407 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
408 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
409 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
410
411 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
412 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
413 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
415 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
416 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
417 };
418
419 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
420 if (const auto *Entry =
421 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
422 if (auto KindCost = Entry->Cost[CostKind])
423 return LT.first * *KindCost;
424
425 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
426 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
427 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
428 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
429
430 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
431 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
432 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
433
434 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
435 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
436 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
437 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
438 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
439 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
440
441 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
442 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
443 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
444 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
445 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
446 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
447 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
448
449 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
450 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
451 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
452 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
453 };
454
455 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
456 if (const auto *Entry =
457 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
458 if (auto KindCost = Entry->Cost[CostKind])
459 return LT.first * *KindCost;
460
461 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
462 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
463 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
464 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
465 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
466 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
467 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
468
469 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
470 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
471 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
472 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
473 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
474 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
475
476 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
477 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
478 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
479 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
480 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
481 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
482
483 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
484 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
485 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
486 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
487 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
488 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
489
490 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
491 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
492 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
493 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
494 };
495
496 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
497 if (const auto *Entry =
498 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
499 if (auto KindCost = Entry->Cost[CostKind])
500 return LT.first * *KindCost;
501
502 static const CostKindTblEntry AVXUniformConstCostTable[] = {
503 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
504 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
505 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
506 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
507 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
508 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
509
510 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
511 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
512 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
513 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
514 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
515 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
516
517 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
518 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
519 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
520 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
521 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
522 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
523
524 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
525 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
526 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
527 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
528 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
529 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
530
531 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
532 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
533 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
534 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
535 };
536
537 // XOP has faster vXi8 shifts.
538 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
539 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
540 if (const auto *Entry =
541 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
542 if (auto KindCost = Entry->Cost[CostKind])
543 return LT.first * *KindCost;
544
545 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
546 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
547 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
548 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
549
550 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
551 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
552 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
553
554 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
555 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
556 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
557
558 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
559 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
560 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
561
562 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
563 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
564 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
565 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
566 };
567
568 // XOP has faster vXi8 shifts.
569 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
570 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
571 if (const auto *Entry =
572 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
573 if (auto KindCost = Entry->Cost[CostKind])
574 return LT.first * *KindCost;
575
576 static const CostKindTblEntry AVX512BWConstCostTable[] = {
577 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
580 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
581
582 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
583 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
584 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
585 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
586 };
587
588 if (Op2Info.isConstant() && ST->hasBWI())
589 if (const auto *Entry =
590 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
591 if (auto KindCost = Entry->Cost[CostKind])
592 return LT.first * *KindCost;
593
594 static const CostKindTblEntry AVX512ConstCostTable[] = {
595 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
598 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
599
600 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
601 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
602 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
603 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
604
605 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
606 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
607 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
608 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
609 };
610
611 if (Op2Info.isConstant() && ST->hasAVX512())
612 if (const auto *Entry =
613 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
614 if (auto KindCost = Entry->Cost[CostKind])
615 return LT.first * *KindCost;
616
617 static const CostKindTblEntry AVX2ConstCostTable[] = {
618 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
621 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
622
623 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
624 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
625 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
626 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
627
628 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
629 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
630 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
631 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
632 };
633
634 if (Op2Info.isConstant() && ST->hasAVX2())
635 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
636 if (auto KindCost = Entry->Cost[CostKind])
637 return LT.first * *KindCost;
638
639 static const CostKindTblEntry AVXConstCostTable[] = {
640 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
643 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
644
645 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
646 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
647 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
648 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
649
650 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
651 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
652 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
653 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
654 };
655
656 if (Op2Info.isConstant() && ST->hasAVX())
657 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
658 if (auto KindCost = Entry->Cost[CostKind])
659 return LT.first * *KindCost;
660
661 static const CostKindTblEntry SSE41ConstCostTable[] = {
662 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
663 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
664 };
665
666 if (Op2Info.isConstant() && ST->hasSSE41())
667 if (const auto *Entry =
668 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
669 if (auto KindCost = Entry->Cost[CostKind])
670 return LT.first * *KindCost;
671
672 static const CostKindTblEntry SSE2ConstCostTable[] = {
673 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
676 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
677
678 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
679 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
680 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
681 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
682
683 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
684 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
685 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
686 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
687 };
688
689 if (Op2Info.isConstant() && ST->hasSSE2())
690 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
691 if (auto KindCost = Entry->Cost[CostKind])
692 return LT.first * *KindCost;
693
694 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
695 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
696 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
697 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
698 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
699 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
700 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
701 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
702 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
703 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
704
705 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
706 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
707 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
708 };
709
710 if (ST->hasBWI() && Op2Info.isUniform())
711 if (const auto *Entry =
712 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
713 if (auto KindCost = Entry->Cost[CostKind])
714 return LT.first * *KindCost;
715
716 static const CostKindTblEntry AVX512UniformCostTable[] = {
717 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
718 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
719 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
720
721 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
722 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
723 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
724
725 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
726 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
727 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
728 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
729 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
730 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
731 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
732 };
733
734 if (ST->hasAVX512() && Op2Info.isUniform())
735 if (const auto *Entry =
736 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
737 if (auto KindCost = Entry->Cost[CostKind])
738 return LT.first * *KindCost;
739
740 static const CostKindTblEntry AVX2UniformCostTable[] = {
741 // Uniform splats are cheaper for the following instructions.
742 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
743 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
744 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
745 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
746 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
747 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
748
749 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
750 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
751 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
752 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
753 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
754 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
755
756 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
757 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
758 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
759 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
760 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
761 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
762
763 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
764 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
765 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
766 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
767 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
768 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
769 };
770
771 if (ST->hasAVX2() && Op2Info.isUniform())
772 if (const auto *Entry =
773 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
774 if (auto KindCost = Entry->Cost[CostKind])
775 return LT.first * *KindCost;
776
777 static const CostKindTblEntry AVXUniformCostTable[] = {
778 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
779 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
780 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
781 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
782 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
783 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
784
785 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
786 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
787 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
788 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
789 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
790 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
791
792 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
793 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
794 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
795 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
796 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
797 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
798
799 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
800 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
801 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
802 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
803 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
804 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
805 };
806
807 // XOP has faster vXi8 shifts.
808 if (ST->hasAVX() && Op2Info.isUniform() &&
809 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
810 if (const auto *Entry =
811 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
812 if (auto KindCost = Entry->Cost[CostKind])
813 return LT.first * *KindCost;
814
815 static const CostKindTblEntry SSE2UniformCostTable[] = {
816 // Uniform splats are cheaper for the following instructions.
817 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
818 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
819 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
820
821 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
822 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
823 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
824
825 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
826 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
827 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
828
829 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
830 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
831 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
832 };
833
834 if (ST->hasSSE2() && Op2Info.isUniform() &&
835 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
836 if (const auto *Entry =
837 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
838 if (auto KindCost = Entry->Cost[CostKind])
839 return LT.first * *KindCost;
840
841 static const CostKindTblEntry AVX512DQCostTable[] = {
842 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
843 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
844 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
845 };
846
847 // Look for AVX512DQ lowering tricks for custom cases.
848 if (ST->hasDQI())
849 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
850 if (auto KindCost = Entry->Cost[CostKind])
851 return LT.first * *KindCost;
852
853 static const CostKindTblEntry AVX512BWCostTable[] = {
854 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
855 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
856 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
857 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
858 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
859 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
860 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
861 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
862 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
863
864 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
865 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
866 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
867 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
868 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
869 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
870 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
871 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
872 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
873
874 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
875 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
876
877 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
878 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
879 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
880 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
881
882 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
883 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
884
885 { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
886 { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
887 { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
888 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
889
890 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
891 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
892 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
893 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
894 };
895
896 // Look for AVX512BW lowering tricks for custom cases.
897 if (ST->hasBWI())
898 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
899 if (auto KindCost = Entry->Cost[CostKind])
900 return LT.first * *KindCost;
901
902 static const CostKindTblEntry AVX512CostTable[] = {
903 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
904 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
905 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
906
907 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
908 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
909 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
910
911 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
912 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
913 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
914 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
915 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
916 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
917 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
918 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
919 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
920
921 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
922 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
923 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
924 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
925 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
926 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
927 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
928 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
929 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
930
931 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
932 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
933
934 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
935 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
936
937 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
938 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
939 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
940 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
941
942 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
943 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
944 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
945 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
946
947 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
948 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
949 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
950 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
951
952 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
953 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
954 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
955 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
956 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
957
958 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
959
960 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
961 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
962 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
963 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
964 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
965 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
966 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
967 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
968 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
969
970 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
971 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
972 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
973 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
974
975 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
976 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
977 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
978 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
979 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
980 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
981 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
982 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
983 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
984
985 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
986 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
987 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
988 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
989 };
990
991 if (ST->hasAVX512())
992 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
993 if (auto KindCost = Entry->Cost[CostKind])
994 return LT.first * *KindCost;
995
996 static const CostKindTblEntry AVX2ShiftCostTable[] = {
997 // Shifts on vXi64/vXi32 are legal on AVX2 even though we mark them as
998 // custom in order to detect the cases where the shift amount is a scalar.
999 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1000 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1001 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
1002 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1003 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1004 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
1005 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
1006 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
1007 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
1008 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
1009 };
1010
1011 if (ST->hasAVX512()) {
1012 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
1013 // On AVX512, a packed v32i16 shift left by a constant build_vector
1014 // is lowered into a vector multiply (vpmullw).
1015 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1016 Op1Info.getNoProps(), Op2Info.getNoProps());
1017 }
1018
1019 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
1020 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
1021 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
1022 Op2Info.isConstant())
1023 // On AVX2, a packed v16i16 shift left by a constant build_vector
1024 // is lowered into a vector multiply (vpmullw).
1025 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1026 Op1Info.getNoProps(), Op2Info.getNoProps());
1027
1028 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
1029 if (auto KindCost = Entry->Cost[CostKind])
1030 return LT.first * *KindCost;
1031 }
1032
1033 static const CostKindTblEntry XOPShiftCostTable[] = {
1034 // 128bit shifts take 1cy, but right shifts require negation beforehand.
1035 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1036 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1037 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1038 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1039 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1040 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1041 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1042 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1043 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1044 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1045 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1046 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1047 // 256bit shifts require splitting if AVX2 didn't catch them above.
1048 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1049 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1050 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1051 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1052 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1053 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1054 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1055 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1056 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1057 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1058 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1059 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1060 };
1061
1062 // Look for XOP lowering tricks.
1063 if (ST->hasXOP()) {
1064 // If the right shift is constant then we'll fold the negation so
1065 // it's as cheap as a left shift.
1066 int ShiftISD = ISD;
1067 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1068 ShiftISD = ISD::SHL;
1069 if (const auto *Entry =
1070 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1071 if (auto KindCost = Entry->Cost[CostKind])
1072 return LT.first * *KindCost;
1073 }
1074
1075 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1076 MVT VT = LT.second;
1077 // Vector shift left by non uniform constant can be lowered
1078 // into vector multiply.
1079 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1080 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1081 ISD = ISD::MUL;
1082 }
1083
1084 static const CostKindTblEntry GLMCostTable[] = {
1085 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1086 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1087 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1088 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1089 };
1090
1091 if (ST->useGLMDivSqrtCosts())
1092 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1093 if (auto KindCost = Entry->Cost[CostKind])
1094 return LT.first * *KindCost;
1095
1096 static const CostKindTblEntry SLMCostTable[] = {
1097 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1098 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1099 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1100 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1101 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1102 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1103 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1104 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1105 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1106 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1107 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1108 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1109 // v2i64/v4i64 mul is custom lowered as a series of long:
1110 // multiplies(3), shifts(3) and adds(2)
1111 // slm muldq version throughput is 2 and addq throughput 4
1112 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1113 // 2X4 (addq throughput) = 17
1114 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1115 // slm addq\subq throughput is 4
1116 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1117 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1118 };
1119
1120 if (ST->useSLMArithCosts())
1121 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1122 if (auto KindCost = Entry->Cost[CostKind])
1123 return LT.first * *KindCost;
1124
1125 static const CostKindTblEntry AVX2CostTable[] = {
1126 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1127 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1128 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1129 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1130
1131 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1132 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1133 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1134 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1135
1136 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1137 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1138 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1139 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1140 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1141 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1142
1143 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1144 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1145 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1146 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1147 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1148 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1149 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1150 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1151
1152 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1153 { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
1154 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1155 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1156 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1157 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1158 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1159
1160 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1161
1162 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1163 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1164
1165 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1166 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1167 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1168 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1169 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1170 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1171
1172 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1173 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1174 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1175 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1176 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1177 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1178
1179 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1180 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1181 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1182 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1183 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1184 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1185
1186 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1187 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1188 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1189 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1190 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1191 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1192 };
1193
1194 // Look for AVX2 lowering tricks for custom cases.
1195 if (ST->hasAVX2())
1196 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1197 if (auto KindCost = Entry->Cost[CostKind])
1198 return LT.first * *KindCost;
1199
1200 static const CostKindTblEntry AVX1CostTable[] = {
1201 // We don't have to scalarize unsupported ops. We can issue two half-sized
1202 // operations and we only need to extract the upper YMM half.
1203 // Two ops + 1 extract + 1 insert = 4.
1204 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1205 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1206 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1207 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1208 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1209 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1210
1211 { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
1212
1213 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1214 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1215 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1216 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1217
1218 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1219 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1220 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1221 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1222
1223 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1224 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1225 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1226 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1227
1228 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1229 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1230 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1231 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1232 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1233 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1234 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1235 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1236 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1237 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1238
1239 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1240 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1241 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1242 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1243 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1244 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1245 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1246 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1247
1248 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1249 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1250 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1251 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1252 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1253 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1254 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1255 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1256
1257 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1258 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1259 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1260 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1261 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1262 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1263 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1264 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1265
1266 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1267 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1268
1269 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1271 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1272 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1273 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1274 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1275
1276 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1277 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1278 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1279 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1280 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1281 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1282
1283 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1284 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1285 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1286 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1287 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1288 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1289
1290 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1291 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1292 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1293 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1294 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1295 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1296 };
1297
1298 if (ST->hasAVX())
1299 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1300 if (auto KindCost = Entry->Cost[CostKind])
1301 return LT.first * *KindCost;
1302
1303 static const CostKindTblEntry SSE42CostTable[] = {
1304 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308
1309 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1313
1314 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1317 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1318
1319 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1321 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1322 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1323
1324 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1325 };
1326
1327 if (ST->hasSSE42())
1328 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1329 if (auto KindCost = Entry->Cost[CostKind])
1330 return LT.first * *KindCost;
1331
1332 static const CostKindTblEntry SSE41CostTable[] = {
1333 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1334 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1335 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1336
1337 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1338 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1339 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1340 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1341
1342 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1343 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1344 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1345 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1346
1347 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1348 };
1349
1350 if (ST->hasSSE41())
1351 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1352 if (auto KindCost = Entry->Cost[CostKind])
1353 return LT.first * *KindCost;
1354
1355 static const CostKindTblEntry SSSE3CostTable[] = {
1356 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1357 };
1358
1359 if (ST->hasSSSE3())
1360 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1361 if (auto KindCost = Entry->Cost[CostKind])
1362 return LT.first * *KindCost;
1363
1364 static const CostKindTblEntry SSE2CostTable[] = {
1365 // We don't correctly identify costs of casts because they are marked as
1366 // custom.
1367 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1368 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1369 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1370 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1371
1372 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1373 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1374 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1375 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1376
1377 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1378 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1379 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1380 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1381
1382 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1384 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1385 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1386
1387 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1389 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1390 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1391
1392 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1394 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1395 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1396
1397 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1398 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1399
1400 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1401 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1402 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1403 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1404
1405 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1406
1407 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411
1412 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416
1417 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1419 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420
1421 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1423 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424
1425 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1426 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1427 };
1428
1429 if (ST->hasSSE2())
1430 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1431 if (auto KindCost = Entry->Cost[CostKind])
1432 return LT.first * *KindCost;
1433
1434 static const CostKindTblEntry SSE1CostTable[] = {
1435 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1436 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1437
1438 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1439 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1440
1441 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1442 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443
1444 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1445 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1446
1447 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1448 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1449 };
1450
1451 if (ST->hasSSE1())
1452 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1457 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1458 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1459 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1460 };
1461
1462 if (ST->is64Bit())
1463 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1464 if (auto KindCost = Entry->Cost[CostKind])
1465 return LT.first * *KindCost;
1466
1467 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1468 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1469 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1470 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1471
1472 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1473 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1474 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1475
1476 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1477 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1478 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1479
1480 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1481 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1482 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1483 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1484 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1485 };
1486
1487 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1488 if (auto KindCost = Entry->Cost[CostKind])
1489 return LT.first * *KindCost;
1490
1491 // It is not a good idea to vectorize division. We have to scalarize it and
1492 // in the process we will often end up having to spill regular
1493 // registers. The overhead of division is going to dominate most kernels
1494 // anyways so try hard to prevent vectorization of division - it is
1495 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1496 // to hide "20 cycles" for each lane.
1497 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1498 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1499 ISD == ISD::UREM)) {
1500 InstructionCost ScalarCost =
1501 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1502 Op1Info.getNoProps(), Op2Info.getNoProps());
1503 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1504 }
1505
1506 // Handle some basic single instruction code size cases.
1507 if (CostKind == TTI::TCK_CodeSize) {
1508 switch (ISD) {
1509 case ISD::FADD:
1510 case ISD::FSUB:
1511 case ISD::FMUL:
1512 case ISD::FDIV:
1513 case ISD::FNEG:
1514 case ISD::AND:
1515 case ISD::OR:
1516 case ISD::XOR:
1517 return LT.first;
1518 break;
1519 }
1520 }
1521
1522 // Fallback to the default implementation.
1523 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1524 Args, CxtI);
1525}
1526
1529 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1531 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1532 return TTI::TCC_Basic;
1534}
1535
1537 VectorType *DstTy, VectorType *SrcTy,
1538 ArrayRef<int> Mask,
1540 int Index, VectorType *SubTp,
1542 const Instruction *CxtI) const {
1543 assert((Mask.empty() || DstTy->isScalableTy() ||
1544 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1545 "Expected the Mask to match the return size if given");
1546 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1547 "Expected the same scalar types");
1548
1549 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1550 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1551 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1552
1553 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1554
1555 // If all args are constant then this will be constant folded away.
1556 if (!Args.empty() &&
1557 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1558 return TTI::TCC_Free;
1559
1560 // Recognize a basic concat_vector shuffle.
1561 if (Kind == TTI::SK_PermuteTwoSrc &&
1562 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1563 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1567 CostKind, Mask.size() / 2, SrcTy);
1568
1569 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1570 if (Kind == TTI::SK_Transpose)
1571 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1572 Kind = TTI::SK_PermuteTwoSrc;
1573
1574 if (Kind == TTI::SK_Broadcast) {
1575 // For Broadcasts we are splatting the first element from the first input
1576 // register, so only need to reference that input and all the output
1577 // registers are the same.
1578 LT.first = 1;
1579
1580 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1581 // If a load has many uses and every use is one of a small set of operations
1582 // that SLP can rewrite into a single vector lane, codegen can fold it into
1583 // the free broadcast.
1584 using namespace PatternMatch;
1585 auto IsBroadcastLoadFoldUser = [&](const User *U) {
1586 if (isa<InsertElementInst>(U) && U->getOperand(1) == Args[0])
1587 return true;
1588 if (U->getType()->isVectorTy())
1589 return false;
1590 // Terminators (return/branch/switch/indirectbr/resume/invoke EH)
1591 // and phis carry the value across control flow.
1592 if (const auto *I = dyn_cast<Instruction>(U))
1593 if (I->isTerminator() ||
1595 return false;
1596 // Only pure calls can be folded.
1597 if (const auto *CB = dyn_cast<CallBase>(U))
1598 return CB->doesNotAccessMemory() && !CB->mayHaveSideEffects();
1599 return true;
1600 };
1601 auto IsFoldableSLPBroadcastLoad = [&]() {
1602 if (!match(Args[0], m_Load(m_Value())))
1603 return false;
1604 auto *FVT = dyn_cast<FixedVectorType>(DstTy);
1605 if (!FVT)
1606 return false;
1607 // getNumUses() counts each Use, matching the per-lane broadcast
1608 // accounting (a use like `op %x, %x` consumes two broadcast lanes).
1609 if (Args[0]->getNumUses() != FVT->getNumElements())
1610 return false;
1611 return all_of(Args[0]->users(), IsBroadcastLoadFoldUser);
1612 };
1613 if (!Args.empty() &&
1614 (match(Args[0], m_OneUse(m_Load(m_Value()))) ||
1615 IsFoldableSLPBroadcastLoad()) &&
1616 (ST->hasAVX2() ||
1617 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1618 return TTI::TCC_Free;
1619 }
1620
1621 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1622 // permutation.
1623 // Attempt to detect a shuffle mask with a single defined element.
1624 bool IsInLaneShuffle = false;
1625 bool IsSingleElementMask = false;
1626 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1627 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1628 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1629 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1630 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1631 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1632 if ((Mask.size() % NumLanes) == 0) {
1633 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1634 return P.value() == PoisonMaskElem ||
1635 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1636 (P.index() / NumEltsPerLane);
1637 });
1638 IsSingleElementMask =
1639 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1640 return M == PoisonMaskElem;
1641 }));
1642 }
1643 }
1644
1645 // Treat <X x bfloat> shuffles as <X x half>.
1646 if (LT.second.isVectorOf(MVT::bf16))
1647 LT.second = LT.second.changeVectorElementType(MVT::f16);
1648
1649 // Subvector extractions are free if they start at the beginning of a
1650 // vector and cheap if the subvectors are aligned.
1651 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1652 int NumElts = LT.second.getVectorNumElements();
1653 if ((Index % NumElts) == 0)
1654 return TTI::TCC_Free;
1655 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1656 if (SubLT.second.isVector()) {
1657 int NumSubElts = SubLT.second.getVectorNumElements();
1658 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1659 return SubLT.first;
1660 // Handle some cases for widening legalization. For now we only handle
1661 // cases where the original subvector was naturally aligned and evenly
1662 // fit in its legalized subvector type.
1663 // FIXME: Remove some of the alignment restrictions.
1664 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1665 // vectors.
1666 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1667 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1668 (NumSubElts % OrigSubElts) == 0 &&
1669 LT.second.getVectorElementType() ==
1670 SubLT.second.getVectorElementType() &&
1671 LT.second.getVectorElementType().getSizeInBits() ==
1672 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1673 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1674 "Unexpected number of elements!");
1675 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1676 LT.second.getVectorNumElements());
1677 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1678 SubLT.second.getVectorNumElements());
1679 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1680 InstructionCost ExtractCost =
1682 ExtractIndex, SubTy);
1683
1684 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1685 // if we have SSSE3 we can use pshufb.
1686 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1687 return ExtractCost + 1; // pshufd or pshufb
1688
1689 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1690 "Unexpected vector size");
1691
1692 return ExtractCost + 2; // worst case pshufhw + pshufd
1693 }
1694 }
1695 // If the extract subvector is not optimal, treat it as single op shuffle.
1697 }
1698
1699 // Subvector insertions are cheap if the subvectors are aligned.
1700 // Note that in general, the insertion starting at the beginning of a vector
1701 // isn't free, because we need to preserve the rest of the wide vector,
1702 // but if the destination vector legalizes to the same width as the subvector
1703 // then the insertion will simplify to a (free) register copy.
1704 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1705 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1706 int NumElts = DstLT.second.getVectorNumElements();
1707 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1708 if (SubLT.second.isVector()) {
1709 int NumSubElts = SubLT.second.getVectorNumElements();
1710 bool MatchingTypes =
1711 NumElts == NumSubElts &&
1712 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1713 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1714 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1715 }
1716
1717 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1718 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1719 // v1f32 (legalised to f32) into a v4f32.
1720 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1721 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1722 return 1;
1723
1724 // If the insertion is the lowest subvector then it will be blended
1725 // otherwise treat it like a 2-op shuffle.
1726 Kind =
1727 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1728 }
1729
1730 // Handle some common (illegal) sub-vector types as they are often very cheap
1731 // to shuffle even on targets without PSHUFB.
1732 EVT VT = TLI->getValueType(DL, SrcTy);
1733 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1734 !ST->hasSSSE3()) {
1735 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1736 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1737 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1738 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1739 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1740 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1741
1742 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1743 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1744 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1745 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1746
1747 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1748 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1749 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1750 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1751
1752 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1753 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1754 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1755 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1756 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1757
1758 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1759 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1760 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1761 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1762 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1763 };
1764
1765 if (ST->hasSSE2())
1766 if (const auto *Entry =
1767 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1768 if (auto KindCost = Entry->Cost[CostKind])
1769 return LT.first * *KindCost;
1770 }
1771
1772 // We are going to permute multiple sources and the result will be in multiple
1773 // destinations. We provide an accurate cost only for splits where the element
1774 // type remains the same.
1775 if (LT.first != 1) {
1776 MVT LegalVT = LT.second;
1777 if (LegalVT.isVector() &&
1778 LegalVT.getVectorElementType().getSizeInBits() ==
1779 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1780 LegalVT.getVectorNumElements() <
1781 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1782 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1783 unsigned LegalVTSize = LegalVT.getStoreSize();
1784 // Number of source vectors after legalization:
1785 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1786 // Number of destination vectors after legalization:
1787 InstructionCost NumOfDests = LT.first;
1788
1789 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1790 LegalVT.getVectorNumElements());
1791
1792 if (!Mask.empty() && NumOfDests.isValid()) {
1793 // Try to perform better estimation of the permutation.
1794 // 1. Split the source/destination vectors into real registers.
1795 // 2. Do the mask analysis to identify which real registers are
1796 // permuted. If more than one source register is used to build the
1797 // destination register, the cost for this destination register
1798 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1799 // source register is used, build the mask and calculate the cost as the
1800 // cost of PermuteSingleSrc.
1801 // Also, for the single register permute we try to identify if the
1802 // destination register is just a copy of the source register or the
1803 // copy of the previous destination register (the cost is
1804 // TTI::TCC_Basic). If the source register is just reused, the cost for
1805 // this operation is TTI::TCC_Free.
1806 NumOfDests =
1808 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1809 .first;
1810 unsigned E = NumOfDests.getValue();
1811 unsigned NormalizedVF =
1812 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1813 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1814 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1815 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1816 copy(Mask, NormalizedMask.begin());
1817 unsigned PrevSrcReg = 0;
1818 ArrayRef<int> PrevRegMask;
1821 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1822 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1823 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1824 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1825 // Check if the previous register can be just copied to the next
1826 // one.
1827 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1828 PrevRegMask != RegMask)
1829 Cost +=
1831 SingleOpTy, RegMask, CostKind, 0, nullptr);
1832 else
1833 // Just a copy of previous destination register.
1835 return;
1836 }
1837 if (SrcReg != DestReg &&
1838 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1839 // Just a copy of the source register.
1841 }
1842 PrevSrcReg = SrcReg;
1843 PrevRegMask = RegMask;
1844 },
1845 [this, SingleOpTy, CostKind,
1846 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1847 unsigned /*Unused*/, bool /*Unused*/) {
1849 SingleOpTy, RegMask, CostKind, 0, nullptr);
1850 });
1851 return Cost;
1852 }
1853
1854 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1855 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1856 SingleOpTy, {}, CostKind, 0,
1857 nullptr);
1858 }
1859
1860 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1861 SubTp);
1862 }
1863
1864 // If we're just moving a single element around (probably as an alternative to
1865 // extracting it), we can assume this is cheap.
1866 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1867 return TTI::TCC_Basic;
1868
1869 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1870 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1871 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1872 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1873 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1874 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1875 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1876 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1877 };
1878
1879 if (ST->hasVBMI())
1880 if (const auto *Entry =
1881 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1882 if (auto KindCost = Entry->Cost[CostKind])
1883 return LT.first * *KindCost;
1884
1885 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1886 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1887 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1888 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1889
1890 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1891 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1892 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1893 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1894 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1895
1896 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1897 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1898 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1899 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1900 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1901
1902 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1903 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1904 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1905 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1906 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1907
1908 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1909 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1910
1911 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1912 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1913 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1914 };
1915
1916 if (ST->hasBWI())
1917 if (const auto *Entry =
1918 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1919 if (auto KindCost = Entry->Cost[CostKind])
1920 return LT.first * *KindCost;
1921
1922 static const CostKindTblEntry AVX512InLaneShuffleTbl[] = {
1923 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } },
1924 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } },
1925 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } },
1926 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } },
1927 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } },
1928 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } },
1929 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } },
1930 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } },
1931 };
1932
1933 if (IsInLaneShuffle && ST->hasAVX512())
1934 if (const auto *Entry =
1935 CostTableLookup(AVX512InLaneShuffleTbl, Kind, LT.second))
1936 if (auto KindCost = Entry->Cost[CostKind])
1937 return LT.first * *KindCost;
1938
1939 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1940 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1941 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1942 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1943 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1944 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1945 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1946 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1947 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1948 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1949 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1950 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1951 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1952 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1953 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1954
1955 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1956 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1957 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1958 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1959 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1960 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1961 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1962
1963 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1964 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1965 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1966 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1967 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1968 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1969 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1970 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1971 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1972 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1973 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1974
1975 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1976 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1977 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1978 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1979 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1980 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1981 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1982 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1983 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1984 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1985 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1986 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1987 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1988
1989 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 2, 3, 1, 1 } }, // vpermt2pd
1990 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 2, 3, 1, 1 } }, // vpermt2ps
1991 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 2, 3, 1, 1 } }, // vpermt2q
1992 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 2, 3, 1, 1 } }, // vpermt2d
1993 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 3, 1, 1 } }, // vpermt2pd
1994 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 3, 1, 1 } }, // vpermt2ps
1995 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 3, 1, 1 } }, // vpermt2q
1996 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 3, 1, 1 } }, // vpermt2d
1997 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } },
1998 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } },
1999 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } },
2000 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } },
2001
2002 // FIXME: This just applies the type legalization cost rules above
2003 // assuming these completely split.
2004 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
2005 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
2006 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
2007 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
2008 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
2009 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
2010
2011 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
2012 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
2013 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
2014 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
2015 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
2016 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
2017 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
2018 };
2019
2020 if (ST->hasAVX512())
2021 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
2022 if (auto KindCost = Entry->Cost[CostKind])
2023 return LT.first * *KindCost;
2024
2025 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
2026 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
2027 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
2028 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
2029
2030 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2031 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2032
2033 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2034 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2035 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2036 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2037 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2038 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2039 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2040 };
2041
2042 if (IsInLaneShuffle && ST->hasAVX2())
2043 if (const auto *Entry =
2044 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
2045 if (auto KindCost = Entry->Cost[CostKind])
2046 return LT.first * *KindCost;
2047
2048 static const CostKindTblEntry AVX2ShuffleTbl[] = {
2049 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
2050 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
2051 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2052 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2053 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2054 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2055 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2056 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2057 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2058 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2059
2060 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2061 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2062 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2063 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2064 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2065 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2066 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2067
2068 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2069 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2070 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2071
2072 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2073 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2074 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2075 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2076 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2077
2078 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2079 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2080 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2081 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2082 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2083 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2084 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2085
2086 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2087 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2088 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2089 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2090 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2091 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2092 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2093 };
2094
2095 if (ST->hasAVX2())
2096 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2097 if (auto KindCost = Entry->Cost[CostKind])
2098 return LT.first * *KindCost;
2099
2100 static const CostKindTblEntry XOPShuffleTbl[] = {
2101 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2102 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2103 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2104 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2105 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2106 // + vinsertf128
2107 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2108 // + vinsertf128
2109
2110 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2111 // + vinsertf128
2112
2113 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2114 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2115 // + vinsertf128
2116 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2117 };
2118
2119 if (ST->hasXOP())
2120 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2121 if (auto KindCost = Entry->Cost[CostKind])
2122 return LT.first * *KindCost;
2123
2124 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2125 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2126 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2127 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2128 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2129
2130 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2131 // + vpor + vinsertf128
2132 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2133 // + vpor + vinsertf128
2134 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2135 // + vpor + vinsertf128
2136
2137 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2138 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2139
2140 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2141 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2142 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2143 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2144 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2145 // + 2*vpor + vinsertf128
2146 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2147 // + 2*vpor + vinsertf128
2148 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2149 // + 2*vpor + vinsertf128
2150 };
2151
2152 if (IsInLaneShuffle && ST->hasAVX())
2153 if (const auto *Entry =
2154 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2155 if (auto KindCost = Entry->Cost[CostKind])
2156 return LT.first * *KindCost;
2157
2158 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2159 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2160 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2161 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2162 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2163 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2164 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2165 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2166
2167 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2168 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2169 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2170 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2171 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2172 // + vinsertf128
2173 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2174 // + vinsertf128
2175 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2176 // + vinsertf128
2177
2178 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2179 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2180 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2181 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2182 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2183 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2184 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2185
2186 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2187 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2188 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2189 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2190 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2191 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2192 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2193
2194 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2195 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2196 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2197 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2198 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2199 // + 2*por + vinsertf128
2200 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2201 // + 2*por + vinsertf128
2202 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2203 // + 2*por + vinsertf128
2204
2205 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2206 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2207 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2208 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2209 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2210 // + 4*por + vinsertf128
2211 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2212 // + 4*por + vinsertf128
2213 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2214 // + 4*por + vinsertf128
2215 };
2216
2217 if (ST->hasAVX())
2218 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2219 if (auto KindCost = Entry->Cost[CostKind])
2220 return LT.first * *KindCost;
2221
2222 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2223 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2224 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2225 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2226 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2227 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2228 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2229 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2230 };
2231
2232 if (ST->hasSSE41())
2233 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2234 if (auto KindCost = Entry->Cost[CostKind])
2235 return LT.first * *KindCost;
2236
2237 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2238 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2239 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2240 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2241
2242 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2243 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2244 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2245
2246 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2247 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2248 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2249 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2250 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2251
2252 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2253 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2254 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2255
2256 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2257 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2258 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2259 };
2260
2261 if (ST->hasSSSE3())
2262 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2263 if (auto KindCost = Entry->Cost[CostKind])
2264 return LT.first * *KindCost;
2265
2266 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2267 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2268 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2269 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2270 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2271 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2272 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2273
2274 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2275 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2276 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2277 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2278 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2279 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2280 // + 2*pshufd + 2*unpck + packus
2281
2282 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2283 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2284 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2285 {TTI::SK_Select, MVT::v8i16, {2, 2, 3, 3}}, // pand + pandn + por
2286 {TTI::SK_Select, MVT::v8f16, {2, 2, 3, 3}}, // pand + pandn + por
2287 {TTI::SK_Select, MVT::v16i8, {2, 2, 3, 3}}, // pand + pandn + por
2288
2289 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2290 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2291 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2292 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2293 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2294 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2295
2296 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2297 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2298 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2299 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2300 // + pshufd/unpck
2301 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2302 // + pshufd/unpck
2303 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2304 // + 2*pshufd + 2*unpck + 2*packus
2305
2306 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2307 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2308 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2309 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2310 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2311 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2312 };
2313
2314 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2315 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2316 };
2317
2318 if (ST->hasSSE2()) {
2319 bool IsLoad =
2320 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2321 if (ST->hasSSE3() && IsLoad)
2322 if (const auto *Entry =
2323 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2324 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2325 LT.second.getVectorElementCount()) &&
2326 "Table entry missing from isLegalBroadcastLoad()");
2327 return LT.first * Entry->Cost;
2328 }
2329
2330 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2331 if (auto KindCost = Entry->Cost[CostKind])
2332 return LT.first * *KindCost;
2333 }
2334
2335 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2336 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2337 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2338 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2339 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2340 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2341 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2342 };
2343
2344 if (ST->hasSSE1()) {
2345 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2346 // SHUFPS: both pairs must come from the same source register.
2347 auto MatchSHUFPS = [](int X, int Y) {
2348 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2349 };
2350 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2351 return 1;
2352 }
2353 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2354 if (auto KindCost = Entry->Cost[CostKind])
2355 return LT.first * *KindCost;
2356 }
2357
2358 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2359 SubTp);
2360}
2361
2363 Type *Src,
2366 const Instruction *I) const {
2367 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2368 assert(ISD && "Invalid opcode");
2369
2370 // The cost tables include both specific, custom (non-legal) src/dst type
2371 // conversions and generic, legalized types. We test for customs first, before
2372 // falling back to legalization.
2373 // FIXME: Need a better design of the cost table to handle non-simple types of
2374 // potential massive combinations (elem_num x src_type x dst_type).
2375 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2376 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2377 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2378
2379 // Mask sign extend has an instruction.
2380 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2381 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2382 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2383 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2384 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2386 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2387 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2388 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2389 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2397
2398 // Mask zero extend is a sext + shift.
2399 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2400 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2401 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2402 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2403 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2405 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2406 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2407 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2408 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2409 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2410 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2411 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2412 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2413 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2414 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2415 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2416
2417 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2418 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2419 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2420 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2421 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2422 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2423 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2424 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2425 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2426 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2427 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2428 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2429 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2430 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2431 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2432 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2433 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2434
2435 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2436 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2437 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2438 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2439 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2440 };
2441
2442 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2443 // Mask sign extend has an instruction.
2444 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2446 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2448 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2450 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2452
2453 // Mask zero extend is a sext + shift.
2454 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2455 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2456 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2457 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2458 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2459 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2460 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2461 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2462
2463 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2464 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2465 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2466 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2467 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2468 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2469 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2470 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2471
2472 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2473 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2474
2475 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2476 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2477
2478 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2479 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2480
2481 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2482 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2483 };
2484
2485 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2486 // 256-bit wide vectors.
2487
2488 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2489 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2490 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2491 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2492 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2493 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2494 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2495 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2496
2497 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2498 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2499 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2500 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2501 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2502 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2503 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2504 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2505 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2506 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2507 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2508 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2509 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2510 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2511 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2512 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2513 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2514 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2515 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2516 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2517 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2518 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2519 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2520 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2521 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2522 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2523 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2524 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2525 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2526 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2527 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2528 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2529 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2530 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2531
2532 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2533 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2534 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2535
2536 // Sign extend is zmm vpternlogd+vpmovdb.
2537 // Zero extend is zmm broadcast load+vpmovdb.
2538 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2539 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2540 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2542 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2544 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2546
2547 // Sign extend is zmm vpternlogd+vpmovdw.
2548 // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw.
2549 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2550 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2551 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2552 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2553 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2555 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2556 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2557
2558 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2559 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2560 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2561 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2562 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2563 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2564 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2565 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2566 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2567 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2568
2569 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2570 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2571 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2572 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2573
2574 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2575 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2577 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2579 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2581 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2583 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2584
2585 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2586 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2587
2588 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2589 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2590 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2591 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2592 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2593 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2594 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2595 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2596
2597 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2598 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2599 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2600 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2601 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2602 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2603 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2604 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2605 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2606 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2607
2608 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2609 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2610 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2611 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2612 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2613 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2614 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2615 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2616 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2617 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2618 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2619
2620 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2621 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2622 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2623 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2624 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2625 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2626 };
2627
// Cast costs between mask vectors (vXi1) and vectors of 8/16-bit elements,
// plus one v16i16 -> v16i8 truncate.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to BWI+VLX subtargets; confirm
// against the feature checks at the lookup site.
2628 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2629 // Mask sign extend has an instruction.
2630 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2631 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2632 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2633 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2634 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2635 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2643 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2644 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2645 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2647
2648 // Mask zero extend is a sext + shift.
2649 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2653 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2655 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2657 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2659 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2661 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2664 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2665 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2666
2667 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2668 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2669 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2670 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2671 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2680 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2681 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2682 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2683 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2684
2685 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2686 };
2687
// Cast costs between mask vectors (vXi1) and vectors of 32/64-bit elements,
// plus 64-bit integer <-> floating point conversions on 2- and 4-element
// vectors.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to DQI+VLX subtargets; confirm
// against the feature checks at the lookup site.
2688 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2689 // Mask sign extend has an instruction.
2690 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2691 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2692 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2693 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2694 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2695 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2696 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2697 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2698
2699 // Mask zero extend is a sext + shift.
2700 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2701 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2702 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2704 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2705 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2706 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2708
2709 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2710 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2711 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2712 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2713 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2714 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2715 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2716 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2717
2718 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2719 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2720 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2721 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2722
2723 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2724 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2725 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2726 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2727
2728 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2729 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2730 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2731 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2732
2733 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2734 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2735 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2736 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2737 };
2738
// Cast costs for truncates, mask extends, integer extends and int <-> fp
// conversions; the trailing comments name the instruction sequence each cost
// is based on.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to AVX512F+VLX subtargets;
// confirm against the feature checks at the lookup site.
2739 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2740 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2741 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2742 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2743 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2744 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2745 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2746 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2747 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2748 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2749 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2750 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2751 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2752 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2753 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2754 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2755 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2756 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2757 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2758
2759 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2760 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2761 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2762 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2763 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2764 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2765 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2766 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2767 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2768 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2769
2770 // sign extend is vpcmpeq+maskedmove+vpmovdw
2771 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2772 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2773 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2774 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2775 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2776 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2777 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2778 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2779 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2780
2781 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2782 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2783 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2784 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2785 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2786 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2787 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2788 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2789
2790 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2791 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2792 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2793 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2794
2795 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2803 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2804 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2807
2808 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2809 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2810 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2811 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2812
2813 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2814 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2815 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2816 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2817 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2820 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2821 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2822 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2823 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2824 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2825 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2826
2827 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2828 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2829 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2832 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2833 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2838 };
2839
// Cast costs for integer extends/truncates, fp extend/round and int <-> fp
// conversions.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to AVX2 subtargets; confirm
// against the feature checks at the lookup site.
2840 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2841 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2842 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2843 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2844 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2845 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2846 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2847
2848 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2849 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2850 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2851 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2852 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2853 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2854 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2855 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2856 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2857 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2858 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2859 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2860 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2861 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2862
2863 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2864
2865 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2866 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2867 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2868 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2869 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2870 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2871 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2872 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2873 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2874 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2875 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2876 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2877
2878 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2879 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2880
2881 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2882 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2883 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2884 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2885
2886 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2888 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2889 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2890 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2891 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2892 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2893 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } }, // NOTE(review): 8-element dst vs 4-element src; possibly meant v8f64 (cf. the FP_TO_SINT v8i32 entry) - confirm
2894
2895 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2896 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2902
2903 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2904 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2905 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2906 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2907 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2913 };
2914
// Cast costs for integer extends/truncates, int <-> fp conversions and
// v4f32 <-> v4f64 extend/round.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to AVX subtargets; confirm
// against the feature checks at the lookup site.
2915 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2916 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2917 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2918 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2919 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2920 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2921 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2922
2923 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2924 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2925 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2926 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2927 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2928 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2929 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2930 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2931 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2932 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2933 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2934 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2935
2936 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2937 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2938 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2939 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2940 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2941
2942 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2943 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2944 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2945 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2946 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2947 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2948 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2949 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2950
2951 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2952 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2953 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2954 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2955 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2956 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2957 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2958 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2959 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2960 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2961 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2962 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2963
2964 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2965 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2966 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2967 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2968 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2969 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2970 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2971 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2972 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2973 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2974 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2975 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2976 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2977 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2978 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2979 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2980 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2981
2982 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2983 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2984 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2985 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2986 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2987 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2988 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2989 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2990 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2991 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2992 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2993
2994 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2995 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2996 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2997 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2998 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2999 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
3000 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
3001 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
3002 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
3003 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3004 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
3005 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
3006 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } }, // NOTE(review): 8-element dst vs 4-element src; possibly meant v8f64 (cf. the FP_TO_SINT v8i32 entry) - confirm
3007
3008 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
3009 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
3010 };
3011
// Cast costs for integer extends/truncates and scalar/vector int <-> fp
// conversions.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to SSE4.1 subtargets; confirm
// against the feature checks at the lookup site.
3012 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
3013 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3014 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3015 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3016 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3017 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3018 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3019 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3020 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3021 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3022 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3023 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3024 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3025
3026 // These truncates end up widening elements.
3027 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
3028 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
3029 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
3030
3031 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
3032 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
3033 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
3034
3035 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3036 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3037 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
3046
3047 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3048 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3049 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3060 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3061
3062 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3063 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3064 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3071 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3072
3073 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3074 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3075 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3082 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3083 };
3084
// Cast costs for scalar/vector int <-> fp conversions, integer extends and
// truncates.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind. Only the first cost slot varies in this table; the other three
// are always 1.
// NOTE(review): the name suggests this applies to SSE2 subtargets; confirm
// against the feature checks at the lookup site.
3085 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3086 // These are somewhat magic numbers justified by comparing the
3087 // output of llvm-mca for our various supported scheduler models
3088 // and basing it off the worst case scenario.
3089 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3090 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3091 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3092 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3093 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3094 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3095 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3096 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3097 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3098 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3099 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3100 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3101
3102 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3103 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3104 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3105 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3106 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3107 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3108 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3109 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3110 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3111 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3112 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3113 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3114 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3115
3116 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3117 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3118 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3119 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3120 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3121 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3122 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3123 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3124 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3125 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3126
3127 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3128 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3129 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3130 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3131 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3132 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3133 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3134 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3135 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3136 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3137
3138 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3139 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3140 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3141 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3142 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3143 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3144 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3145 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3146 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3147 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3148 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3149 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3150
3151 // These truncates are really widening elements.
3152 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3153 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3154 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3155 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3156 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3157 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3158
3159 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3160 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3161 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3162 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3163 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3164 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3165 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3166 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3167 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3168 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3169 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3170 };
3171
// Cast costs for half-precision (f16) rounds and extends, scalar and 4/8
// element vectors; the f16 -> f64 entries cost 2 because they go through f32.
// Entry layout: { ISD opcode, destination MVT, source MVT, { four costs } }.
// Sibling tables in this function are queried with
// ConvertCostTableLookup(Tbl, ISD, Dst, Src) and the cost array is indexed by
// CostKind.
// NOTE(review): which subtarget feature gates this table is not visible
// here; confirm at the lookup site.
3172 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3173 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3174 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3175 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3176 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3177 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3178 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3179 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3180 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3181 };
3182
3183 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3184 EVT SrcTy = TLI->getValueType(DL, Src);
3185 EVT DstTy = TLI->getValueType(DL, Dst);
3186
3187 // If we're sign-extending a vector comparison result back to the comparison
3188 // width, this will be free without AVX512 (or for 8/16-bit types without
3189 // BWI).
3190 if (!ST->hasAVX512() || (!ST->hasBWI() && DstTy.getScalarSizeInBits() < 32)) {
3191 if (I && Opcode == Instruction::CastOps::SExt &&
3192 SrcTy.isFixedLengthVectorOf(MVT::i1)) {
3193 if (auto *CmpI = dyn_cast<CmpInst>(I->getOperand(0))) {
3194 Type *CmpTy = CmpI->getOperand(0)->getType();
3195 if (CmpTy->getScalarSizeInBits() == DstTy.getScalarSizeInBits())
3196 return TTI::TCC_Free;
3197 }
3198 }
3199 }
3200
3201 // The function getSimpleVT only handles simple value types.
3202 if (SrcTy.isSimple() && DstTy.isSimple()) {
3203 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3204 MVT SimpleDstTy = DstTy.getSimpleVT();
3205
3206 if (ST->useAVX512Regs()) {
3207 if (ST->hasBWI())
3208 if (const auto *Entry = ConvertCostTableLookup(
3209 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3210 if (auto KindCost = Entry->Cost[CostKind])
3211 return *KindCost;
3212
3213 if (ST->hasDQI())
3214 if (const auto *Entry = ConvertCostTableLookup(
3215 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3216 if (auto KindCost = Entry->Cost[CostKind])
3217 return *KindCost;
3218
3219 if (ST->hasAVX512())
3220 if (const auto *Entry = ConvertCostTableLookup(
3221 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3222 if (auto KindCost = Entry->Cost[CostKind])
3223 return *KindCost;
3224 }
3225
3226 if (ST->hasBWI())
3227 if (const auto *Entry = ConvertCostTableLookup(
3228 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3229 if (auto KindCost = Entry->Cost[CostKind])
3230 return *KindCost;
3231
3232 if (ST->hasDQI())
3233 if (const auto *Entry = ConvertCostTableLookup(
3234 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3235 if (auto KindCost = Entry->Cost[CostKind])
3236 return *KindCost;
3237
3238 if (ST->hasAVX512())
3239 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3240 SimpleDstTy, SimpleSrcTy))
3241 if (auto KindCost = Entry->Cost[CostKind])
3242 return *KindCost;
3243
3244 if (ST->hasAVX2()) {
3245 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3246 SimpleDstTy, SimpleSrcTy))
3247 if (auto KindCost = Entry->Cost[CostKind])
3248 return *KindCost;
3249 }
3250
3251 if (ST->hasAVX()) {
3252 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3253 SimpleDstTy, SimpleSrcTy))
3254 if (auto KindCost = Entry->Cost[CostKind])
3255 return *KindCost;
3256 }
3257
3258 if (ST->hasF16C()) {
3259 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3260 SimpleDstTy, SimpleSrcTy))
3261 if (auto KindCost = Entry->Cost[CostKind])
3262 return *KindCost;
3263 }
3264
3265 if (ST->hasSSE41()) {
3266 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3267 SimpleDstTy, SimpleSrcTy))
3268 if (auto KindCost = Entry->Cost[CostKind])
3269 return *KindCost;
3270 }
3271
3272 if (ST->hasSSE2()) {
3273 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3274 SimpleDstTy, SimpleSrcTy))
3275 if (auto KindCost = Entry->Cost[CostKind])
3276 return *KindCost;
3277 }
3278
3279 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3280 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3281 // fp16 conversions not covered by any table entries require a libcall.
3282 // Return a large (arbitrary) number to model this.
3283 return InstructionCost(64);
3284 }
3285 }
3286
3287 // Fall back to legalized types.
3288 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3289 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3290
3291 // If we're truncating to the same legalized type - just assume it's free.
3292 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3293 return TTI::TCC_Free;
3294
3295 if (ST->useAVX512Regs()) {
3296 if (ST->hasBWI())
3297 if (const auto *Entry = ConvertCostTableLookup(
3298 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3299 if (auto KindCost = Entry->Cost[CostKind])
3300 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3301
3302 if (ST->hasDQI())
3303 if (const auto *Entry = ConvertCostTableLookup(
3304 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3305 if (auto KindCost = Entry->Cost[CostKind])
3306 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3307
3308 if (ST->hasAVX512())
3309 if (const auto *Entry = ConvertCostTableLookup(
3310 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3311 if (auto KindCost = Entry->Cost[CostKind])
3312 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3313 }
3314
3315 if (ST->hasBWI())
3316 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3317 LTDest.second, LTSrc.second))
3318 if (auto KindCost = Entry->Cost[CostKind])
3319 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3320
3321 if (ST->hasDQI())
3322 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3323 LTDest.second, LTSrc.second))
3324 if (auto KindCost = Entry->Cost[CostKind])
3325 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3326
3327 if (ST->hasAVX512())
3328 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3329 LTDest.second, LTSrc.second))
3330 if (auto KindCost = Entry->Cost[CostKind])
3331 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3332
3333 if (ST->hasAVX2())
3334 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3335 LTDest.second, LTSrc.second))
3336 if (auto KindCost = Entry->Cost[CostKind])
3337 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3338
3339 if (ST->hasAVX())
3340 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3341 LTDest.second, LTSrc.second))
3342 if (auto KindCost = Entry->Cost[CostKind])
3343 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3344
3345 if (ST->hasF16C()) {
3346 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3347 LTDest.second, LTSrc.second))
3348 if (auto KindCost = Entry->Cost[CostKind])
3349 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3350 }
3351
3352 if (ST->hasSSE41())
3353 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3354 LTDest.second, LTSrc.second))
3355 if (auto KindCost = Entry->Cost[CostKind])
3356 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3357
3358 if (ST->hasSSE2())
3359 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3360 LTDest.second, LTSrc.second))
3361 if (auto KindCost = Entry->Cost[CostKind])
3362 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3363
3364 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3365 // sitofp.
3366 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3367 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3368 Type *ExtSrc = Src->getWithNewBitWidth(32);
3369 unsigned ExtOpc =
3370 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3371
3372 // For scalar loads the extend would be free.
3373 InstructionCost ExtCost = 0;
3374 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3375 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3376
3377 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3379 }
3380
3381 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3382 // i32.
3383 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3384 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3385 Type *TruncDst = Dst->getWithNewBitWidth(32);
3386 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3387 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3389 }
3390
3391 // TODO: Allow non-throughput costs that aren't binary.
3392 auto AdjustCost = [&CostKind](InstructionCost Cost,
3395 return Cost == 0 ? 0 : N;
3396 return Cost * N;
3397 };
3398 return AdjustCost(
3399 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3400}
3401
3403 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3405 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3406 // Early out if this type isn't scalar/vector integer/float.
3407 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3408 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3409 Op1Info, Op2Info, I);
3410
3411 // Legalize the type.
3412 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3413
3414 MVT MTy = LT.second;
3415
3416 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3417 assert(ISD && "Invalid opcode");
3418
3419 InstructionCost ExtraCost = 0;
3420 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3421 // Some vector comparison predicates cost extra instructions.
3422 // TODO: Adjust ExtraCost based on CostKind?
3423 // TODO: Should we invert this and assume worst case cmp costs
3424 // and reduce for particular predicates?
3425 if (MTy.isVector() &&
3426 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3427 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3428 ST->hasBWI())) {
3429 // Fallback to I if a specific predicate wasn't specified.
3430 CmpInst::Predicate Pred = VecPred;
3431 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3433 Pred = cast<CmpInst>(I)->getPredicate();
3434
3435 bool CmpWithConstant = false;
3436 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3437 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3438
3439 switch (Pred) {
3441 // xor(cmpeq(x,y),-1)
3442 ExtraCost = CmpWithConstant ? 0 : 1;
3443 break;
3446 // xor(cmpgt(x,y),-1)
3447 ExtraCost = CmpWithConstant ? 0 : 1;
3448 break;
3451 // cmpgt(xor(x,signbit),xor(y,signbit))
3452 // xor(cmpeq(pmaxu(x,y),x),-1)
3453 ExtraCost = CmpWithConstant ? 1 : 2;
3454 break;
3457 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3458 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3459 // cmpeq(psubus(x,y),0)
3460 // cmpeq(pminu(x,y),x)
3461 ExtraCost = 1;
3462 } else {
3463 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3464 ExtraCost = CmpWithConstant ? 2 : 3;
3465 }
3466 break;
3469 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3470 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3471 if (CondTy && !ST->hasAVX())
3472 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3474 Op1Info, Op2Info) +
3475 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3477 Op1Info, Op2Info) +
3478 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3479
3480 break;
3483 // Assume worst case scenario and add the maximum extra cost.
3484 ExtraCost = 3;
3485 break;
3486 default:
3487 break;
3488 }
3489 }
3490 }
3491
3492 static const CostKindTblEntry SLMCostTbl[] = {
3493 // slm pcmpeq/pcmpgt throughput is 2
3494 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3495 // slm pblendvb/blendvpd/blendvps throughput is 4
3496 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3497 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3498 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3499 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3500 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3501 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3502 };
3503
3504 static const CostKindTblEntry AVX512BWCostTbl[] = {
3505 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3506 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3507 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3508 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3509
3510 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3511 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3512 };
3513
3514 static const CostKindTblEntry AVX512CostTbl[] = {
3515 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3516 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3517 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3518 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3519
3520 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3521 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3522 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3523 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3524 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3525 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3526 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3527
3528 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3529 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3530 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3531 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3532 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3533 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3534 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3535 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3536 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3537 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3538 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3539 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3540 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3541 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3542
3543 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3544 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3545 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3546 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3547 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3548 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3549 };
3550
3551 static const CostKindTblEntry AVX2CostTbl[] = {
3552 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3553 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3554 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3555 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3556 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3557 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3558
3559 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3560 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3561 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3562 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3563
3564 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3565 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3566 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3567 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3568 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3569 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3570 };
3571
3572 static const CostKindTblEntry XOPCostTbl[] = {
3573 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3574 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3575 };
3576
3577 static const CostKindTblEntry AVX1CostTbl[] = {
3578 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3579 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3580 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3581 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3582 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3583 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3584
3585 // AVX1 does not support 8-wide integer compare.
3586 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3587 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3588 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3589 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3590
3591 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3592 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3593 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3594 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3595 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3596 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3597 };
3598
3599 static const CostKindTblEntry SSE42CostTbl[] = {
3600 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3601 };
3602
3603 static const CostKindTblEntry SSE41CostTbl[] = {
3604 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3605 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3606
3607 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3608 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3609 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3610 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3611 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3612 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3613 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3614 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3615 };
3616
3617 static const CostKindTblEntry SSE2CostTbl[] = {
3618 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3619 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3620
3621 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3622 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3623 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3624 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3625
3626 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3627 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3628 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3629 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3630 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3631 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3632 };
3633
3634 static const CostKindTblEntry SSE1CostTbl[] = {
3635 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3636 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3637
3638 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3639 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3640 };
3641
3642 if (ST->useSLMArithCosts())
3643 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3644 if (auto KindCost = Entry->Cost[CostKind])
3645 return LT.first * (ExtraCost + *KindCost);
3646
3647 if (ST->hasBWI())
3648 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3649 if (auto KindCost = Entry->Cost[CostKind])
3650 return LT.first * (ExtraCost + *KindCost);
3651
3652 if (ST->hasAVX512())
3653 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3654 if (auto KindCost = Entry->Cost[CostKind])
3655 return LT.first * (ExtraCost + *KindCost);
3656
3657 if (ST->hasAVX2())
3658 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3659 if (auto KindCost = Entry->Cost[CostKind])
3660 return LT.first * (ExtraCost + *KindCost);
3661
3662 if (ST->hasXOP())
3663 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3664 if (auto KindCost = Entry->Cost[CostKind])
3665 return LT.first * (ExtraCost + *KindCost);
3666
3667 if (ST->hasAVX())
3668 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3669 if (auto KindCost = Entry->Cost[CostKind])
3670 return LT.first * (ExtraCost + *KindCost);
3671
3672 if (ST->hasSSE42())
3673 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3674 if (auto KindCost = Entry->Cost[CostKind])
3675 return LT.first * (ExtraCost + *KindCost);
3676
3677 if (ST->hasSSE41())
3678 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3679 if (auto KindCost = Entry->Cost[CostKind])
3680 return LT.first * (ExtraCost + *KindCost);
3681
3682 if (ST->hasSSE2())
3683 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3684 if (auto KindCost = Entry->Cost[CostKind])
3685 return LT.first * (ExtraCost + *KindCost);
3686
3687 if (ST->hasSSE1())
3688 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3689 if (auto KindCost = Entry->Cost[CostKind])
3690 return LT.first * (ExtraCost + *KindCost);
3691
3692 // Assume a 3cy latency for fp select ops.
3693 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3694 if (ValTy->getScalarType()->isFloatingPointTy())
3695 return 3;
3696
3697 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3698 Op1Info, Op2Info, I);
3699}
3700
3702
3706 // Costs should match the codegen from:
3707 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3708 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3709 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3710 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3711 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3712
3713 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3714 // specialized in these tables yet.
3715 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3716 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3717 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3718 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3719 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3720 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3721 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3722 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3723 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3724 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3725 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3726 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3727 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3728 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3729 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3730 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3731 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3732 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3733 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3734 };
3735 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3736 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3737 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3738 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3739 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3740 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3741 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3742 };
3743 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3744 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3745 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3746 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3747 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3748 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3749 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3750 };
3751 static const CostKindTblEntry AVX512CDCostTbl[] = {
3752 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3753 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3754 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3755 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3756 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3757 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3758 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3759 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3760 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3761 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3762 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3763 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3764
3765 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3766 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3767 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3768 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3769 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3770 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3771 };
3772 static const CostKindTblEntry AVX512BWCostTbl[] = {
3773 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3774 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3775 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3776 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3777 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3778 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3779 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3780 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3781 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3782 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3783 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3784 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3785 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3786 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3787 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3788 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3789 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3790 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3791 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3792 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3793 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3794 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3795 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3796 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3797 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3798 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3799 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3800 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3801 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3802 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3803 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3804 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3805 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3806 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3807 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3808 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3809 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3810 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3811 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3812 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3813 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3814 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3815 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3816 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3817 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3818 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3819 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3820 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3821 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3822 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3823 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3824 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3825 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3826 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3827 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3828 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3829 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3830 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3831 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3832 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3833 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3834 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3835 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3836 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3837 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3838 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3839 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3840 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3841 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3842 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3843 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3844 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3845 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3846 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3847 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3848 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3849 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3850 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3851 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3852 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3853 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3854 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3855 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3856 };
3857 static const CostKindTblEntry AVX512CostTbl[] = {
3858 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3859 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3860 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3861 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3862 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3863 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3864 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3865 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3866 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3867 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3868 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3869 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3870 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3871 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3872 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3873 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3874 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3875 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3876 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3877 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3878 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3879 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3880 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3881 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3882 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3883 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3884 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3885 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3886 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3887 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3888 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3889 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3890 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3891 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3892 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3893 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3894 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3895 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3896 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3897 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3898 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3899 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3900 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3901 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3902 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3903 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3904 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3905 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3906 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3907 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3908 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3909 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3910 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3911 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3912 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3913 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3914 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3915 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3916 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3917 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3918 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3919 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3920 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3921 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3922 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3923 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3924 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3925 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3926 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3927 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3928 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3929 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3930 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3931 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3932 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3933 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3934 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3935 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3936 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3937 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3938 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3939 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3940 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3941 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3942 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3943 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3944 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3945 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3946 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3947 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3948 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3949 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3950 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3951 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3952 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3953 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3954 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3955 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3956 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3957 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3958 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3959 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3960 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3961 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3962 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3963 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3964 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3965 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3966 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3967 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3968 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3969 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3970 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3971 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3972 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3973 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3974 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3975 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3976 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3977 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3978 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3979 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3980 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3981 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3982 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3983 };
// Intrinsic cost entries for XOP, keyed by (opcode, MVT). Covers BITREVERSE
// (128/256-bit integer vectors and scalar i8..i64) and ROTL / ROTR /
// X86ISD::VROTLI on 128/256-bit integer vectors.
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
// The 256-bit rows are uniformly higher than the matching 128-bit rows.
3984 static const CostKindTblEntry XOPCostTbl[] = {
3985 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3986 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3987 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3988 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3989 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3990 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3991 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3992 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3993 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3994 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3995 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3996 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3997 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
// The extra SUB in the ROTR expansion is why the ROTR rows below carry larger
// third/fourth values than the ROTL rows.
3998 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3999 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
4000 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
4001 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
4002 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
4003 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
4004 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
4005 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
4006 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
4007 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
4008 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
4009 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
4010 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
4011 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
4012 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
4013 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
// VROTLI rows repeat the ROTL values exactly.
4014 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
4015 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
4016 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
4017 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
4018 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
4019 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
4020 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
4021 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
4022 };
// Intrinsic cost entries for AVX2, keyed by (opcode, MVT). Covers ABS,
// BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, saturating add/sub, signed/unsigned
// min/max and multiply-with-overflow on 128/256-bit integer vectors, plus
// FMAXNUM and FSQRT on f32/f64 scalars and 128/256-bit FP vectors.
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
4023 static const CostKindTblEntry AVX2CostTbl[] = {
4024 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4025 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4026 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
4027 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
4028 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
4029 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
4030 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
4031 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
4032 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
4033 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
4034 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
4035 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
4036 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
4037 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
4038 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
4039 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
// BSWAP has no byte-vector rows (v16i8 / v32i8).
4040 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
4041 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
4042 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
4043 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
4044 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
4045 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
4046 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
4047 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
4048 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
4049 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
4050 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
4051 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
4052 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
4053 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
4054 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
4055 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
4056 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
4057 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
4058 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
4059 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
4060 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
4061 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4062 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4063 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4064 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4065 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4066 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4067 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4068 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4069 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4070 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4071 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4072 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4073 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4074 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4075 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4076 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4077 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4078 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4079 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4080 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4081 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4082 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4083 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4084 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4085 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4086 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4087 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4088 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4089 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4090 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4091 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4092 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4093 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4094 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4095 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4096 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4097 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4098 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4099 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4100 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4101 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4102 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4103 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4104 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4105 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4106 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4107 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4108 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4109 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4110 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4111 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4112 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4113 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4114 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4115 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4116 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4117 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4118 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4119 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4120 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4121 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4122 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4123 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4124 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4125 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4126 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4127 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
// FMAXNUM is costed as the three-instruction sequence named on each row.
4128 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4129 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4130 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4131 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4132 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4133 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4134 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4135 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4136 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4137 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4138 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4139 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4140 };
// Intrinsic cost entries for AVX1, keyed by (opcode, MVT). Covers ABS,
// BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, saturating add/sub, signed/unsigned
// min/max and multiply-with-overflow on integer vectors, plus FMAXNUM and
// FSQRT on f32/f64 scalars and 128/256-bit FP vectors.
// Most 256-bit integer rows are costed as "2 x 128-bit Op + extract/insert",
// as the per-row comments state.
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
// Keep every (opcode, MVT) key unique: CostTableLookup
// (llvm/CodeGen/CostTable.h) returns the first matching row, so a repeated
// key is dead data that silently disagrees with the row actually used.
4141 static const CostKindTblEntry AVX1CostTbl[] = {
4142 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4143 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4144 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4145 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4146 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4148 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4150 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4152 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4154 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4155 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4156 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4157 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4158 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4159 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4160 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4161 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4162 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4163 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4164 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4165 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4166 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4167 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4168 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4169 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4170 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4172 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4173 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4174 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4176 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4177 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4178 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4180 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4181 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4182 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4183 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4184 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4185 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4186 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4187 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4188 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4189 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
// NOTE(review): the SMAX v2i64 row ends in 4 while the SMIN v2i64 row ends in
// 3; every other SMAX/SMIN pair in this table is identical - confirm which
// value is intended.
4190 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4191 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4192 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4193 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4194 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4195 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4196 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4197 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4198 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4199 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4200 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4201 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4202 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4203 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4204 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4205 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4206 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4207 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4208 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4209 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4210 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4211 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4212 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4213 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4214 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4215 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4216 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4217 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4218 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4219 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4220 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4221 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4222 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4223 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4224 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4225 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4226 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4227 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4228 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4229 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4230 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4231 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4232 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4233 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4234 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4235 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4236 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4237 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4239 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4240 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
// FMAXNUM is costed as the three-instruction sequence named on each row.
4241 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4242 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4243 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4244 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4245 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4246 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4247 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4248 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4249 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4250 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4251 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4252 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4253 };
// Intrinsic cost entries for GFNI, keyed by (opcode, MVT). Every row is
// annotated as a gf2p8affineqb lowering: BITREVERSE on scalar i8..i64 and on
// 128/256/512-bit integer vectors, plus X86ISD::VROTLI on byte vectors only.
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
4254 static const CostKindTblEntry GFNICostTbl[] = {
4255 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4256 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4257 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4258 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4259 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4260 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4261 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
// Wider-element BITREVERSE rows cost more than the byte-vector rows above
// (presumably for an additional byte swap - TODO confirm).
4262 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4263 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4264 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4265 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4266 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4267 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4268 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4269 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4270 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4271 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4272 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4273 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4274 };
// FSQRT cost overrides for "GLM" (presumably Goldmont - the selecting
// subtarget check is outside this chunk). Scalar and 128-bit rows only; the
// packed rows are roughly twice the matching scalar rows.
4275 static const CostKindTblEntry GLMCostTbl[] = {
4276 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4277 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4278 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4279 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4280 };
// Cost overrides for "SLM" (presumably Silvermont - the selecting subtarget
// check is outside this chunk): 128-bit vector BSWAP, and FSQRT on scalar
// and 128-bit FP types.
4281 static const CostKindTblEntry SLMCostTbl[] = {
4282 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4283 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4284 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4285 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4286 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4287 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4288 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4289 };
// Intrinsic cost entries for SSE4.2, keyed by (opcode, MVT): FMAXNUM on
// f32/f64 scalars and 128-bit FP vectors, and single-precision FSQRT.
// There are no f64 / v2f64 FSQRT rows in this table.
4290 static const CostKindTblEntry SSE42CostTbl[] = {
4291 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4292 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4293 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4294 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4295 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4296 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4297 };
// Intrinsic cost entries for SSE4.1, keyed by (opcode, MVT), 128-bit integer
// vectors only: ABS (v2i64), saturating add/sub (v2i64, v4i32), min/max and
// multiply-with-overflow.
// Signed min/max lists v2i64/v4i32/v16i8 while unsigned min/max lists
// v2i64/v4i32/v8i16; the element widths not listed here are not costed by
// this table.
4298 static const CostKindTblEntry SSE41CostTbl[] = {
4299 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4300 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4301 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4302 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4303 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4304 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4305 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4306 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4307 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4308 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4309 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4310 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4311 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4312 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4313 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4314 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4315 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4316 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4317 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4318 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4319 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4320 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4321 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4322 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4323 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4324 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4325 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4326 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4327 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4328 };
// Intrinsic cost entries for SSSE3, keyed by (opcode, MVT), 128-bit integer
// vectors only: ABS (no v2i64 row), BITREVERSE, BSWAP (no v16i8 row), CTLZ,
// CTPOP and CTTZ.
4329 static const CostKindTblEntry SSSE3CostTbl[] = {
4330 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4331 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4332 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4333 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4334 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4335 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4336 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4337 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4338 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4339 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4340 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4341 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4342 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4343 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4344 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4345 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4346 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4347 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4348 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4349 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4350 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4351 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4352 };
// Intrinsic cost entries for SSE2, keyed by (opcode, MVT). Covers 128-bit
// integer vectors (ABS, BITREVERSE, BSWAP, CTLZ, CTPOP, CTTZ, saturating
// add/sub, min/max, multiply-with-overflow) and double-precision FMAXNUM /
// FSQRT (f64, v2f64).
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
4353 static const CostKindTblEntry SSE2CostTbl[] = {
4354 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4355 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4356 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4357 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4358 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4359 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4360 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4361 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4362 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4363 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4364 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4365 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4366 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4367 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4368 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4369 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4370 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4371 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4372 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4373 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4374 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4375 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4376 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
// For the saturating ops, the v8i16 / v16i8 rows are the cheapest.
4377 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4378 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4379 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4380 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
// Signed min/max is cheapest for v8i16, unsigned min/max for v16i8.
4381 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4382 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4383 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4384 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4385 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4386 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4387 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4388 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4389 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4390 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4391 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4392 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4393 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4394 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4395 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4396 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4397 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4398 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4399 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4400 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4401 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4402 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4403 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4404 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4405 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4406 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4407 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4408 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4409 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4410 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4411 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4412 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4413 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4414 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4415 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4416 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4417 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4418 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4419 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4420 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4421 };
// Intrinsic cost entries for SSE1, keyed by (opcode, MVT): single-precision
// FMAXNUM and FSQRT only (f32, v4f32). The packed FSQRT row is twice the
// scalar row's first value.
4422 static const CostKindTblEntry SSE1CostTbl[] = {
4423 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4424 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4425 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4426 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4427 };
// i64 CTTZ cost, 64-bit targets only. The table name suggests it applies
// when BMI is available; the selecting check is outside this chunk.
4428 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4429 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4430 };
// CTTZ costs for i32 and narrower scalars, valid on 32- and 64-bit targets.
// The table name suggests it applies when BMI is available; the selecting
// check is outside this chunk. The i16/i8 rows carry a higher first value
// than i32 (presumably for widening the operand - TODO confirm).
4431 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4432 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4433 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4434 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4435 };
// i64 CTLZ cost, 64-bit targets only. The table name suggests it applies
// when LZCNT is available; the selecting check is outside this chunk.
4436 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4437 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4438 };
// CTLZ costs for i32 and narrower scalars, valid on 32- and 64-bit targets.
// The table name suggests it applies when LZCNT is available; the selecting
// check is outside this chunk. The i16/i8 rows carry a higher first value
// than i32 (presumably for widening the operand - TODO confirm).
4439 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4440 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4441 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4442 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4443 };
// i64 CTPOP cost when lowered to a single popcnt, 64-bit targets only.
// The selecting subtarget check is outside this chunk.
4444 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4445 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4446 };
// CTPOP costs for i32 and narrower scalars, valid on 32- and 64-bit targets.
// i32 is a single popcnt; i16/i8 are costed as popcnt of a zero-extended
// operand, which raises their third and fourth values.
4447 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4448 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4449 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4450 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4451 };
// Baseline scalar intrinsic costs for 64-bit targets, keyed by (opcode, MVT).
// Mostly i64 rows (ABS, BITREVERSE, BSWAP, CTLZ/CTTZ and their _ZERO_POISON
// forms, CTPOP, rotates, FSHL, saturating add/sub, min/max, add/mul with
// overflow). CTLZ and CTTZ additionally list i32/i16/i8 rows here.
// Each row carries four cost values; their order is fixed by
// CostKindTblEntry, which is declared outside this chunk (presumably
// reciprocal throughput, latency, code size, size+latency - TODO confirm).
4452 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4453 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4454 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4455 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4456 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4457 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4458 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
// NOTE(review): the i8 row's fourth value (3) is lower than its third (4),
// unlike the neighbouring CTLZ rows - confirm this is intended.
4459 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4460 { ISD::CTLZ_ZERO_POISON,MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4461 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4462 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4463 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4464 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4465 { ISD::CTTZ_ZERO_POISON,MVT::i64,{ 1, 2, 1, 2 } }, // BSF
// NOTE(review): the second value (6) is lower than the first (10) on this
// row only - confirm against the measurement source.
4466 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4467 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4468 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4469 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4470 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4471 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4472 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4473 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4474 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4475 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4476 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4477 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4478 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4479 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4480 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4481 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4482 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4483 };
4484 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4485 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4486 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4487 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4488 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4489 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4490 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4491 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4492 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4493 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4494 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4495 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4496 { ISD::CTLZ_ZERO_POISON,MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4497 { ISD::CTLZ_ZERO_POISON,MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4498 { ISD::CTLZ_ZERO_POISON,MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4499 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4500 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4501 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4502 { ISD::CTTZ_ZERO_POISON,MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4503 { ISD::CTTZ_ZERO_POISON,MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4504 { ISD::CTTZ_ZERO_POISON,MVT::i8, { 2, 2, 1, 2 } }, // BSF
4505 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4506 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4507 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4508 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4509 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4510 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4511 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4512 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4513 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4514 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4515 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4516 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4517 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4518 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4519 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4520 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4521 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4522 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4523 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4524 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4525 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4526 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4527 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4528 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4529 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4530 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4531 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4532 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4533 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4534 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4535 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4536 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4537 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4538 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4539 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4540 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4541 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4542 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4543 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4544 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4545 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4546 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4547 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4548 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4549 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4550 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4551 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4552 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4553 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4554 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4555 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4556 };
4557
4558 Type *RetTy = ICA.getReturnType();
4559 Type *OpTy = RetTy;
4560 Intrinsic::ID IID = ICA.getID();
4561 unsigned ISD = ISD::DELETED_NODE;
4562 switch (IID) {
4563 default:
4564 break;
4565 case Intrinsic::abs:
4566 ISD = ISD::ABS;
4567 break;
4568 case Intrinsic::bitreverse:
4570 break;
4571 case Intrinsic::bswap:
4572 ISD = ISD::BSWAP;
4573 break;
4574 case Intrinsic::ctlz:
4575 ISD = ISD::CTLZ;
4576 break;
4577 case Intrinsic::ctpop:
4578 ISD = ISD::CTPOP;
4579 break;
4580 case Intrinsic::cttz:
4581 ISD = ISD::CTTZ;
4582 break;
4583 case Intrinsic::fshl:
4584 ISD = ISD::FSHL;
4585 if (!ICA.isTypeBasedOnly()) {
4586 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4587 if (Args[0] == Args[1]) {
4588 ISD = ISD::ROTL;
4589 // Handle uniform constant rotation amounts.
4590 // TODO: Handle funnel-shift cases.
4591 const APInt *Amt;
4592 if (Args[2] &&
4594 ISD = X86ISD::VROTLI;
4595 }
4596 }
4597 break;
4598 case Intrinsic::fshr:
4599 // FSHR has same costs so don't duplicate.
4600 ISD = ISD::FSHL;
4601 if (!ICA.isTypeBasedOnly()) {
4602 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4603 if (Args[0] == Args[1]) {
4604 ISD = ISD::ROTR;
4605 // Handle uniform constant rotation amount.
4606 // TODO: Handle funnel-shift cases.
4607 const APInt *Amt;
4608 if (Args[2] &&
4610 ISD = X86ISD::VROTLI;
4611 }
4612 }
4613 break;
4614 case Intrinsic::lrint:
4615 case Intrinsic::llrint: {
4616 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4617 // have the same costs as the CVTTP2SI (fptosi) instructions
4618 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4619 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4621 }
4622 case Intrinsic::maxnum:
4623 case Intrinsic::minnum:
4624 // FMINNUM has same costs so don't duplicate.
4625 ISD = ISD::FMAXNUM;
4626 break;
4627 case Intrinsic::sadd_sat:
4628 ISD = ISD::SADDSAT;
4629 break;
4630 case Intrinsic::smax:
4631 ISD = ISD::SMAX;
4632 break;
4633 case Intrinsic::smin:
4634 ISD = ISD::SMIN;
4635 break;
4636 case Intrinsic::ssub_sat:
4637 ISD = ISD::SSUBSAT;
4638 break;
4639 case Intrinsic::uadd_sat:
4640 ISD = ISD::UADDSAT;
4641 break;
4642 case Intrinsic::umax:
4643 ISD = ISD::UMAX;
4644 break;
4645 case Intrinsic::umin:
4646 ISD = ISD::UMIN;
4647 break;
4648 case Intrinsic::usub_sat:
4649 ISD = ISD::USUBSAT;
4650 break;
4651 case Intrinsic::sqrt:
4652 ISD = ISD::FSQRT;
4653 break;
4654 case Intrinsic::sadd_with_overflow:
4655 case Intrinsic::ssub_with_overflow:
4656 // SSUBO has same costs so don't duplicate.
4657 ISD = ISD::SADDO;
4658 OpTy = RetTy->getContainedType(0);
4659 break;
4660 case Intrinsic::uadd_with_overflow:
4661 case Intrinsic::usub_with_overflow:
4662 // USUBO has same costs so don't duplicate.
4663 ISD = ISD::UADDO;
4664 OpTy = RetTy->getContainedType(0);
4665 break;
4666 case Intrinsic::smul_with_overflow:
4667 ISD = ISD::SMULO;
4668 OpTy = RetTy->getContainedType(0);
4669 break;
4670 case Intrinsic::umul_with_overflow:
4671 ISD = ISD::UMULO;
4672 OpTy = RetTy->getContainedType(0);
4673 break;
4674 }
4675
4676 if (ISD != ISD::DELETED_NODE) {
4677 auto adjustTableCost = [&](int ISD, unsigned Cost,
4678 std::pair<InstructionCost, MVT> LT,
4680 InstructionCost LegalizationCost = LT.first;
4681 MVT MTy = LT.second;
4682
4683 // If there are no NANs to deal with, then these are reduced to a
4684 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4685 // assume is used in the non-fast case.
4686 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4687 if (FMF.noNaNs())
4688 return LegalizationCost * 1;
4689 }
4690
4691 // For cases where some ops can be folded into a load/store, assume free.
4692 if (MTy.isScalarInteger()) {
4693 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4694 if (const Instruction *II = ICA.getInst()) {
4695 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4696 return TTI::TCC_Free;
4697 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4698 if (LI->hasOneUse())
4699 return TTI::TCC_Free;
4700 }
4701 }
4702 }
4703 }
4704
4705 return LegalizationCost * (int)Cost;
4706 };
4707
4708 // Legalize the type.
4709 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4710 MVT MTy = LT.second;
4711
4712 // Without BMI/LZCNT see if we're only looking for a *_ZERO_POISON cost.
4713 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4714 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4715 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4716 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4717 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4718 if (Cst->isAllOnesValue())
4719 ISD =
4721 }
4722
4723 // FSQRT is a single instruction.
4725 return LT.first;
4726
4727 if (ST->useGLMDivSqrtCosts())
4728 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (ST->useSLMArithCosts())
4733 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (ST->hasVBMI2())
4738 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (ST->hasBITALG())
4743 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4744 if (auto KindCost = Entry->Cost[CostKind])
4745 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4746
4747 if (ST->hasVPOPCNTDQ())
4748 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4749 if (auto KindCost = Entry->Cost[CostKind])
4750 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4751
4752 if (ST->hasGFNI())
4753 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4754 if (auto KindCost = Entry->Cost[CostKind])
4755 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4756
4757 if (ST->hasCDI())
4758 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4759 if (auto KindCost = Entry->Cost[CostKind])
4760 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4761
4762 if (ST->hasBWI())
4763 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4764 if (auto KindCost = Entry->Cost[CostKind])
4765 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4766
4767 if (ST->hasAVX512())
4768 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4769 if (auto KindCost = Entry->Cost[CostKind])
4770 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4771
4772 if (ST->hasXOP())
4773 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4774 if (auto KindCost = Entry->Cost[CostKind])
4775 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4776
4777 if (ST->hasAVX2())
4778 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4779 if (auto KindCost = Entry->Cost[CostKind])
4780 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4781
4782 if (ST->hasAVX())
4783 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4784 if (auto KindCost = Entry->Cost[CostKind])
4785 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4786
4787 if (ST->hasSSE42())
4788 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4789 if (auto KindCost = Entry->Cost[CostKind])
4790 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4791
4792 if (ST->hasSSE41())
4793 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4794 if (auto KindCost = Entry->Cost[CostKind])
4795 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4796
4797 if (ST->hasSSSE3())
4798 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4799 if (auto KindCost = Entry->Cost[CostKind])
4800 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4801
4802 if (ST->hasSSE2())
4803 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4804 if (auto KindCost = Entry->Cost[CostKind])
4805 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4806
4807 if (ST->hasSSE1())
4808 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4809 if (auto KindCost = Entry->Cost[CostKind])
4810 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4811
4812 if (ST->hasBMI()) {
4813 if (ST->is64Bit())
4814 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4815 if (auto KindCost = Entry->Cost[CostKind])
4816 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4817
4818 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4819 if (auto KindCost = Entry->Cost[CostKind])
4820 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4821 }
4822
4823 if (ST->hasLZCNT()) {
4824 if (ST->is64Bit())
4825 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4826 if (auto KindCost = Entry->Cost[CostKind])
4827 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4828
4829 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4830 if (auto KindCost = Entry->Cost[CostKind])
4831 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4832 }
4833
4834 if (ST->hasPOPCNT()) {
4835 if (ST->is64Bit())
4836 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4837 if (auto KindCost = Entry->Cost[CostKind])
4838 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4839
4840 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4841 if (auto KindCost = Entry->Cost[CostKind])
4842 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4843 }
4844
4845 if (ST->is64Bit())
4846 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4847 if (auto KindCost = Entry->Cost[CostKind])
4848 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4849
4850 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4851 if (auto KindCost = Entry->Cost[CostKind])
4852 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4853
4854 // Without arg data, we need to compute the expanded costs of custom lowered
4855 // intrinsics to prevent use of the (very low) default costs.
4856 if (ICA.isTypeBasedOnly() &&
4857 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4858 Type *CondTy = RetTy->getWithNewBitWidth(1);
4860 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4861 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4862 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4863 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4864 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4865 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4867 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4869 return Cost;
4870 }
4871 }
4872
4874}
4875
// X86 cost of a single vector element access. ExtractElement and
// InsertElement get the X86-specific treatment below; any other opcode is
// forwarded to BaseT::getVectorInstrCost at the end of the function.
//
// Opcode   - instruction opcode being costed.
// Val      - the vector type being accessed (asserted to be a vector type).
// CostKind - cost kind forwarded to the memory-op / shuffle cost queries.
// Index    - element index; -1U selects the "non-immediate index" path.
// Op0/Op1  - optional operand values (may be null); only inspected by the
//            index-0 heuristics.
// VIC      - forwarded unchanged to BaseT::getVectorInstrCost.
//
// Every return from the immediate-index path includes RegisterFileMoveCost,
// the extra cost of reaching an upper 128-bit subvector of a wider register.
//
// NOTE(review): the embedded listing numbers skip 4876, 4889, 4977 and 4979,
// so the line carrying the return type / function name and a few statements
// appear to be missing from this listing - confirm against the upstream file.
4877 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4878 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Per-scalar-type element extraction costs, consulted only when
// ST->useSLMArithCosts() is true (see the lookup further down).
4879 static const CostTblEntry SLMCostTbl[] = {
4880 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4881 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4882 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4883 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4884 };
4885
4886 assert(Val->isVectorTy() && "This must be a vector type");
4887 auto *VT = cast<VectorType>(Val);
4888 if (VT->isScalableTy())
// NOTE(review): the statement guarded by the scalable-vector check above is
// not visible here (listing number 4889 is skipped).
4890
4891 Type *ScalarType = Val->getScalarType();
4892 InstructionCost RegisterFileMoveCost = 0;
4893
4894 // Non-immediate extraction/insertion can be handled as a sequence of
4895 // aliased loads+stores via the stack.
4896 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4897 Opcode == Instruction::InsertElement)) {
4898 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4899 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4900
4901 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4902 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4903 Align VecAlign = DL.getPrefTypeAlign(Val);
4904 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4905
4906 // Extract - store vector to stack, load scalar.
4907 if (Opcode == Instruction::ExtractElement) {
4908 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4909 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4910 CostKind);
4911 }
4912 // Insert - store vector to stack, store scalar, load vector.
4913 if (Opcode == Instruction::InsertElement) {
4914 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4915 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4916 CostKind) +
4917 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4918 }
4919 }
4920
// Immediate (known) index path.
4921 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4922 Opcode == Instruction::InsertElement)) {
4923 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4924 if (Opcode == Instruction::ExtractElement &&
4925 ScalarType->getScalarSizeInBits() == 1 &&
4926 cast<FixedVectorType>(Val)->getNumElements() > 1)
4927 return 1;
4928
4929 // Legalize the type.
4930 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4931
4932 // This type is legalized to a scalar type.
4933 if (!LT.second.isVector())
4934 return TTI::TCC_Free;
4935
4936 // The type may be split. Normalize the index to the new type.
4937 unsigned SizeInBits = LT.second.getSizeInBits();
4938 unsigned NumElts = LT.second.getVectorNumElements();
4939 unsigned SubNumElts = NumElts;
4940 Index = Index % NumElts;
4941
4942 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4943 // For inserts, we also need to insert the subvector back.
4944 if (SizeInBits > 128) {
4945 assert((SizeInBits % 128) == 0 && "Illegal vector");
4946 unsigned NumSubVecs = SizeInBits / 128;
4947 SubNumElts = NumElts / NumSubVecs;
// Only indices beyond the lowest 128-bit subvector pay for the subvector
// move: 1 for an extract, 2 for an insert. The index is then made relative
// to its own 128-bit subvector.
4948 if (SubNumElts <= Index) {
4949 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4950 Index %= SubNumElts;
4951 }
4952 }
4953
4954 MVT MScalarTy = LT.second.getScalarType();
// Predicate over the legalized scalar type, subtarget features, the
// (normalized) Index and Opcode; captured by reference, so it reflects the
// values at the time it is called.
4955 auto IsCheapPInsrPExtrInsertPS = [&]() {
4956 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4957 // Inserting f32 into index0 is just movss.
4958 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4959 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4960 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4961 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4962 Opcode == Instruction::InsertElement) ||
4963 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4964 Opcode == Instruction::InsertElement);
4965 };
4966
4967 if (Index == 0) {
4968 // Floating point scalars are already located in index #0.
4969 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4970 // true for all.
4971 if (ScalarType->isFloatingPointTy() &&
4972 (Opcode != Instruction::InsertElement || !Op0 ||
4973 isa<UndefValue>(Op0)))
4974 return RegisterFileMoveCost;
4975
// NOTE(review): the rest of the condition below and the check preceding the
// return are not visible (listing numbers 4977 and 4979 are skipped), so the
// exact "cheap gather" criterion cannot be read from this listing.
4976 if (Opcode == Instruction::InsertElement &&
4978 // Consider the gather cost to be cheap.
4980 return RegisterFileMoveCost;
4981 if (!IsCheapPInsrPExtrInsertPS()) {
4982 // mov constant-to-GPR + movd/movq GPR -> XMM.
4983 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4984 return 2 + RegisterFileMoveCost;
4985 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4986 return 1 + RegisterFileMoveCost;
4987 }
4988 }
4989
4990 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4991 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4992 return 1 + RegisterFileMoveCost;
4993 }
4994
4995 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4996 assert(ISD && "Unexpected vector opcode");
4997 if (ST->useSLMArithCosts())
4998 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4999 return Entry->Cost + RegisterFileMoveCost;
5000
5001 // Consider cheap cases.
5002 if (IsCheapPInsrPExtrInsertPS())
5003 return 1 + RegisterFileMoveCost;
5004
5005 // For extractions we just need to shuffle the element to index 0, which
5006 // should be very cheap (assume cost = 1). For insertions we need to shuffle
5007 // the elements to its destination. In both cases we must handle the
5008 // subvector move(s).
5009 // If the vector type is already less than 128-bits then don't reduce it.
5010 // TODO: Under what circumstances should we shuffle using the full width?
5011 InstructionCost ShuffleCost = 1;
5012 if (Opcode == Instruction::InsertElement) {
// Cost the two-source permute on Val itself unless its value type has a
// different scalar type than the legalized one or is at least 128 bits
// wide; in those cases a SubNumElts-element vector is costed instead.
5013 auto *SubTy = cast<VectorType>(Val);
5014 EVT VT = TLI->getValueType(DL, Val);
5015 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
5016 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
5017 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
5018 CostKind, 0, SubTy);
5019 }
// Non-floating-point element types pay one extra unit on top of the shuffle.
5020 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
5021 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
5022 }
5023
// Opcodes other than ExtractElement/InsertElement: defer to the base
// implementation (RegisterFileMoveCost is still 0 on this path).
5024 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
5025 VIC) +
5026 RegisterFileMoveCost;
5027}
5028
// X86 cost of scalarizing a fixed vector: inserting and/or extracting the
// elements selected by DemandedElts.
//
// Ty           - vector type; cast to FixedVectorType in the assert below.
// DemandedElts - one bit per element of Ty (bit width asserted to match).
// Insert       - include the cost of inserting the demanded elements.
// Extract      - include the cost of extracting the demanded elements.
// CostKind     - cost kind forwarded to the nested cost queries.
// ForPoisonSrc - when false (and Insert is set) the insertions are costed one
//                element at a time via getVectorInstrCost and returned early.
// VL           - optional scalar values; when non-empty, VL[I] is passed along
//                for element I.
// VIC          - vector instruction context (its use is not visible in this
//                listing).
//
// NOTE(review): the embedded listing numbers skip 5029, 5040, 5063, 5065,
// 5115, 5133, 5151, 5193 and 5195. The line naming the return type/function,
// the declaration of `Cost`, and the heads/tails of several calls therefore
// appear to be missing here - confirm against the upstream file.
5030 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
5031 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
5032 TTI::VectorInstrContext VIC) const {
5033 assert(DemandedElts.getBitWidth() ==
5034 cast<FixedVectorType>(Ty)->getNumElements() &&
5035 "Vector size mismatch");
5036
5037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5038 MVT MScalarTy = LT.second.getScalarType();
5039 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
// NOTE(review): `Cost` is accumulated and returned below, but its declaration
// is not visible (listing number 5040 is skipped).
5041
5042 constexpr unsigned LaneBitWidth = 128;
5043 assert((LegalVectorBitWidth < LaneBitWidth ||
5044 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
5045 "Illegal vector");
5046
// LT.first is used as the number of legal vectors Ty is split into.
5047 const int NumLegalVectors = LT.first.getValue();
5048 assert(NumLegalVectors >= 0 && "Negative cost!");
5049
5050 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
5051 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
5052 // a special heuristic regarding poison input which is passed here in
5053 // ForPoisonSrc.
5054 if (Insert && !ForPoisonSrc) {
5055 // This is nearly identical to BaseT::getScalarizationOverhead(), except
5056 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
5057 // Constant::getNullValue()), which makes the X86TTIImpl
5058 // getVectorInstrCost() return 0 instead of 1.
5059 for (unsigned I : seq(DemandedElts.getBitWidth())) {
5060 if (!DemandedElts[I])
5061 continue;
// NOTE(review): parts of the argument list of this call are not visible
// (listing numbers 5063 and 5065 are skipped).
5062 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
5064 VL.empty() ? nullptr : VL[I],
5066 }
// Early return: any requested extraction cost is not added on this path.
5067 return Cost;
5068 }
5069
5070 if (Insert) {
// Element types with a direct insert instruction: i16 with SSE2, any
// integer with SSE4.1, f32 with SSE4.1.
5071 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5072 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5073 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5074 // For types we can insert directly, insertion into 128-bit sub vectors is
5075 // cheap, followed by a cheap chain of concatenations.
5076 if (LegalVectorBitWidth <= LaneBitWidth) {
5077 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5078 /*Extract*/ false, CostKind);
5079 } else {
5080 // In each 128-lane, if at least one index is demanded but not all
5081 // indices are demanded and this 128-lane is not the first 128-lane of
5082 // the legalized-vector, then this 128-lane needs an extracti128; if in
5083 // each 128-lane, there is at least one demanded index, this 128-lane
5084 // needs an inserti128.
5085
5086 // The following cases will help you build a better understanding:
5087 // Assume we insert several elements into a v8i32 vector in avx2,
5088 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
5089 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
5090 // inserti128.
5091 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5092 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5093 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5094 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5095 unsigned NumLegalElts =
5096 LT.second.getVectorNumElements() * NumLegalVectors;
5097 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5098 "Vector has been legalized to smaller element count");
5099 assert((NumLegalElts % NumLanesTotal) == 0 &&
5100 "Unexpected elts per lane");
5101 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5102
// Zero-extend the demanded mask to the legalized element count so it can be
// sliced into whole 128-bit lanes.
5103 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5104 auto *LaneTy =
5105 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5106
// First pass: per-lane element insertion cost.
5107 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5108 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5109 NumEltsPerLane, NumEltsPerLane * I);
5110 if (LaneEltMask.isZero())
5111 continue;
5112 // FIXME: we don't need to extract if all non-demanded elements
5113 // are legalization-inserted padding.
// NOTE(review): the head of the call guarded by this check is not visible
// (listing number 5115 is skipped); presumably it adds the cost of
// extracting the partially-demanded lane - confirm.
5114 if (!LaneEltMask.isAllOnes())
5116 CostKind, I * NumEltsPerLane, LaneTy);
5117 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5118 /*Extract*/ false, CostKind);
5119 }
5120
// Second pass: cost of putting the affected lanes back into their legal
// vectors. NOTE(review): presumably ScaleBitMask collapses each group of
// bits to one bit (any-set by default, all-set with MatchAllBits) - confirm
// against the APIntOps documentation.
5121 APInt AffectedLanes =
5122 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5123 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5124 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5125 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5126 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5127 unsigned I = NumLegalLanes * LegalVec + Lane;
5128 // No need to insert unaffected lane; or lane 0 of each legal vector
5129 // iff ALL lanes of that vector were affected and will be inserted.
5130 if (!AffectedLanes[I] ||
5131 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5132 continue;
// NOTE(review): the head of the call that ends on the next line is not
// visible (listing number 5133 is skipped).
5134 CostKind, I * NumEltsPerLane, LaneTy);
5135 }
5136 }
5137 }
5138 } else if (LT.second.isVector()) {
5139 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5140 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5141 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5142 // considered cheap.
5143 if (Ty->isIntOrIntVectorTy())
5144 Cost += DemandedElts.popcount();
5145
5146 // Get the smaller of the legalized or original pow2-extended number of
5147 // vector elements, which represents the number of unpacks we'll end up
5148 // performing.
5149 unsigned NumElts = LT.second.getVectorNumElements();
// NOTE(review): the initializer of Pow2Elts is not visible (listing number
// 5151 is skipped).
5150 unsigned Pow2Elts =
5152 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5153 }
5154 }
5155
5156 if (Extract) {
5157 // vXi1 can be efficiently extracted with MOVMSK.
5158 // TODO: AVX512 predicate mask handling.
5159 // NOTE: This doesn't work well for roundtrip scalarization.
5160 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
// One MOVMSK per MaxElts elements (32 with AVX2, otherwise 16), rounded up.
5161 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5162 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5163 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5164 return MOVMSKCost;
5165 }
5166
5167 if (LT.second.isVector()) {
5168 unsigned NumLegalElts =
5169 LT.second.getVectorNumElements() * NumLegalVectors;
5170 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5171 "Vector has been legalized to smaller element count");
5172
5173 // If we're extracting elements from a 128-bit subvector lane,
5174 // we only need to extract each lane once, not for every element.
5175 if (LegalVectorBitWidth > LaneBitWidth) {
5176 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5177 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5178 assert((NumLegalElts % NumLanesTotal) == 0 &&
5179 "Unexpected elts per lane");
5180 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5181
5182 // Add cost for each demanded 128-bit subvector extraction.
5183 // Luckily this is a lot easier than for insertion.
5184 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5185 auto *LaneTy =
5186 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5187
5188 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5189 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5190 NumEltsPerLane, I * NumEltsPerLane);
5191 if (LaneEltMask.isZero())
5192 continue;
// NOTE(review): the heads of the two calls whose argument tails follow are
// not visible (listing numbers 5193 and 5195 are skipped).
5194 I * NumEltsPerLane, LaneTy);
5196 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5197 }
5198
5199 return Cost;
5200 }
5201 }
5202
5203 // Fallback to default extraction.
5204 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5205 Extract, CostKind);
5206 }
5207
5208 return Cost;
5209}
5210
// X86 cost of a replication shuffle from VF source elements of type EltTy to
// VF * ReplicationFactor destination elements, of which only DemandedDstElts
// are needed. (Presumably each source element is repeated ReplicationFactor
// times in place - confirm against the TTI interface documentation.)
//
// Only AVX512 subtargets get a custom cost; everything else, unsupported
// element widths, and types that do not legalize to vectors are delegated to
// BaseT::getReplicationShuffleCost via `bailout`.
//
// When the element width has no usable shuffle on the subtarget, the cost is
// the widening cast + truncating cast plus a recursive call on the promoted
// element type. Otherwise it is (number of demanded legal destination
// vectors) * (cost of one single-source permute of a legal destination
// vector).
//
// NOTE(review): the embedded listing numbers skip 5211, 5214, 5282 and 5286,
// so the return-type line, the end of the parameter list and the trailing
// arguments of the two getCastInstrCost calls appear to be missing from this
// listing - confirm against the upstream file.
5212X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5213 int VF, const APInt &DemandedDstElts,
5215 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5216 // We don't differentiate element types here, only element bit width.
5217 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5218
// Defer to the generic implementation. Note that EltTy has already been
// replaced by the same-width integer type at this point.
5219 auto bailout = [&]() {
5220 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5221 DemandedDstElts, CostKind);
5222 };
5223
5224 // For now, only deal with AVX512 cases.
5225 if (!ST->hasAVX512())
5226 return bailout();
5227
5228 // Do we have a native shuffle for this element type, or should we promote?
5229 unsigned PromEltTyBits = EltTyBits;
5230 switch (EltTyBits) {
5231 case 32:
5232 case 64:
5233 break; // AVX512F.
5234 case 16:
5235 if (!ST->hasBWI())
5236 PromEltTyBits = 32; // promote to i32, AVX512F.
5237 break; // AVX512BW
5238 case 8:
5239 if (!ST->hasVBMI())
5240 PromEltTyBits = 32; // promote to i32, AVX512F.
5241 break; // AVX512VBMI
5242 case 1:
5243 // There is no support for shuffling i1 elements. We *must* promote.
5244 if (ST->hasBWI()) {
5245 if (ST->hasVBMI())
5246 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5247 else
5248 PromEltTyBits = 16; // promote to i16, AVX512BW.
5249 break;
5250 }
5251 PromEltTyBits = 32; // promote to i32, AVX512F.
5252 break;
5253 default:
5254 return bailout();
5255 }
5256 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5257
// Source/destination vector types, both at the original and at the
// (possibly identical) promoted element width.
5258 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5259 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5260
5261 int NumDstElements = VF * ReplicationFactor;
5262 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5263 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5264
5265 // Legalize the types.
5266 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5267 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5268 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5269 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5270 // They should have legalized into vector types.
5271 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5272 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5273 return bailout();
5274
5275 if (PromEltTyBits != EltTyBits) {
5276 // If we have to perform the shuffle with wider elt type than our data type,
5277 // then we will first need to anyext (we don't care about the new bits)
5278 // the source elements, and then truncate Dst elements.
// NOTE(review): the widening is costed as Instruction::SExt below although
// the comment above speaks of anyext - presumably a stand-in, confirm.
5279 InstructionCost PromotionCost;
5280 PromotionCost += getCastInstrCost(
5281 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5283 PromotionCost +=
5284 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5285 /*Src=*/PromDstVecTy,
// Recurse with the promoted element type; on that call PromEltTyBits equals
// EltTyBits, so this branch is not taken again.
5287 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5288 ReplicationFactor, VF,
5289 DemandedDstElts, CostKind);
5290 }
5291
5292 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5293 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5294 "We expect that the legalization doesn't affect the element width, "
5295 "doesn't coalesce/split elements.");
5296
// Number of legal-width vectors needed to hold all destination elements.
5297 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5298 unsigned NumDstVectors =
5299 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5300
5301 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5302
5303 // Not all the produced Dst elements may be demanded. In our case,
5304 // given that a single Dst vector is formed by a single shuffle,
5305 // if all elements that will form a single Dst vector aren't demanded,
5306 // then we won't need to do that shuffle, so adjust the cost accordingly.
5307 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5308 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5309 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5310
5311 InstructionCost SingleShuffleCost =
5312 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5313 /*Mask=*/{}, CostKind,
5314 /*Index=*/0, /*SubTp=*/nullptr);
5315 return NumDstVectorsDemanded * SingleShuffleCost;
5316}
5317
// Cost hook for a plain load/store; the recursive call and the BaseT fallbacks
// below name it getMemoryOpCost.
// NOTE(review): this listing is missing the opening signature line and several
// interior lines (the listing numbers jump at 5320->5322, 5329->5331,
// 5353->5355, 5418->5420 and 5460->5463), so the condition enclosing the first
// store-specific block and the declaration of `Cost` are not visible here.
// Confirm against the upstream file before relying on those fragments.
//
// Visible behavior:
//  - Load with TCK_Latency is delegated to BaseT.
//  - Types whose value type is MVT::Other are delegated to BaseT.
//  - Stores of constants additionally charge a load of the same type.
//  - Scalars, or types that do not legalize to a vector, cost LT.first (plus
//    the accumulated Cost only when the legal type is floating point).
//  - Vectors are costed by walking memory-op sizes from the widest legal size
//    downwards, halving each time, charging each op plus any subvector and
//    element insert/extract overhead.
5319 Align Alignment,
5320 unsigned AddressSpace,
5322 TTI::OperandValueInfo OpInfo,
5323 const Instruction *I) const {
5324 // FIXME: Load latency isn't handled here
5325 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
5326 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5327 CostKind, OpInfo, I);
5328
5329 // TODO: Handle other cost kinds.
5331 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5332 // Store instruction with index and scale costs 2 Uops.
5333 // Check the preceding GEP to identify non-const indices.
5334 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5335 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5336 return TTI::TCC_Basic * 2;
5337 }
5338 }
5339 return TTI::TCC_Basic;
5340 }
5341
5342 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5343 "Invalid Opcode");
5344 // Type legalization can't handle structs
5345 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5346 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5347 CostKind, OpInfo, I);
5348
5349 // Legalize the type.
5350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5351
5352 auto *VTy = dyn_cast<FixedVectorType>(Src);
5353
5355
5356 // Add a cost for constant load to vector.
5357 if (Opcode == Instruction::Store && OpInfo.isConstant())
5358 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5359 /*AddressSpace=*/0, CostKind, OpInfo);
5360
5361 // Handle the simple case of non-vectors.
5362 // NOTE: this assumes that legalization never creates vector from scalars!
5363 if (!VTy || !LT.second.isVector()) {
5364 // Each load/store unit costs 1.
5365 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5366 }
5367
5368 bool IsLoad = Opcode == Instruction::Load;
5369
5370 Type *EltTy = VTy->getElementType();
5371
5372 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5373
5374 // Source of truth: how many elements were there in the original IR vector?
5375 const unsigned SrcNumElt = VTy->getNumElements();
5376
5377 // How far have we gotten?
5378 int NumEltRemaining = SrcNumElt;
5379 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5380 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5381
5382 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5383
5384 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5385 const unsigned XMMBits = 128;
5386 if (XMMBits % EltTyBits != 0)
5387 // Vector size must be a multiple of the element size. I.e. no padding.
5388 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5389 CostKind, OpInfo, I);
5390 const int NumEltPerXMM = XMMBits / EltTyBits;
5391
5392 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5393
// Outer loop: try progressively smaller op sizes (halved each iteration).
// Inner loop: consume as many elements as possible at the current op size.
5394 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5395 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5396 // How many elements would a single op deal with at once?
5397 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5398 // Vector size must be a multiple of the element size. I.e. no padding.
5399 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5400 CostKind, OpInfo, I);
5401 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5402
5403 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5404 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5405 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5406 "Unless we haven't halved the op size yet, "
5407 "we have less than two op's sized units of work left.");
5408
5409 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5410 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5411 : XMMVecTy;
5412
5413 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5414 "After halving sizes, the vector elt count is no longer a multiple "
5415 "of number of elements per operation?");
5416 auto *CoalescedVecTy =
5417 CurrNumEltPerOp == 1
5418 ? CurrVecTy
5420 IntegerType::get(Src->getContext(),
5421 EltTyBits * CurrNumEltPerOp),
5422 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5423 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5424 DL.getTypeSizeInBits(CurrVecTy) &&
5425 "coalesciing elements doesn't change vector width.");
5426
5427 while (NumEltRemaining > 0) {
5428 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5429
5430 // Can we use this vector size, as per the remaining element count?
5431 // Iff the vector is naturally aligned, we can do a wide load regardless.
5432 if (NumEltRemaining < CurrNumEltPerOp &&
5433 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5434 break; // Try smaller vector size.
5435
5436 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5437 // as a proxy for a double-pumped AVX memory interface such as on
5438 // Sandybridge.
5439 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5440 // will be scalarized.
5441 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5442 Cost += 2;
5443 else if (CurrOpSizeBytes < 4)
5444 Cost += 2;
5445 else
5446 Cost += 1;
5447
5448 // If we're loading a uniform value, then we don't need to split the load,
5449 // loading just a single (widest) vector can be reused by all splits.
5450 if (IsLoad && OpInfo.isUniform())
5451 return Cost;
5452
5453 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5454
5455 // If we have fully processed the previous reg, we need to replenish it.
5456 if (SubVecEltsLeft == 0) {
5457 SubVecEltsLeft += CurrVecTy->getNumElements();
5458 // And that's free only for the 0'th subvector of a legalized vector.
5459 if (!Is0thSubVec)
5460 Cost +=
5463 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5464 }
5465
5466 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5467 // for smaller widths (32/16/8) we have to insert/extract them separately.
5468 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5469 // but let's pretend that it is also true for 16/8 bit wide ops...)
5470 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5471 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5472 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5473 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5474 APInt DemandedElts =
5475 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5476 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5477 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5478 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5479 !IsLoad, CostKind);
5480 }
5481
// Account for the elements this op handled and update the alignment that is
// used by the "can we still do a wide load" check above.
5482 SubVecEltsLeft -= CurrNumEltPerOp;
5483 NumEltRemaining -= CurrNumEltPerOp;
5484 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5485 }
5486 }
5487
5488 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5489
5490 return Cost;
5491}
5492
// Dispatches a memory-intrinsic cost query on MICA.getID(): masked
// scatter/gather go to getGatherScatterOpCost and masked load/store go to
// getMaskedMemoryOpCost.
// NOTE(review): the signature lines and the statement following the switch
// (listing line 5504) are missing from this listing, so the function name and
// the handling of any other intrinsic ID are not visible here - confirm
// against the upstream file.
5496 switch (MICA.getID()) {
5497 case Intrinsic::masked_scatter:
5498 case Intrinsic::masked_gather:
5499 return getGatherScatterOpCost(MICA, CostKind);
5500 case Intrinsic::masked_load:
5501 case Intrinsic::masked_store:
5502 return getMaskedMemoryOpCost(MICA, CostKind);
5503 }
5505}
5506
// Cost of the masked load/store described by MICA (presumably
// getMaskedMemoryOpCost, which the intrinsic dispatch in this file calls).
// NOTE(review): the signature lines and listing lines 5531, 5535, 5538 and
// 5549 are missing from this listing, so the declarations of MaskSplitCost,
// ValueSplitCost and Cost and part of the compare-cost call are not visible.
//
// Visible behavior:
//  - A data type that is not a fixed vector is costed as a regular, unmasked
//    memory op.
//  - If the masked op is not legal for this type/alignment/address space it is
//    costed as scalarized: one scalar memory op per element, plus the cost of
//    splitting the value and the mask, plus a compare and a branch per element.
//  - Otherwise each legalized part costs 2 (load) or 8 (store) without
//    AVX-512 and 1 with AVX-512, on top of any shuffle cost added for promoted
//    or widened types.
// The mask is modeled as a vector of i8 with one element per data element.
5510 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5511 : Instruction::Store;
5512 Type *SrcTy = MICA.getDataType();
5513 Align Alignment = MICA.getAlignment();
5514 unsigned AddressSpace = MICA.getAddressSpace();
5515
5516 bool IsLoad = (Instruction::Load == Opcode);
5517 bool IsStore = (Instruction::Store == Opcode);
5518
5519 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5520 if (!SrcVTy)
5521 // To calculate scalar take the regular cost, without mask
5522 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5523
5524 unsigned NumElem = SrcVTy->getNumElements();
5525 auto *MaskTy =
5526 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5527 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5528 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5529 // Scalarization
5530 APInt DemandedElts = APInt::getAllOnes(NumElem);
5532 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5533 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5534 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5536 InstructionCost BranchCost = getCFInstrCost(Instruction::CondBr, CostKind);
5537 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5539 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5540 InstructionCost MemopCost =
5541 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5542 Alignment, AddressSpace, CostKind);
5543 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5544 }
5545
5546 // Legalize the type.
5547 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5548 auto VT = TLI->getValueType(DL, SrcVTy);
5550 MVT Ty = LT.second;
5551 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5552 // APX masked load/store for scalar is cheap.
5553 return Cost + LT.first;
5554
// Same element count but a different legal type: the type was promoted.
5555 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5556 LT.second.getVectorNumElements() == NumElem)
5557 // Promotion requires extend/truncate for data and a shuffle for mask.
5558 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5559 0, nullptr) +
5560 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5561 0, nullptr);
5562
// The legalized parts hold more elements than the source: the type was widened.
5563 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5564 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5565 (unsigned)LT.first.getValue() *
5566 Ty.getVectorNumElements());
5567 // Expanding requires fill mask with zeroes
5568 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5569 CostKind, 0, MaskTy);
5570 }
5571
5572 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5573 if (!ST->hasAVX512())
5574 return Cost + LT.first * (IsLoad ? 2 : 8);
5575
5576 // AVX-512 masked load/store is cheaper
5577 return Cost + LT.first;
5578}
5579
// Cost of the address computations for a chain of pointers (the BaseT fallback
// below names this hook getPointersChainCost).
// NOTE(review): listing lines 5580 and 5583 (the start and end of the
// signature) are missing from this listing.
//
// When all pointers share a base and have a known stride, only the base GEP is
// charged, or TCC_Free if Base is not a GetElementPtrInst. Every other case is
// delegated to BaseT.
5581 ArrayRef<const Value *> Ptrs, const Value *Base,
5582 const TTI::PointersChainInfo &Info, Type *AccessTy,
5584 if (Info.isSameBase() && Info.isKnownStride()) {
5585 // If all the pointers have known stride all the differences are translated
5586 // into constants. X86 memory addressing allows encoding it into
5587 // displacement. So we just need to take the base GEP cost.
5588 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5589 SmallVector<const Value *> Indices(BaseGEP->indices());
5590 return getGEPCost(BaseGEP->getSourceElementType(),
5591 BaseGEP->getPointerOperand(), Indices, nullptr,
5592 CostKind);
5593 }
5594 return TTI::TCC_Free;
5595 }
5596 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5597}
5598
// Cost of an address computation (the BaseT fallback below names this hook
// getAddressComputationCost).
// NOTE(review): listing lines 5599-5600 and 5602 (most of the signature) are
// missing from this listing.
//
// Only vector pointer types on subtargets without AVX2, with ScalarEvolution
// available, are special-cased: a non-strided access costs
// NumVectorInstToHideOverhead (10), and a strided access without a constant
// step costs 1. Every other case is delegated to BaseT.
5601 const SCEV *Ptr,
5603 // Address computations in vectorized code with non-consecutive addresses will
5604 // likely result in more instructions compared to scalar code where the
5605 // computation can more often be merged into the index mode. The resulting
5606 // extra micro-ops can significantly decrease throughput.
5607 const unsigned NumVectorInstToHideOverhead = 10;
5608
5609 // Cost modeling of Strided Access Computation is hidden by the indexing
5610 // modes of X86 regardless of the stride value. We don't believe that there
5611 // is a difference between constant strided access in general and constant
5612 // strided value which is less than or equal to 64.
5613 // Even in the case of (loop invariant) stride whose value is not known at
5614 // compile time, the address computation will not incur more than one extra
5615 // ADD instruction.
5616 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5617 // TODO: AVX2 is the current cut-off because we don't have correct
5618 // interleaving costs for prior ISA's.
5619 if (!BaseT::isStridedAccess(Ptr))
5620 return NumVectorInstToHideOverhead;
5621 if (!BaseT::getConstantStrideStep(SE, Ptr))
5622 return 1;
5623 }
5624
5625 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5626}
5627
// Cost of a vector arithmetic reduction (the recursive call and the BaseT
// fallbacks below name this hook getArithmeticReductionCost).
// NOTE(review): this listing is missing the opening signature lines and the
// condition guarding the first return (listing lines 5628-5629 and 5631-5632),
// plus interior lines 5723, 5804, 5806, 5896-5897 and 5907. Confirm against
// the upstream file before relying on the fragments around those gaps.
//
// Visible lookup order:
//  1. The un-legalized value type is looked up in the per-feature tables
//     (SLM, AVX512BW, AVX512F, AVX1, SSE2).
//  2. vXi8 MUL reductions are costed as a zext to vXi16 plus an i16 reduction.
//  3. The legalized type is looked up in the SLM/AVX1/SSE2 tables, adding
//     LT.first - 1 arithmetic ops when the type had to be split.
//  4. vXi1 reductions use the bool-reduction tables, falling back to BaseT
//     (ADD is costed as a bitcast plus ctpop; partially elided in this
//     listing).
//  5. Otherwise, power-of-2 element counts whose scalar size is unchanged by
//     legalization are costed level by level (extract/permute/shuffle/shift
//     plus one arithmetic op per halving) plus a final extractelement; all
//     other shapes are delegated to BaseT.
5630 std::optional<FastMathFlags> FMF,
5633 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5634
5635 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5636 // and make it as the cost.
5637
5638 static const CostTblEntry SLMCostTbl[] = {
5639 { ISD::FADD, MVT::v2f64, 3 },
5640 { ISD::ADD, MVT::v2i64, 5 },
5641 };
5642
5643 static const CostTblEntry SSE2CostTbl[] = {
5644 { ISD::FADD, MVT::v2f64, 2 },
5645 { ISD::FADD, MVT::v2f32, 2 },
5646 { ISD::FADD, MVT::v4f32, 4 },
5647 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5648 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5649 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5650 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5651 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5652 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5653 { ISD::ADD, MVT::v2i8, 2 },
5654 { ISD::ADD, MVT::v4i8, 2 },
5655 { ISD::ADD, MVT::v8i8, 2 },
5656 { ISD::ADD, MVT::v16i8, 3 },
5657 };
5658
5659 static const CostTblEntry AVX1CostTbl[] = {
5660 { ISD::FADD, MVT::v4f64, 3 },
5661 { ISD::FADD, MVT::v4f32, 3 },
5662 { ISD::FADD, MVT::v8f32, 4 },
5663 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5664 { ISD::ADD, MVT::v4i64, 3 },
5665 { ISD::ADD, MVT::v8i32, 5 },
5666 { ISD::ADD, MVT::v16i16, 5 },
5667 { ISD::ADD, MVT::v32i8, 4 },
5668 };
5669
5670 static const CostTblEntry AVX512FCostTbl[] = {
5671 { ISD::FADD, MVT::v8f64, 4 },
5672 { ISD::FADD, MVT::v16f32, 5 },
5673 { ISD::ADD, MVT::v8i64, 4 },
5674 { ISD::ADD, MVT::v16i32, 6 },
5675 };
5676
5677 static const CostTblEntry AVX512BWCostTbl[] = {
5678 { ISD::ADD, MVT::v32i16, 7 },
5679 { ISD::ADD, MVT::v64i8, 4 },
5680 };
5681
5682 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5683 assert(ISD && "Invalid opcode");
5684
5685 // Before legalizing the type, give a chance to look up illegal narrow types
5686 // in the table.
5687 // FIXME: Is there a better way to do this?
5688 EVT VT = TLI->getValueType(DL, ValTy);
5689 if (VT.isSimple()) {
5690 MVT MTy = VT.getSimpleVT();
5691 if (ST->useSLMArithCosts())
5692 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5693 return Entry->Cost;
5694
5695 if (ST->hasBWI())
5696 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5697 return Entry->Cost;
5698
5699 if (ST->hasAVX512())
5700 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
5701 return Entry->Cost;
5702
5703 if (ST->hasAVX())
5704 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5705 return Entry->Cost;
5706
5707 if (ST->hasSSE2())
5708 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5709 return Entry->Cost;
5710 }
5711
5712 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5713
5714 MVT MTy = LT.second;
5715
5716 auto *ValVTy = cast<FixedVectorType>(ValTy);
5717
5718 // Special case: vXi8 mul reductions are performed as vXi16.
5719 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5720 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5721 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5722 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5724 CostKind) +
5725 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5726 }
5727
5728 InstructionCost ArithmeticCost = 0;
5729 if (LT.first != 1 && MTy.isVector() &&
5730 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5731 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5732 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5733 MTy.getVectorNumElements());
5734 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5735 ArithmeticCost *= LT.first - 1;
5736 }
5737
5738 if (ST->useSLMArithCosts())
5739 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5740 return ArithmeticCost + Entry->Cost;
5741
5742 if (ST->hasAVX())
5743 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5744 return ArithmeticCost + Entry->Cost;
5745
5746 if (ST->hasSSE2())
5747 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5748 return ArithmeticCost + Entry->Cost;
5749
5750 // FIXME: These assume a naive kshift+binop lowering, which is probably
5751 // conservative in most cases.
5752 static const CostTblEntry AVX512BoolReduction[] = {
5753 { ISD::AND, MVT::v2i1, 3 },
5754 { ISD::AND, MVT::v4i1, 5 },
5755 { ISD::AND, MVT::v8i1, 7 },
5756 { ISD::AND, MVT::v16i1, 9 },
5757 { ISD::AND, MVT::v32i1, 11 },
5758 { ISD::AND, MVT::v64i1, 13 },
5759 { ISD::OR, MVT::v2i1, 3 },
5760 { ISD::OR, MVT::v4i1, 5 },
5761 { ISD::OR, MVT::v8i1, 7 },
5762 { ISD::OR, MVT::v16i1, 9 },
5763 { ISD::OR, MVT::v32i1, 11 },
5764 { ISD::OR, MVT::v64i1, 13 },
5765 };
5766
5767 static const CostTblEntry AVX2BoolReduction[] = {
5768 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5769 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5770 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5771 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5772 };
5773
5774 static const CostTblEntry AVX1BoolReduction[] = {
5775 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5776 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5777 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5778 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5779 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5780 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5781 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5782 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5783 };
5784
5785 static const CostTblEntry SSE2BoolReduction[] = {
5786 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5787 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5788 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5789 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5790 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5791 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5792 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5793 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5794 };
5795
5796 // Handle bool allof/anyof patterns.
5797 if (ValVTy->getElementType()->isIntegerTy(1)) {
5798 if (ISD == ISD::ADD) {
5799 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5800 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5801 ValVTy->getNumElements());
5802 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5803 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5805 CostKind) +
5807 }
5808
// Note: this ArithmeticCost shadows the outer variable of the same name and is
// recomputed with the same formula.
5809 InstructionCost ArithmeticCost = 0;
5810 if (LT.first != 1 && MTy.isVector() &&
5811 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5812 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5813 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5814 MTy.getVectorNumElements());
5815 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5816 ArithmeticCost *= LT.first - 1;
5817 }
5818
5819 if (ST->hasAVX512())
5820 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5821 return ArithmeticCost + Entry->Cost;
5822 if (ST->hasAVX2())
5823 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5824 return ArithmeticCost + Entry->Cost;
5825 if (ST->hasAVX())
5826 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5827 return ArithmeticCost + Entry->Cost;
5828 if (ST->hasSSE2())
5829 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5830 return ArithmeticCost + Entry->Cost;
5831
5832 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5833 }
5834
5835 unsigned NumVecElts = ValVTy->getNumElements();
5836 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5837
5838 // Special case power of 2 reductions where the scalar type isn't changed
5839 // by type legalization.
5840 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5841 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5842
5843 InstructionCost ReductionCost = 0;
5844
5845 auto *Ty = ValVTy;
5846 if (LT.first != 1 && MTy.isVector() &&
5847 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5848 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5849 Ty = FixedVectorType::get(ValVTy->getElementType(),
5850 MTy.getVectorNumElements());
5851 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5852 ReductionCost *= LT.first - 1;
5853 NumVecElts = MTy.getVectorNumElements();
5854 }
5855
5856 // Now handle reduction with the legal type, taking into account size changes
5857 // at each level.
5858 while (NumVecElts > 1) {
5859 // Determine the size of the remaining vector we need to reduce.
5860 unsigned Size = NumVecElts * ScalarSize;
5861 NumVecElts /= 2;
5862 // If we're reducing from 256/512 bits, use an extract_subvector.
5863 if (Size > 128) {
5864 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5865 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5866 CostKind, NumVecElts, SubTy);
5867 Ty = SubTy;
5868 } else if (Size == 128) {
5869 // Reducing from 128 bits is a permute of v2f64/v2i64.
5870 FixedVectorType *ShufTy;
// NOTE(review): isFloatingPointTy() is queried on the vector type itself (here
// and in the 64-bit case below); confirm whether isFPOrFPVectorTy() was
// intended so that FP vectors select the floating-point shuffle type.
5871 if (ValVTy->isFloatingPointTy())
5872 ShufTy =
5873 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5874 else
5875 ShufTy =
5876 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5877 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5878 {}, CostKind, 0, nullptr);
5879 } else if (Size == 64) {
5880 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5881 FixedVectorType *ShufTy;
5882 if (ValVTy->isFloatingPointTy())
5883 ShufTy =
5884 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5885 else
5886 ShufTy =
5887 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5888 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5889 {}, CostKind, 0, nullptr);
5890 } else {
5891 // Reducing from smaller size is a shift by immediate.
5892 auto *ShiftTy = FixedVectorType::get(
5893 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5894 ReductionCost += getArithmeticInstrCost(
5895 Instruction::LShr, ShiftTy, CostKind,
5898 }
5899
5900 // Add the arithmetic op for this level.
5901 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5902 }
5903
5904 // Add the final extract element to the cost.
5905 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5906 CostKind, 0, nullptr, nullptr,
5908}
5909
// Cost of a two-operand min/max intrinsic IID on type Ty: builds
// IntrinsicCostAttributes with two operands of type Ty and the given
// fast-math flags, then defers to getIntrinsicInstrCost.
// NOTE(review): the opening signature lines (listing lines 5910-5911) are
// missing from this listing; the min/max reduction code in this file calls
// this as getMinMaxCost(IID, Ty, CostKind, FMF).
5912 FastMathFlags FMF) const {
5913 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5914 return getIntrinsicInstrCost(ICA, CostKind);
5915}
5916
// Cost of a vector min/max reduction for intrinsic IID (the BaseT fallback
// below names this hook getMinMaxReductionCost).
// NOTE(review): this listing is missing the opening signature lines (listing
// lines 5917-5918 and 5920), the declaration of `ISD` (5925), and interior
// lines 6143, 6163-6164 and 6174. Confirm against the upstream file before
// relying on the fragments around those gaps.
//
// The tables are keyed by the *MIN* opcode for both min and max: umin/umax map
// to ISD::UMIN and every other integer ID to ISD::SMIN; minnum/maxnum map to
// ISD::FMINNUM and every other FP ID to ISD::FMINIMUM.
//
// Visible lookup order:
//  1. The un-legalized value type is looked up in the per-feature tables
//     (AVX512BW, AVX512F, AVX2, AVX1, SSE4.1, SSE2), indexed by CostKind.
//  2. The legalized type is looked up in the same tables, adding LT.first - 1
//     min/max ops when the type had to be split.
//  3. Otherwise, power-of-2 element counts whose scalar size is unchanged by
//     legalization are costed level by level plus a final extractelement; all
//     other shapes are delegated to BaseT.
5919 FastMathFlags FMF,
5921 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5922
5923 MVT MTy = LT.second;
5924
5926 if (ValTy->isIntOrIntVectorTy()) {
5927 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5928 : ISD::SMIN;
5929 } else {
5930 assert(ValTy->isFPOrFPVectorTy() &&
5931 "Expected float point or integer vector type.");
5932 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5933 ? ISD::FMINNUM
5934 : ISD::FMINIMUM;
5935 }
5936
5937 // We use llvm-mca across all supported CPUs to measure the cost stats.
5938 static const CostKindTblEntry SSE2CostTbl[] = {
5939 {ISD::SMIN, MVT::v2i64, {3, 4, 5, 6}},
5940 {ISD::UMIN, MVT::v2i64, {3, 4, 5, 6}},
5941 {ISD::SMIN, MVT::v2i32, {2, 2, 5, 6}},
5942 {ISD::UMIN, MVT::v2i32, {2, 2, 5, 6}},
5943 {ISD::SMIN, MVT::v4i32, {3, 7,11,12}},
5944 {ISD::UMIN, MVT::v4i32, {4, 7,14,15}},
5945 {ISD::SMIN, MVT::v2i16, {2, 3, 4, 4}},
5946 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 6}},
5947 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5948 {ISD::UMIN, MVT::v4i16, {3, 5, 8, 10}},
5949 {ISD::SMIN, MVT::v8i16, {3, 8, 8, 8}},
5950 {ISD::UMIN, MVT::v8i16, {4, 8,12,14}},
5951 {ISD::SMIN, MVT::v2i8, {2, 3, 5, 6}},
5952 {ISD::UMIN, MVT::v2i8, {2, 3, 4, 4}},
5953 {ISD::SMIN, MVT::v4i8, {4, 6,12,13}},
5954 {ISD::UMIN, MVT::v4i8, {3, 6, 7, 7}},
5955 {ISD::SMIN, MVT::v8i8, {5, 9,18,19}},
5956 {ISD::UMIN, MVT::v8i8, {4, 8, 9, 9}},
5957 {ISD::SMIN, MVT::v16i8, {7,13,24,25}},
5958 {ISD::UMIN, MVT::v16i8, {3,10,11,11}},
5959 };
5960
5961 static const CostKindTblEntry SSE41CostTbl[] = {
5962 {ISD::SMIN, MVT::v2i64, {3, 4, 4, 6}},
5963 {ISD::UMIN, MVT::v2i64, {3, 4, 4, 6}},
5964 {ISD::SMIN, MVT::v2i32, {2, 2, 3, 3}},
5965 {ISD::UMIN, MVT::v2i32, {2, 2, 3, 3}},
5966 {ISD::SMIN, MVT::v4i32, {3, 4, 5, 5}},
5967 {ISD::UMIN, MVT::v4i32, {3, 4, 5, 5}},
5968 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 4}},
5969 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5970 {ISD::UMIN, MVT::v4i16, {3, 5, 6, 6}},
5971 {ISD::SMIN, MVT::v8i16, {2, 8, 4, 5}},
5972 {ISD::UMIN, MVT::v8i16, {2, 5, 2, 2}},
5973 {ISD::SMIN, MVT::v2i8, {2, 3, 4, 4}},
5974 {ISD::SMIN, MVT::v4i8, {3, 6, 7, 7}},
5975 {ISD::SMIN, MVT::v8i8, {4, 8, 9, 9}},
5976 {ISD::SMIN, MVT::v16i8, {3,10, 7, 8}},
5977 {ISD::UMIN, MVT::v16i8, {3, 8, 5, 5}},
5978 };
5979
5980 static const CostKindTblEntry AVX1CostTbl[] = {
5981 {ISD::SMIN, MVT::v4i64, {5,11, 7,10}},
5982 {ISD::UMIN, MVT::v4i64, {6,12,10,13}},
5983 {ISD::SMIN, MVT::v8i32, {4, 9, 7, 7}},
5984 {ISD::UMIN, MVT::v8i32, {4, 9, 7, 7}},
5985 {ISD::SMIN, MVT::v16i16, {3,15, 6, 7}},
5986 {ISD::UMIN, MVT::v16i16, {2, 9, 4, 4}},
5987 {ISD::SMIN, MVT::v32i8, {4,17, 8, 9}},
5988 {ISD::UMIN, MVT::v32i8, {3,11, 6, 6}},
5989 };
5990
5991 static const CostKindTblEntry AVX2CostTbl[] = {
5992 {ISD::SMIN, MVT::v4i64, {4,11, 7,10}},
5993 {ISD::UMIN, MVT::v4i64, {4,12,10,13}},
5994 {ISD::SMIN, MVT::v2i32, {1, 2, 3, 3}},
5995 {ISD::UMIN, MVT::v2i32, {1, 2, 3, 3}},
5996 {ISD::UMIN, MVT::v4i32, {2, 4, 5, 5}},
5997 {ISD::SMIN, MVT::v4i32, {2, 4, 5, 5}},
5998 {ISD::SMIN, MVT::v8i32, {3, 9, 7, 7}},
5999 {ISD::UMIN, MVT::v8i32, {3, 9, 7, 7}},
6000 {ISD::SMIN, MVT::v4i16, {2, 4, 5, 5}},
6001 {ISD::UMIN, MVT::v4i16, {2, 4, 5, 5}},
6002 {ISD::SMIN, MVT::v16i16, {2,15, 6, 7}},
6003 {ISD::SMIN, MVT::v8i8, {3, 6, 7, 7}},
6004 {ISD::UMIN, MVT::v8i8, {3, 6, 7, 7}},
6005 {ISD::SMIN, MVT::v32i8, {3,17, 8, 9}},
6006 };
6007
6008 static const CostKindTblEntry AVX512FCostTbl[] = {
6009 {ISD::SMIN, MVT::v2i64, {2, 4, 3, 3}},
6010 {ISD::UMIN, MVT::v2i64, {2, 4, 3, 3}},
6011 {ISD::SMIN, MVT::v4i64, {3,10, 5, 5}},
6012 {ISD::UMIN, MVT::v4i64, {3,10, 5, 5}},
6013 {ISD::SMIN, MVT::v8i64, {5,16, 7, 7}},
6014 {ISD::UMIN, MVT::v8i64, {5,16, 7, 7}},
6015 {ISD::SMIN, MVT::v16i32, {4,12, 9, 9}},
6016 {ISD::UMIN, MVT::v16i32, {4,12, 9, 9}},
6017 };
6018
6019 static const CostKindTblEntry AVX512BWCostTbl[] = {
6020 {ISD::SMIN, MVT::v2i16, {1, 2, 3, 3}},
6021 {ISD::UMIN, MVT::v2i16, {1, 2, 3, 3}},
6022 {ISD::SMIN, MVT::v32i16, {2,19, 8, 9}},
6023 {ISD::UMIN, MVT::v32i16, {2,12, 6, 6}},
6024 {ISD::SMIN, MVT::v2i8, {1, 2, 3, 3}},
6025 {ISD::UMIN, MVT::v2i8, {1, 2, 3, 3}},
6026 {ISD::SMIN, MVT::v4i8, {2, 4, 5, 5}},
6027 {ISD::UMIN, MVT::v4i8, {2, 4, 5, 5}},
6028 {ISD::SMIN, MVT::v16i8, {2,10, 6, 7}},
6029 {ISD::UMIN, MVT::v16i8, {2, 6, 4, 4}},
6030 {ISD::SMIN, MVT::v32i8, {2,17, 8, 9}},
6031 {ISD::UMIN, MVT::v32i8, {2,10, 6, 6}},
6032 {ISD::SMIN, MVT::v64i8, {2,21,10,11}},
6033 {ISD::UMIN, MVT::v64i8, {2,14, 8, 8}},
6034 };
6035
6036 // Before legalizing the type, give a chance to look up illegal narrow types
6037 // in the table.
6038 // FIXME: Is there a better way to do this?
6039 EVT VT = TLI->getValueType(DL, ValTy);
6040 if (VT.isSimple()) {
// Note: this MTy (the un-legalized simple type) shadows the legalized MTy
// declared above for the duration of this block.
6041 MVT MTy = VT.getSimpleVT();
6042 if (ST->hasBWI())
6043 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6044 if (auto KindCost = Entry->Cost[CostKind])
6045 return *KindCost;
6046
6047 if (ST->hasAVX512())
6048 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6049 if (auto KindCost = Entry->Cost[CostKind])
6050 return *KindCost;
6051
6052 if (ST->hasAVX2())
6053 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6054 if (auto KindCost = Entry->Cost[CostKind])
6055 return *KindCost;
6056
6057 if (ST->hasAVX())
6058 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6059 if (auto KindCost = Entry->Cost[CostKind])
6060 return *KindCost;
6061
6062 if (ST->hasSSE41())
6063 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6064 if (auto KindCost = Entry->Cost[CostKind])
6065 return *KindCost;
6066
6067 if (ST->hasSSE2())
6068 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6069 if (auto KindCost = Entry->Cost[CostKind])
6070 return *KindCost;
6071 }
6072
6073 auto *ValVTy = cast<FixedVectorType>(ValTy);
6074 unsigned NumVecElts = ValVTy->getNumElements();
6075
6076 auto *Ty = ValVTy;
6077 InstructionCost MinMaxCost = 0;
6078 if (LT.first != 1 && MTy.isVector() &&
6079 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
6080 // Type needs to be split. We need LT.first - 1 min/max ops.
6081 Ty = FixedVectorType::get(ValVTy->getElementType(),
6082 MTy.getVectorNumElements());
6083 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
6084 MinMaxCost *= LT.first - 1;
6085 NumVecElts = MTy.getVectorNumElements();
6086 }
6087
6088 if (ST->hasBWI())
6089 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6090 if (auto KindCost = Entry->Cost[CostKind])
6091 return MinMaxCost + *KindCost;
6092
6093 if (ST->hasAVX512())
6094 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6095 if (auto KindCost = Entry->Cost[CostKind])
6096 return MinMaxCost + *KindCost;
6097
6098 if (ST->hasAVX2())
6099 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6100 if (auto KindCost = Entry->Cost[CostKind])
6101 return MinMaxCost + *KindCost;
6102
6103 if (ST->hasAVX())
6104 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6105 if (auto KindCost = Entry->Cost[CostKind])
6106 return MinMaxCost + *KindCost;
6107
6108 if (ST->hasSSE41())
6109 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6110 if (auto KindCost = Entry->Cost[CostKind])
6111 return MinMaxCost + *KindCost;
6112
6113 if (ST->hasSSE2())
6114 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6115 if (auto KindCost = Entry->Cost[CostKind])
6116 return MinMaxCost + *KindCost;
6117
6118 unsigned ScalarSize = ValTy->getScalarSizeInBits();
6119
6120 // Special case power of 2 reductions where the scalar type isn't changed
6121 // by type legalization.
6122 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
6123 ScalarSize != MTy.getScalarSizeInBits())
6124 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
6125
6126 // Now handle reduction with the legal type, taking into account size changes
6127 // at each level.
6128 while (NumVecElts > 1) {
6129 // Determine the size of the remaining vector we need to reduce.
6130 unsigned Size = NumVecElts * ScalarSize;
6131 NumVecElts /= 2;
6132 // If we're reducing from 256/512 bits, use an extract_subvector.
6133 if (Size > 128) {
6134 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
6135 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
6136 CostKind, NumVecElts, SubTy);
6137 Ty = SubTy;
6138 } else if (Size == 128) {
6139 // Reducing from 128 bits is a permute of v2f64/v2i64.
6140 VectorType *ShufTy;
// NOTE(review): isFloatingPointTy() is queried on ValTy, which is cast to a
// FixedVectorType above (here and in the 64-bit case below); confirm whether
// isFPOrFPVectorTy() was intended so that FP vectors select the
// floating-point shuffle type.
6141 if (ValTy->isFloatingPointTy())
6142 ShufTy =
6144 else
6145 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
6146 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6147 CostKind, 0, nullptr);
6148 } else if (Size == 64) {
6149 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
6150 FixedVectorType *ShufTy;
6151 if (ValTy->isFloatingPointTy())
6152 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
6153 else
6154 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
6155 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6156 CostKind, 0, nullptr);
6157 } else {
6158 // Reducing from smaller size is a shift by immediate.
6159 auto *ShiftTy = FixedVectorType::get(
6160 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
// NOTE(review): this shift is costed with a hard-coded TTI::TCK_RecipThroughput
// while every other query in this loop passes the caller's CostKind; confirm
// whether CostKind was intended.
6161 MinMaxCost += getArithmeticInstrCost(
6162 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
6165 }
6166
6167 // Add the arithmetic op for this level.
6168 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
6169 }
6170
6171 // Add the final extract element to the cost.
6172 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
6173 CostKind, 0, nullptr, nullptr,
6175}
6176
6177/// Calculate the cost of materializing a 64-bit value. This helper
6178/// method might only calculate a fraction of a larger immediate. Therefore it
6179/// is valid to return a cost of ZERO.
// NOTE(review): the function signature line is missing from this listing. The
// body reads a value named `Val`; presumably that is the 64-bit chunk being
// costed -- confirm against the upstream source.
// A zero chunk needs no materialization.
6181 if (Val == 0)
6182 return TTI::TCC_Free;
6183
// A chunk that fits in a signed 32-bit immediate costs one basic operation.
6184 if (isInt<32>(Val))
6185 return TTI::TCC_Basic;
6186
// Everything else is charged as two basic operations.
6187 return 2 * TTI::TCC_Basic;
6188}
6189
// Computes the cost of materializing the integer immediate `Imm` of type `Ty`
// by summing the cost of each of its 64-bit chunks.
// NOTE(review): the signature lines and the declaration of `Cost` (which would
// sit just before the loop) are missing from this listing; nothing is assumed
// here about their exact text.
6192 assert(Ty->isIntegerTy());
6193
6194 unsigned BitSize = Ty->getPrimitiveSizeInBits();
// A type without a primitive bit size is reported with the sentinel ~0U.
6195 if (BitSize == 0)
6196 return ~0U;
6197
6198 // Never hoist constants larger than 128bit, because this might lead to
6199 // incorrect code generation or assertions in codegen.
6200 // Fixme: Create a cost model for types larger than i128 once the codegen
6201 // issues have been fixed.
6202 if (BitSize > 128)
6203 return TTI::TCC_Free;
6204
// Zero is free to materialize.
6205 if (Imm == 0)
6206 return TTI::TCC_Free;
6207
6208 // Sign-extend all constants to a multiple of 64-bit.
6209 APInt ImmVal = Imm;
6210 if (BitSize % 64 != 0)
6211 ImmVal = Imm.sext(alignTo(BitSize, 64));
6212
6213 // Split the constant into 64-bit chunks and calculate the cost for each
6214 // chunk.
6216 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
// Shift the chunk of interest down to the low 64 bits, then narrow to it.
6217 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6218 int64_t Val = Tmp.getSExtValue();
6219 Cost += getIntImmCost(Val);
6220 }
6221 // We need at least one instruction to materialize the constant.
6222 return std::max<InstructionCost>(1, Cost);
6223}
6224
// Cost of the integer immediate `Imm` when it appears as operand `Idx` of an
// instruction with opcode `Opcode`. Returning TCC_Free tells the caller the
// immediate is not worth hoisting for that operand position.
// NOTE(review): the first signature line(s) and the statement that defines
// `Cost` inside the final `if` are missing from this listing.
6226 const APInt &Imm, Type *Ty,
6228 Instruction *Inst) const {
6229 assert(Ty->isIntegerTy());
6230
6231 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6232 unsigned ImmBitWidth = Imm.getBitWidth();
6233
6234 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6235 // here, so that constant hoisting will ignore this constant.
6236 if (BitSize == 0)
6237 return TTI::TCC_Free;
6238
// ImmIdx is the operand position at which this opcode can take an immediate;
// ~0U means "no such position" so the `Idx == ImmIdx` test below never fires.
6239 unsigned ImmIdx = ~0U;
6240 switch (Opcode) {
6241 default:
6242 return TTI::TCC_Free;
6243 case Instruction::GetElementPtr:
6244 // Always hoist the base address of a GetElementPtr. This prevents the
6245 // creation of new constants for every base constant that gets constant
6246 // folded with the offset.
6247 if (Idx == 0)
6248 return 2 * TTI::TCC_Basic;
6249 return TTI::TCC_Free;
6250 case Instruction::Store:
6251 ImmIdx = 0;
6252 break;
6253 case Instruction::ICmp:
6254 // This is an imperfect hack to prevent constant hoisting of
6255 // compares that might be trying to check if a 64-bit value fits in
6256 // 32-bits. The backend can optimize these cases using a right shift by 32.
6257 // There are other predicates and immediates the backend can use shifts for.
6258 if (Idx == 1 && ImmBitWidth == 64) {
6259 uint64_t ImmVal = Imm.getZExtValue();
6260 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6261 return TTI::TCC_Free;
6262
// Equality compares whose left operand is known to have at least 32
// trailing zero bits are also treated as free.
6263 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6264 if (Cmp->isEquality()) {
6265 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6266 if (Known.countMinTrailingZeros() >= 32)
6267 return TTI::TCC_Free;
6268 }
6269 }
6270 }
6271 ImmIdx = 1;
6272 break;
6273 case Instruction::And:
6274 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6275 // by using a 32-bit operation with implicit zero extension. Detect such
6276 // immediates here as the normal path expects bit 31 to be sign extended.
6277 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6278 return TTI::TCC_Free;
6279 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
// The value costed here is the small constant 255 (with BMI2) or 65535
// (without), not the 64-bit mask itself.
6280 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6281 Imm.isMask())
6282 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6283 ImmIdx = 1;
6284 break;
6285 case Instruction::Add:
6286 case Instruction::Sub:
6287 // For add/sub, we can use the opposite instruction for INT32_MIN.
6288 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6289 return TTI::TCC_Free;
6290 ImmIdx = 1;
6291 break;
6292 case Instruction::UDiv:
6293 case Instruction::SDiv:
6294 case Instruction::URem:
6295 case Instruction::SRem:
6296 // Division by constant is typically expanded later into a different
6297 // instruction sequence. This completely changes the constants.
6298 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6299 return TTI::TCC_Free;
6300 case Instruction::Mul:
6301 case Instruction::Or:
6302 case Instruction::Xor:
6303 ImmIdx = 1;
6304 break;
6305 // Always return TCC_Free for the shift value of a shift instruction.
6306 case Instruction::Shl:
6307 case Instruction::LShr:
6308 case Instruction::AShr:
6309 if (Idx == 1)
6310 return TTI::TCC_Free;
6311 break;
// The opcodes below leave ImmIdx at ~0U, so they always take the generic
// materialization cost at the end of the function.
6312 case Instruction::Trunc:
6313 case Instruction::ZExt:
6314 case Instruction::SExt:
6315 case Instruction::IntToPtr:
6316 case Instruction::PtrToInt:
6317 case Instruction::BitCast:
6318 case Instruction::PHI:
6319 case Instruction::Call:
6320 case Instruction::Select:
6321 case Instruction::Ret:
6322 case Instruction::Load:
6323 break;
6324 }
6325
// When the immediate sits in the position the instruction can encode, it is
// free as long as its cost does not exceed one basic op per 64-bit chunk.
6326 if (Idx == ImmIdx) {
6327 uint64_t NumConstants = divideCeil(BitSize, 64);
6329 return (Cost <= NumConstants * TTI::TCC_Basic)
6330 ? static_cast<int>(TTI::TCC_Free)
6331 : Cost;
6332 }
6333
6334 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6335}
6336
// Cost of the integer immediate `Imm` when it appears as argument `Idx` of a
// call to intrinsic `IID`. TCC_Free means the immediate should stay in place.
// NOTE(review): the first and last signature lines are missing from this
// listing.
6339 const APInt &Imm, Type *Ty,
6341 assert(Ty->isIntegerTy());
6342
6343 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6344 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6345 // here, so that constant hoisting will ignore this constant.
6346 if (BitSize == 0)
6347 return TTI::TCC_Free;
6348
6349 switch (IID) {
6350 default:
6351 return TTI::TCC_Free;
// Overflow-checking arithmetic: a second operand that fits in a signed 32-bit
// immediate is free.
6352 case Intrinsic::sadd_with_overflow:
6353 case Intrinsic::uadd_with_overflow:
6354 case Intrinsic::ssub_with_overflow:
6355 case Intrinsic::usub_with_overflow:
6356 case Intrinsic::smul_with_overflow:
6357 case Intrinsic::umul_with_overflow:
6358 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6359 return TTI::TCC_Free;
6360 break;
// Stackmap: the first two arguments are always free; later ones are free when
// they fit in 64 bits.
6361 case Intrinsic::experimental_stackmap:
6362 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6363 return TTI::TCC_Free;
6364 break;
// Patchpoint: same rule as stackmap, but with the first four arguments.
6365 case Intrinsic::experimental_patchpoint_void:
6366 case Intrinsic::experimental_patchpoint:
6367 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6368 return TTI::TCC_Free;
6369 break;
6370 }
// Otherwise fall back to the generic materialization cost.
6371 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6372}
6373
// Cost of a control-flow instruction with opcode `Opcode`.
// NOTE(review): the start of the signature and listing line 6377 are missing.
// As shown, the first `return` looks unconditional; presumably the missing
// line is a guard that makes it conditional, so the second `return` is
// reachable -- confirm against the upstream source.
6376 const Instruction *I) const {
// PHI nodes are free; any other control-flow opcode costs one basic op.
6378 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6379 // Branches are assumed to be predicted.
6380 return TTI::TCC_Free;
6381}
6382
6383int X86TTIImpl::getGatherOverhead() const {
6384 // Some CPUs have more overhead for gather. The specified overhead is relative
6385 // to the Load operation. "2" is the number provided by Intel architects. This
6386 // parameter is used for cost estimation of Gather Op and comparison with
6387 // other alternatives.
6388 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6389 // enable gather with a -march.
6390 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6391 return 2;
6392
6393 return 1024;
6394}
6395
6396int X86TTIImpl::getScatterOverhead() const {
6397 if (ST->hasAVX512())
6398 return 2;
6399
6400 return 1024;
6401}
6402
6403// Return an average cost of Gather / Scatter instruction, maybe improved later.
// `Opcode` selects gather (Instruction::Load) versus scatter, `SrcVTy` is the
// vector data type, and `Ptr` is the address operand that is inspected to
// decide whether 32-bit indices suffice.
// NOTE(review): listing lines 6405 (a parameter line; the body uses `CostKind`)
// and 6463 (presumably the guard in front of `return 1;`) are missing.
6404InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6406 Type *SrcVTy, const Value *Ptr,
6407 Align Alignment,
6408 unsigned AddressSpace) const {
6409
6410 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6411 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6412
6413 // Try to reduce index size from 64 bit (default for GEP)
6414 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6415 // operation will use 16 x 64 indices which do not fit in a zmm and needs
6416 // to split. Also check that the base pointer is the same for all lanes,
6417 // and that there's at most one variable index.
6418 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6419 unsigned IndexSize = DL.getPointerSizeInBits();
6420 const GetElementPtrInst *GEP = dyn_cast_or_null<GetElementPtrInst>(Ptr);
// Nothing to narrow if pointers are already below 64 bits or the address
// is not a GEP.
6421 if (IndexSize < 64 || !GEP)
6422 return IndexSize;
6423
6424 unsigned NumOfVarIndices = 0;
6425 const Value *Ptrs = GEP->getPointerOperand();
// A vector base must be a splat, i.e. the same base pointer in every lane.
6426 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6427 return IndexSize;
6428 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6429 if (isa<Constant>(GEP->getOperand(I)))
6430 continue;
6431 Type *IndxTy = GEP->getOperand(I)->getType();
6432 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6433 IndxTy = IndexVTy->getElementType();
// Give up on a 64-bit variable index that is not a sign extension, or on
// a second variable index.
6434 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6435 !isa<SExtInst>(GEP->getOperand(I))) ||
6436 ++NumOfVarIndices > 1)
6437 return IndexSize; // 64
6438 }
6439 return (unsigned)32;
6440 };
6441
6442 // Trying to reduce IndexSize to 32 bits for vector 16.
6443 // By default the IndexSize is equal to pointer size.
6444 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6445 ? getIndexSizeInBits(Ptr, DL)
6446 : DL.getPointerSizeInBits();
6447
6448 auto *IndexVTy = FixedVectorType::get(
6449 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6450 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6451 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
// The operation is split by whichever of the index vector and the data vector
// legalizes into more pieces.
6452 InstructionCost::CostType SplitFactor =
6453 std::max(IdxsLT.first, SrcLT.first).getValue();
6454 if (SplitFactor > 1) {
6455 // Handle splitting of vector of pointers
6456 auto *SplitSrcTy =
6457 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6458 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6459 Alignment, AddressSpace);
6460 }
6461
6462 // If we didn't split, this will be a single gather/scatter instruction.
6464 return 1;
6465
6466 // The gather / scatter cost is given by Intel architects. It is a rough
6467 // number since we are looking at one instruction in a time.
6468 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6469 : getScatterOverhead();
6470 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6471 Alignment, AddressSpace, CostKind);
6472}
6473
6474/// Calculate the cost of Gather / Scatter operation
// The operation is described by `MICA`: gather intrinsics map to a Load
// opcode, everything else to a Store opcode.
// NOTE(review): the signature lines and listing lines 6486, 6490 and 6492 are
// missing, so the second half of each legality test and the statement
// executed when the test succeeds are not visible here.
6478 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
6479 MICA.getID() == Intrinsic::vp_gather;
6480 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
6481 Type *SrcVTy = MICA.getDataType();
6482 const Value *Ptr = MICA.getPointer();
6483 Align Alignment = MICA.getAlignment();
6484 if ((Opcode == Instruction::Load &&
6485 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6487 Align(Alignment)))) ||
6488 (Opcode == Instruction::Store &&
6489 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6491 Align(Alignment)))))
6493
6494 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6495 unsigned AddressSpace = MICA.getAddressSpace();
// On this path the cost comes from the vector gather/scatter model.
6496 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6497 AddressSpace);
6498}
6499
// Returns true when LSR cost `C1` orders before `C2`. The fields are compared
// lexicographically in the order listed in the tuples, starting with Insns.
// NOTE(review): the first signature line is missing from this listing.
6501 const TargetTransformInfo::LSRCost &C2) const {
6502 // X86 specific here are "instruction number 1st priority".
6503 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6504 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6505 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6506 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6507}
6508
// True when the subtarget reports either macro fusion or branch fusion.
// NOTE(review): the signature line is missing from this listing.
6510 return ST->hasMacroFusion() || ST->hasBranchFusion();
6511}
6512
6513static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6514 if (!ST->hasAVX())
6515 return false;
6516
6517 if (ScalarTy->isPointerTy())
6518 return true;
6519
6520 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6521 return true;
6522
6523 if (ScalarTy->isHalfTy() && ST->hasBWI())
6524 return true;
6525
6526 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6527 return true;
6528
6529 if (!ScalarTy->isIntegerTy())
6530 return false;
6531
6532 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6533 return IntWidth == 32 || IntWidth == 64 ||
6534 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6535}
6536
// Reports whether a masked load of `DataTy` is legal on this subtarget.
// NOTE(review): the first signature line is missing from this listing.
6538 unsigned AddressSpace,
6539 TTI::MaskKind MaskKind) const {
6540 Type *ScalarTy = DataTy->getScalarType();
6541
6542 // The backend can't handle a single element vector w/o CFCMOV.
6543 if (isa<VectorType>(DataTy) &&
6544 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6545 return ST->hasCF() &&
6546 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6547
// Everything else is decided by the element type alone.
6548 return isLegalMaskedLoadStore(ScalarTy, ST);
6549}
6550
// Reports whether a masked store of `DataTy` is legal on this subtarget.
// NOTE(review): the first signature line is missing from this listing.
6552 unsigned AddressSpace,
6553 TTI::MaskKind MaskKind) const {
6554 Type *ScalarTy = DataTy->getScalarType();
6555
6556 // The backend can't handle a single element vector w/o CFCMOV.
6557 if (isa<VectorType>(DataTy) &&
6558 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6559 return ST->hasCF() &&
6560 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6561
// Everything else is decided by the element type alone.
6562 return isLegalMaskedLoadStore(ScalarTy, ST);
6563}
6564
6565bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6566 unsigned DataSize = DL.getTypeStoreSize(DataType);
6567 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6568 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6569 // (the equivalent stores only require AVX).
6570 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6571 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6572
6573 return false;
6574}
6575
6576bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6577 unsigned DataSize = DL.getTypeStoreSize(DataType);
6578
6579 // SSE4A supports nontemporal stores of float and double at arbitrary
6580 // alignment.
6581 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6582 return true;
6583
6584 // Besides the SSE4A subtarget exception above, only aligned stores are
6585 // available nontemporaly on any other subtarget. And only stores with a size
6586 // of 4..32 bytes (powers of 2, only) are permitted.
6587 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6588 !isPowerOf2_32(DataSize))
6589 return false;
6590
6591 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6592 // loads require AVX2).
6593 if (DataSize == 32)
6594 return ST->hasAVX();
6595 if (DataSize == 16)
6596 return ST->hasSSE1();
6597 return true;
6598}
6599
// Reports whether a broadcast load is legal: only a fixed two-element vector
// of double on a subtarget with SSE3.
// NOTE(review): the first signature line is missing from this listing.
6601 ElementCount NumElements) const {
6602 // movddup
6603 return ST->hasSSE3() && !NumElements.isScalable() &&
6604 NumElements.getFixedValue() == 2 &&
6605 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6606}
6607
6608bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6609 if (!isa<VectorType>(DataTy))
6610 return false;
6611
6612 if (!ST->hasAVX512())
6613 return false;
6614
6615 // The backend can't handle a single element vector.
6616 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6617 return false;
6618
6619 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6620
6621 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6622 return true;
6623
6624 if (!ScalarTy->isIntegerTy())
6625 return false;
6626
6627 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6628 return IntWidth == 32 || IntWidth == 64 ||
6629 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6630}
6631
// Legality is exactly that of the masked expand-load for the same type and
// alignment.
// NOTE(review): the first signature line is missing from this listing.
6633 Align Alignment) const {
6634 return isLegalMaskedExpandLoad(DataTy, Alignment);
6635}
6636
6637bool X86TTIImpl::supportsGather() const {
6638 // Some CPUs have better gather performance than others.
6639 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6640 // enable gather with a -march.
6641 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6642}
6643
// Returns true when a gather/scatter on vector type `VTy` should be
// scalarized: always for one element, and on AVX-512 for two elements, or for
// four elements when VLX is unavailable.
// NOTE(review): the first signature line is missing from this listing.
6645 Align Alignment) const {
6646 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6647 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6648 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6649 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6650 // Check, maybe the gather/scatter instruction is better in the VariableMask
6651 // case.
6652 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6653 return NumElts == 1 ||
6654 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6655}
6656
// Element-type check shared by masked gather and masked scatter: pointers,
// float, double, and 32- or 64-bit integers are accepted.
// NOTE(review): the first signature line is missing from this listing.
6658 Align Alignment) const {
6659 Type *ScalarTy = DataTy->getScalarType();
6660 if (ScalarTy->isPointerTy())
6661 return true;
6662
6663 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6664 return true;
6665
6666 if (!ScalarTy->isIntegerTy())
6667 return false;
6668
6669 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6670 return IntWidth == 32 || IntWidth == 64;
6671}
6672
6673bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6674 if (!supportsGather() || !ST->preferGather())
6675 return false;
6676 return isLegalMaskedGatherScatter(DataTy, Alignment);
6677}
6678
6679bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6680 unsigned Opcode1,
6681 const SmallBitVector &OpcodeMask) const {
6682 // ADDSUBPS 4xf32 SSE3
6683 // VADDSUBPS 4xf32 AVX
6684 // VADDSUBPS 8xf32 AVX2
6685 // ADDSUBPD 2xf64 SSE3
6686 // VADDSUBPD 2xf64 AVX
6687 // VADDSUBPD 4xf64 AVX2
6688
6689 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6690 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6691 if (!isPowerOf2_32(NumElements))
6692 return false;
6693 // Check the opcode pattern. We apply the mask on the opcode arguments and
6694 // then check if it is what we expect.
6695 for (int Lane : seq<int>(0, NumElements)) {
6696 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6697 // We expect FSub for even lanes and FAdd for odd lanes.
6698 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6699 return false;
6700 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6701 return false;
6702 }
6703 // Now check that the pattern is supported by the target ISA.
6704 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6705 if (ElemTy->isFloatTy())
6706 return ST->hasSSE3() && NumElements % 4 == 0;
6707 if (ElemTy->isDoubleTy())
6708 return ST->hasSSE3() && NumElements % 2 == 0;
6709 return false;
6710}
6711
6712bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6713 // AVX2 doesn't support scatter
6714 if (!ST->hasAVX512() || !ST->preferScatter())
6715 return false;
6716 return isLegalMaskedGatherScatter(DataType, Alignment);
6717}
6718
6719bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6720 EVT VT = TLI->getValueType(DL, DataType);
6721 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6722}
6723
// Treats every FDiv instruction as expensive to execute speculatively.
// NOTE(review): the signature line and the final return statement (listing
// line 6730) are missing, so the result for other opcodes is not visible here.
6725 // FDIV is always expensive, even if it has a very low uop count.
6726 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6727 if (I->getOpcode() == Instruction::FDiv)
6728 return true;
6729
6731}
6732
6733bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6734
// Decides whether `Callee` may be inlined into `Caller` by comparing their
// subtarget feature sets and, if those differ, the ABI of the calls inside
// the callee.
// NOTE(review): the first signature line and listing line 6767 (presumably
// the declaration of the `Types` container filled below) are missing.
6736 const Function *Callee) const {
6737 const TargetMachine &TM = getTLI()->getTargetMachine();
6738
6739 // Work this as a subsetting of subtarget features.
6740 const X86Subtarget &CallerSubtarget = TM.getSubtarget<X86Subtarget>(*Caller);
6741 const X86Subtarget &CalleeSubtarget = TM.getSubtarget<X86Subtarget>(*Callee);
6742 const FeatureBitset &CallerBits = CallerSubtarget.getFeatureBits();
6743 const FeatureBitset &CalleeBits = CalleeSubtarget.getFeatureBits();
6744
6745 // Check whether callee features are a subset of caller features
6746 // (apart from the ignore list).
6747 const FeatureBitset &InlineIgnoreFeatures =
6748 CallerSubtarget.getInlineIgnoreFeatures();
6749 FeatureBitset RealCallerBits = CallerBits & ~InlineIgnoreFeatures;
6750 FeatureBitset RealCalleeBits = CalleeBits & ~InlineIgnoreFeatures;
6751 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6752 return false;
6753
6754 // If the features are not exactly the same (or there is a difference in
6755 // AVX512 register usage), we need to additionally check for calls
6756 // that may become ABI-incompatible as a result of inlining.
// Identical features and identical AVX512 register usage: nothing more to
// check.
6757 if (RealCallerBits == RealCalleeBits &&
6758 CallerSubtarget.useAVX512Regs() == CalleeSubtarget.useAVX512Regs())
6759 return true;
6760
6761 for (const Instruction &I : instructions(Callee)) {
6762 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6763 // Having more target features is fine for inline ASM and intrinsics.
6764 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6765 continue;
6766
// Collect every argument type plus the non-void return type of the call.
6768 for (Value *Arg : CB->args())
6769 Types.push_back(Arg->getType());
6770 if (!CB->getType()->isVoidTy())
6771 Types.push_back(CB->getType());
6772
6773 // Simple types are always ABI compatible.
6774 auto IsSimpleTy = [](Type *Ty) {
6775 return !Ty->isVectorTy() && !Ty->isAggregateType();
6776 };
6777 if (all_of(Types, IsSimpleTy))
6778 continue;
6779
6780 // Do a precise compatibility check.
6781 if (!areTypesABICompatible(Caller, Callee, Types))
6782 return false;
6783 }
6784 }
6785 return true;
6786}
6787
// Returns true when every type in `Types` is passed in the same register type
// under the caller's and the callee's target lowering, using the callee's
// calling convention.
// NOTE(review): the first signature line is missing from this listing.
6789 const Function *Callee,
6790 ArrayRef<Type *> Types) const {
6791 const TargetMachine &TM = getTLI()->getTargetMachine();
6792 const TargetLowering *CallerTLI =
6793 TM.getSubtargetImpl(*Caller)->getTargetLowering();
6794 const TargetLowering *CalleeTLI =
6795 TM.getSubtargetImpl(*Callee)->getTargetLowering();
6796
6797 LLVMContext &Ctx = Caller->getContext();
6798 const DataLayout &DL = Caller->getDataLayout();
6799 CallingConv::ID CC = Callee->getCallingConv();
6800 return all_of(Types, [&](Type *Ty) {
// Break the IR type into its value types, then compare the register type
// each one is assigned on both sides.
6801 SmallVector<EVT> VTs;
6802 ComputeValueVTs(*CallerTLI, DL, Ty, VTs);
6803 return all_of(VTs, [&](EVT VT) {
6804 return CallerTLI->getRegisterTypeForCallingConv(Ctx, CC, VT) ==
6805 CalleeTLI->getRegisterTypeForCallingConv(Ctx, CC, VT);
6806 });
6807 });
6808}
6809
// Builds the memcmp-expansion options for this subtarget: the load budget,
// the loads per block, and the list of load sizes, pushed from widest to
// narrowest.
// NOTE(review): the return-type line and listing line 6812 (presumably the
// declaration of `Options`) are missing.
6811X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6813 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6814 Options.NumLoadsPerBlock = 2;
6815 // All GPR and vector loads can be unaligned.
6816 Options.AllowOverlappingLoads = true;
6817 if (IsZeroCmp) {
6818 // Only enable vector loads for equality comparison. Right now the vector
6819 // version is not as fast for three way compare (see #33329).
6820 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6821 if (PreferredWidth >= 512 && ST->hasAVX512())
6822 Options.LoadSizes.push_back(64);
6823 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6824 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6825 }
// 8-byte loads are only offered in 64-bit mode.
6826 if (ST->is64Bit()) {
6827 Options.LoadSizes.push_back(8);
6828 }
6829 Options.LoadSizes.push_back(4);
6830 Options.LoadSizes.push_back(2);
6831 Options.LoadSizes.push_back(1);
6832 return Options;
6833}
6834
// Forwards to supportsGather(): the answer is true exactly when gather is
// considered usable on this subtarget.
// NOTE(review): the signature line is missing from this listing, so the
// function's name is not visible here.
6836 return supportsGather();
6837}
6838
// Unconditionally answers false.
// NOTE(review): the signature line is missing from this listing, so the
// function's name is not visible here.
6840 return false;
6841}
6842
// Answers true on every subtarget except Atom.
// NOTE(review): the signature line is missing from this listing, so the
// function's name is not visible here.
6844 // TODO: We expect this to be beneficial regardless of arch,
6845 // but there are currently some unexplained performance artifacts on Atom.
6846 // As a temporary solution, disable on Atom.
6847 return !(ST->isAtom());
6848}
6849
// Answers false for the integer min/max vector-reduction intrinsics and true
// for every other intrinsic ID.
// NOTE(review): the signature line is missing from this listing, so the
// function's name and the meaning of the returned flag are not visible here.
6851 switch (II->getIntrinsicID()) {
6852 default:
6853 return true;
6854 case Intrinsic::vector_reduce_smax:
6855 case Intrinsic::vector_reduce_smin:
6856 case Intrinsic::vector_reduce_umax:
6857 case Intrinsic::vector_reduce_umin:
6858 return false;
6859 }
6860}
6861
6862// Get estimation for interleaved load/store operations and strided load.
6863// \p Indices contains indices for strided load.
6864// \p Factor - the factor of interleaving.
6865// AVX-512 provides 3-src shuffles that significantly reduces the cost.
// The result combines the memory-operation cost, the optional mask cost, and
// either a table entry or a generic shuffle estimate.
// NOTE(review): the first signature line and listing lines 6914 (the second
// arm of the conditional argument) and 7017 (presumably the declaration of
// `Cost` in the store path) are missing.
6867 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6868 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6869 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6870 bool UseMaskForGaps) const {
6871 // VecTy for interleave memop is <VF*Factor x Elt>.
6872 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6873 // VecTy = <12 x i32>.
6874
6875 // Calculate the number of memory operations (NumOfMemOps), required
6876 // for load/store the VecTy.
6877 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6878 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6879 unsigned LegalVTSize = LegalVT.getStoreSize();
// Ceiling division: a partially filled legal vector still needs a full op.
6880 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6881
6882 // Get the cost of one memory operation.
6883 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6884 LegalVT.getVectorNumElements());
6885 InstructionCost MemOpCost;
6886 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6887 if (UseMaskedMemOp) {
6888 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6889 : Intrinsic::masked_store;
6890 MemOpCost = getMaskedMemoryOpCost(
6891 {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6892 } else
6893 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6894 CostKind);
6895
// VF is the number of elements per interleaved member; VT is the matching
// machine vector type used as the key for the cost tables below.
6896 unsigned VF = VecTy->getNumElements() / Factor;
6897 MVT VT =
6898 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6899
6900 InstructionCost MaskCost;
6901 if (UseMaskedMemOp) {
// Mark every element of the wide vector that belongs to a requested member.
6902 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6903 for (unsigned Index : Indices) {
6904 assert(Index < Factor && "Invalid index for interleaved memory op");
6905 for (unsigned Elm = 0; Elm < VF; Elm++)
6906 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6907 }
6908
6909 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6910
6911 MaskCost = getReplicationShuffleCost(
6912 I1Type, Factor, VF,
6913 UseMaskForGaps ? DemandedLoadStoreElts
6915 CostKind);
6916
6917 // The Gaps mask is invariant and created outside the loop, therefore the
6918 // cost of creating it is not accounted for here. However if we have both
6919 // a MaskForGaps and some other mask that guards the execution of the
6920 // memory access, we need to account for the cost of And-ing the two masks
6921 // inside the loop.
6922 if (UseMaskForGaps) {
6923 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6924 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6925 }
6926 }
6927
6928 if (Opcode == Instruction::Load) {
6929 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6930 // contain the cost of the optimized shuffle sequence that the
6931 // X86InterleavedAccess pass will generate.
6932 // The cost of loads and stores are computed separately from the table.
6933
6934 // X86InterleavedAccess support only the following interleaved-access group.
6935 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6936 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6937 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6938 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6939 };
6940
6941 if (const auto *Entry =
6942 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6943 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6944 // If an entry does not exist, fall back to the default implementation.
6945
6946 // Kind of shuffle depends on number of loaded values.
6947 // If we load the entire data in one register, we can use a 1-src shuffle.
6948 // Otherwise, we'll merge 2 sources in each operation.
6949 TTI::ShuffleKind ShuffleKind =
6950 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6951
6952 InstructionCost ShuffleCost = getShuffleCost(
6953 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6954
// An empty Indices list is treated as "all Factor members are loaded".
6955 unsigned NumOfLoadsInInterleaveGrp =
6956 Indices.size() ? Indices.size() : Factor;
6957 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6958 VecTy->getNumElements() / Factor);
6959 InstructionCost NumOfResults =
6960 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6961
6962 // About a half of the loads may be folded in shuffles when we have only
6963 // one result. If we have more than one result, or the loads are masked,
6964 // we do not fold loads at all.
6965 unsigned NumOfUnfoldedLoads =
6966 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6967
6968 // Get a number of shuffle operations per result.
6969 unsigned NumOfShufflesPerResult =
6970 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6971
6972 // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
6973 // When we have more than one destination, we need additional instructions
6974 // to keep sources.
6975 InstructionCost NumOfMoves = 0;
6976 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6977 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6978
6979 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6980 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6981 NumOfMoves;
6982
6983 return Cost;
6984 }
6985
6986 // Store.
6987 assert(Opcode == Instruction::Store &&
6988 "Expected Store Instruction at this point");
6989 // X86InterleavedAccess support only the following interleaved-access group.
6990 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6991 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6992 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6993 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6994
6995 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6996 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6997 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6998 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6999 };
7000
7001 if (const auto *Entry =
7002 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
7003 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
7004 // If an entry does not exist, fall back to the default implementation.
7005
7006 // There are no strided stores at the moment, and a store can't be folded
7007 // into a shuffle.
7008 unsigned NumOfSources = Factor; // The number of values to be merged.
7009 InstructionCost ShuffleCost =
7010 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
7011 CostKind, 0, nullptr);
7012 unsigned NumOfShufflesPerStore = NumOfSources - 1;
7013
7014 // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
7015 // We need additional instructions to keep sources.
7016 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
7018 MaskCost +
7019 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
7020 NumOfMoves;
7021 return Cost;
7022}
7023
7025 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
7026 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
7027 bool UseMaskForCond, bool UseMaskForGaps) const {
// Estimates the cost of an interleaved load or store group of type BaseTy
// with the given interleave Factor.
//
// Outline of the visible logic:
//  * AVX-512 subtargets with a supported element type take a dedicated path.
//  * Masked groups (UseMaskForCond / UseMaskForGaps) defer to BaseT.
//  * Types whose legalized form is not a vector, or whose per-member vector
//    type <VF x iN> is not a simple value type, defer to BaseT.
//  * Otherwise the shuffle cost is looked up in the per-ISA tables below,
//    keyed by (Factor, <VF x iN>), and added to the cost of the memory
//    operation itself.
//  * If no table entry matches, the result comes from BaseT.
//
// NOTE(review): the line carrying this function's return type and name
// (listing line 7024) is not present in this excerpt; confirm the signature
// against the original file.
7028 auto *VecTy = cast<FixedVectorType>(BaseTy);
7029
// Element types accepted for the AVX-512 path: 32/64-bit scalars and
// pointers are always accepted; 8/16-bit integers and half require BWI;
// bfloat requires BF16.
7030 auto isSupportedOnAVX512 = [&](Type *VecTy) {
7031 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
7032 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
7033 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
7034 return true;
7035 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
7036 return ST->hasBWI();
7037 if (EltTy->isBFloatTy())
7038 return ST->hasBF16();
7039 return false;
7040 };
7041 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
// NOTE(review): listing line 7042, which presumably names the AVX-512 helper
// being called with the arguments below, is missing from this excerpt.
7043 Opcode, VecTy, Factor, Indices, Alignment,
7044 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
7045
// The lookup tables below do not model masking, so masked groups are costed
// by the generic implementation.
7046 if (UseMaskForCond || UseMaskForGaps)
7047 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7048 Alignment, AddressSpace, CostKind,
7049 UseMaskForCond, UseMaskForGaps);
7050
7051 // Get estimation for interleaved load/store operations for SSE-AVX2.
7052 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
7053 // computing the cost using a generic formula as a function of generic
7054 // shuffles. We therefore use a lookup table instead, filled according to
7055 // the instruction sequences that codegen currently generates.
7056
7057 // VecTy for interleave memop is <VF*Factor x Elt>.
7058 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
7059 // VecTy = <12 x i32>.
7060 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
7061
7062 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
7063 // the VF=2, while v2i128 is an unsupported MVT vector type
7064 // (see MachineValueType.h::getVectorVT()).
7065 if (!LegalVT.isVector())
7066 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7067 Alignment, AddressSpace, CostKind);
7068
// VF is the element count of one member of the group.
7069 unsigned VF = VecTy->getNumElements() / Factor;
7070 Type *ScalarTy = VecTy->getElementType();
7071 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
7072 if (!ScalarTy->isIntegerTy())
7073 ScalarTy =
7074 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
7075
7076 // Get the cost of all the memory operations.
7077 // FIXME: discount dead loads.
7078 InstructionCost MemOpCosts =
7079 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
7080
// The tables are keyed by the per-member type <VF x iN>; without a simple
// value type for it there is nothing to look up.
7081 auto *VT = FixedVectorType::get(ScalarTy, VF);
7082 EVT ETy = TLI->getValueType(DL, VT);
7083 if (!ETy.isSimple())
7084 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7085 Alignment, AddressSpace, CostKind);
7086
7087 // TODO: Complete for other data-types and strides.
7088 // Each combination of Stride, element bit width and VF results in a different
7089 // sequence; The cost tables are therefore accessed with:
7090 // Factor (stride) and VectorType=VFxiN.
7091 // The Cost accounts only for the shuffle sequence;
7092 // The cost of the loads/stores is accounted for separately.
7093 //
7094 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
7095 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
7096 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
7097 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
7098 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
7099 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
7100
7101 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
7102 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
7103 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
7104
7105 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
7106 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
7107 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
7108
7109 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
7110 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
7111 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
7112 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
7113
7114 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
7115 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
7116 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
7117 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
7118 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
7119
7120 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
7121 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
7122 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
7123 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
7124 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
7125
7126 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
7127 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
7128 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
7129 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
7130 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
7131
7132 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
7133 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
7134 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
7135 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
7136
7137 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
7138 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
7139 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
7140 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
7141 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
7142
7143 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
7144 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
7145 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
7146 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
7147 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
7148
7149 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
7150 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
7151 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
7152 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
7153 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
7154
7155 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
7156 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
7157 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
7158 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
7159
7160 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
7161 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
7162 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
7163 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
7164 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
7165
7166 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
7167 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
7168 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
7169 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
7170 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
7171
7172 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
7173 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
7174 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
7175 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
7176
7177 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
7178 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
7179 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
7180
7181 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
7182 };
7183
7184 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
7185 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
7186 };
7187
7188 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7189 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7190 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7191
7192 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7193 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7194
7195 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7196 };
7197
7198 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7199 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7200 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7201
7202 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7203 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7204 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7205
7206 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7207 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7208 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7209 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7210
7211 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7212 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7213 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7214 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7215 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7216
7217 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7218 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7219 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7220 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7221 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7222
7223 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7224 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7225 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7226 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7227 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7228
7229 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7230 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7231 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7232 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7233 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7234
7235 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7236 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7237 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7238 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7239
7240 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7241 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7242 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7243 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7244 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7245
7246 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7247 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7248 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7249 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7250 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7251
7252 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7253 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7254 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7255 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7256 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7257
7258 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7259 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7260 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7261 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7262
7263 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7264 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7265 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7266 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7267 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7268
7269 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7270 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7271 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7272 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7273 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7274
7275 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7276 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7277 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7278 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7279
7280 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7281 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7282 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7283 };
7284
7285 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7286 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7287 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7288 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7289
7290 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7291 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7292
7293 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7294 };
7295
// Tables are consulted from the newest ISA level down; the first table that
// has an entry for (Factor, <VF x iN>) decides the result.
7296 if (Opcode == Instruction::Load) {
// For loads, the table's shuffle cost is scaled by Indices.size() / Factor
// (rounded up) before the memory-op cost is added.
7297 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7298 MemOpCosts](const CostTblEntry *Entry) {
7299 // NOTE: this is just an approximation!
7300 // It can over/under -estimate the cost!
7301 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7302 };
7303
7304 if (ST->hasAVX2())
7305 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7306 ETy.getSimpleVT()))
7307 return GetDiscountedCost(Entry);
7308
7309 if (ST->hasSSSE3())
7310 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7311 ETy.getSimpleVT()))
7312 return GetDiscountedCost(Entry);
7313
7314 if (ST->hasSSE2())
7315 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7316 ETy.getSimpleVT()))
7317 return GetDiscountedCost(Entry);
7318 } else {
7319 assert(Opcode == Instruction::Store &&
7320 "Expected Store Instruction at this point");
7321 assert((!Indices.size() || Indices.size() == Factor) &&
7322 "Interleaved store only supports fully-interleaved groups.");
// Stores are always fully interleaved (asserted above), so the table cost is
// used without any discount.
7323 if (ST->hasAVX2())
7324 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7325 ETy.getSimpleVT()))
7326 return MemOpCosts + Entry->Cost;
7327
7328 if (ST->hasSSE2())
7329 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7330 ETy.getSimpleVT()))
7331 return MemOpCosts + Entry->Cost;
7332 }
7333
// No table entry matched: use the generic implementation.
7334 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7335 Alignment, AddressSpace, CostKind,
7336 UseMaskForCond, UseMaskForGaps);
7337}
7338
7340 StackOffset BaseOffset,
7341 bool HasBaseReg, int64_t Scale,
7342 unsigned AddrSpace) const {
// Reports the extra cost of an addressing mode that uses a scaled index
// register. When the described mode is legal for the target, the result is
// 1 if Scale is non-zero and 0 otherwise.
//
// NOTE(review): the line with this function's return type, name and leading
// parameters (listing line 7339) is not present in this excerpt; Ty and
// BaseGV used below are presumably among those parameters.
7343 // Scaling factors are not free at all.
7344 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7345 // will take 2 allocations in the out of order engine instead of 1
7346 // for plain addressing mode, i.e. inst (reg1).
7347 // E.g.,
7348 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7349 // Requires two allocations (one for the load, one for the computation)
7350 // whereas:
7351 // vaddps (%rsi), %ymm0, %ymm1
7352 // Requires just 1 allocation, i.e., freeing allocations for other operations
7353 // and having fewer micro operations to execute.
7354 //
7355 // For some X86 architectures, this is even worse because for instance for
7356 // stores, the complex addressing mode forces the instruction to use the
7357 // "load" ports instead of the dedicated "store" port.
7358 // E.g., on Haswell:
7359 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7360 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
// NOTE(review): the declaration of AM (listing line 7361) is missing from
// this excerpt. The assignments below fill in an addressing-mode description
// from the arguments, splitting BaseOffset into fixed and scalable parts.
7362 AM.BaseGV = BaseGV;
7363 AM.BaseOffs = BaseOffset.getFixed();
7364 AM.HasBaseReg = HasBaseReg;
7365 AM.Scale = Scale;
7366 AM.ScalableOffset = BaseOffset.getScalable();
7367 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7368 // Scale represents reg2 * scale, thus account for 1
7369 // as soon as we use a second register.
7370 return AM.Scale != 0;
// NOTE(review): the fall-through return for an illegal addressing mode
// (listing line 7371) is missing from this excerpt.
7372}
7373
// Returns a fixed penalty of 14; the value does not depend on the subtarget.
// NOTE(review): the signature line (listing line 7374) is missing from this
// excerpt, so the function name and return type could not be checked here.
7375 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7376 return 14;
7377}
7378
// Decides, from the scalar bit width of Ty and the subtarget features,
// whether shifting a vector by a uniform (scalar) amount is notably cheaper
// than shifting by a general vector. Returns false when the subtarget has
// variable vector shifts for this element width, true otherwise.
// NOTE(review): the signature line (listing line 7379) is missing from this
// excerpt; Ty is presumably the function's only parameter.
7380 unsigned Bits = Ty->getScalarSizeInBits();
7381
7382 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7383 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7384 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7385 return false;
7386
7387 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7388 // shifts just as cheap as scalar ones.
7389 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7390 return false;
7391
7392 // AVX512BW has shifts such as vpsllvw.
7393 if (ST->hasBWI() && Bits == 16)
7394 return false;
7395
7396 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7397 // fully general vector.
7398 return true;
7399}
7400
7401unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7402 Type *ScalarValTy, Align Alignment,
7403 unsigned AddrSpace) const {
7404 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7405 return 4;
7406 }
7407 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy, Alignment,
7408 AddrSpace);
7409}
7410
7412 SmallVectorImpl<Use *> &Ops) const {
// Collects in Ops the operand uses of instruction I that are worth sinking
// next to I so that instruction selection can see the combined pattern, and
// returns true when at least one such use was recorded.
//
// Patterns handled, in order:
//  * `and` with a `not` operand (or a splatted `not`), to form andn;
//  * 64-bit element vector `mul` whose operands are sign/zero-extended in
//    register from 32 bits (PMULDQ / PMULUDQ);
//  * vector shifts and funnel shifts whose shift amount is a splat shuffle.
//
// NOTE(review): the first line of the signature (listing line 7411) is
// missing from this excerpt; I is presumably its first parameter.
7413 using namespace llvm::PatternMatch;
7414
// The andn forms need BMI for scalars; for vector types SSE2 is enough.
7415 if (I->getOpcode() == Instruction::And &&
7416 (ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
7417 for (auto &Op : I->operands()) {
7418 // (and X, (not Y)) -> (andn X, Y)
7419 if (match(Op.get(), m_Not(m_Value())) && !I->getType()->isIntegerTy(8)) {
7420 Ops.push_back(&Op);
7421 return true;
7422 }
7423 // (and X, (splat (not Y))) -> (andn X, (splat Y))
7424 if (match(Op.get(),
// NOTE(review): the first line of this matcher expression (listing line 7425)
// is missing from this excerpt.
7426 m_Value(), m_ZeroMask()))) {
// Record the whole chain: the not, the insertelement that uses it, and the
// splat operand itself.
7427 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7428 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7429 Ops.push_back(&Not);
7430 Ops.push_back(&InsertElt);
7431 Ops.push_back(&Op);
7432 return true;
7433 }
7434 }
7435 }
7436
// Everything below applies to fixed-width vector instructions only.
7437 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7438 if (!VTy)
7439 return false;
7440
7441 if (I->getOpcode() == Instruction::Mul &&
7442 VTy->getElementType()->isIntegerTy(64)) {
7443 for (auto &Op : I->operands()) {
7444 // Make sure we are not already sinking this operand
7445 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7446 continue;
7447
7448 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7449 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7450 if (ST->hasSSE41() &&
7451 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7452 m_SpecificInt(32)))) {
// Sink both the inner shl and the outer ashr.
7453 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7454 Ops.push_back(&Op);
7455 } else if (ST->hasSSE2() &&
7456 match(Op.get(),
7457 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7458 Ops.push_back(&Op);
7459 }
7460 }
7461
7462 return !Ops.empty();
7463 }
7464
7465 // A uniform shift amount in a vector shift or funnel shift may be much
7466 // cheaper than a generic variable vector shift, so make that pattern visible
7467 // to SDAG by sinking the shuffle instruction next to the shift.
// Operand index of the shift amount: 1 for plain shifts, 2 for fshl/fshr,
// -1 when I is neither.
7468 int ShiftAmountOpNum = -1;
7469 if (I->isShift())
7470 ShiftAmountOpNum = 1;
7471 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7472 if (II->getIntrinsicID() == Intrinsic::fshl ||
7473 II->getIntrinsicID() == Intrinsic::fshr)
7474 ShiftAmountOpNum = 2;
7475 }
7476
7477 if (ShiftAmountOpNum == -1)
7478 return false;
7479
// Only sink when the amount is a splat shuffle and a shift by scalar is
// actually cheaper on this subtarget.
7480 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7481 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7482 isVectorShiftByScalarCheap(I->getType())) {
7483 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7484 return true;
7485 }
7486
7487 return false;
7488}
7489
// Returns false as soon as a caller of F is found whose subtarget disagrees
// with this one on EGPR support, and true when no such caller exists.
// NOTE(review): the signature line (listing line 7490) is missing from this
// excerpt; F is presumably the function's parameter.
7491 bool HasEGPR = ST->hasEGPR();
7492 const TargetMachine &TM = getTLI()->getTargetMachine();
7493
7494 for (User *U : F.users()) {
// NOTE(review): the definition of CB (listing line 7495) is missing from this
// excerpt. Users for which CB is null, or whose called operand is not F
// itself, are skipped.
7496 if (!CB || CB->getCalledOperand() != &F)
7497 continue;
// Compare the EGPR setting of the function containing the call with ours.
7498 Function *CallerFunc = CB->getFunction();
7499 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7500 return false;
7501 }
7502
7503 return true;
7504}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
virtual const TargetLowering * getTargetLowering() const
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:287
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:397
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same element type.
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3040
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
This is an optimization pass for GlobalISel generic memory operations.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
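Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.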
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:256
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55