LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
60#include <optional>
61
62using namespace llvm;
63
64#define DEBUG_TYPE "x86tti"
65
66//===----------------------------------------------------------------------===//
67//
68// X86 cost model.
69//
70//===----------------------------------------------------------------------===//
71
72// Helper struct to store/access costs for each cost kind.
73// TODO: Move this to allow other targets to use it?
75 unsigned RecipThroughputCost = ~0U;
76 unsigned LatencyCost = ~0U;
77 unsigned CodeSizeCost = ~0U;
78 unsigned SizeAndLatencyCost = ~0U;
79
80 std::optional<unsigned>
82 unsigned Cost = ~0U;
83 switch (Kind) {
86 break;
89 break;
92 break;
95 break;
96 }
97 if (Cost == ~0U)
98 return std::nullopt;
99 return Cost;
100 }
101};
104
106X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
107 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
108 // TODO: Currently the __builtin_popcount() implementation using SSE3
109 // instructions is inefficient. Once the problem is fixed, we should
110 // call ST->hasSSE3() instead of ST->hasPOPCNT().
111 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
112}
113
114std::optional<unsigned> X86TTIImpl::getCacheSize(
116 switch (Level) {
118 // - Penryn
119 // - Nehalem
120 // - Westmere
121 // - Sandy Bridge
122 // - Ivy Bridge
123 // - Haswell
124 // - Broadwell
125 // - Skylake
126 // - Kabylake
127 return 32 * 1024; // 32 KiB
129 // - Penryn
130 // - Nehalem
131 // - Westmere
132 // - Sandy Bridge
133 // - Ivy Bridge
134 // - Haswell
135 // - Broadwell
136 // - Skylake
137 // - Kabylake
138 return 256 * 1024; // 256 KiB
139 }
140
141 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
142}
143
144std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
146 // - Penryn
147 // - Nehalem
148 // - Westmere
149 // - Sandy Bridge
150 // - Ivy Bridge
151 // - Haswell
152 // - Broadwell
153 // - Skylake
154 // - Kabylake
155 switch (Level) {
157 [[fallthrough]];
159 return 8;
160 }
161
162 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
163}
164
166
168 return Vector ? VectorClass
169 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
170 : GPRClass;
171}
172
173unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
174 if (ClassID == VectorClass && !ST->hasSSE1())
175 return 0;
176
177 if (!ST->is64Bit())
178 return 8;
179
180 if ((ClassID == GPRClass && ST->hasEGPR()) ||
181 (ClassID != GPRClass && ST->hasAVX512()))
182 return 32;
183
184 return 16;
185}
186
188 if (!ST->hasCF())
189 return false;
190 if (!Ty)
191 return true;
192 // Conditional faulting is supported by CFCMOV, which only accepts
193 // 16/32/64-bit operands.
194 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
195 // profitable.
196 auto *VTy = dyn_cast<FixedVectorType>(Ty);
197 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
198 return false;
199 auto *ScalarTy = Ty->getScalarType();
200 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
201 default:
202 return false;
203 case 16:
204 case 32:
205 case 64:
206 return true;
207 }
208}
209
212 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
213 switch (K) {
215 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
217 if (ST->hasAVX512() && PreferVectorWidth >= 512)
218 return TypeSize::getFixed(512);
219 if (ST->hasAVX() && PreferVectorWidth >= 256)
220 return TypeSize::getFixed(256);
221 if (ST->hasSSE1() && PreferVectorWidth >= 128)
222 return TypeSize::getFixed(128);
223 return TypeSize::getFixed(0);
225 return TypeSize::getScalable(0);
226 }
227
228 llvm_unreachable("Unsupported register kind");
229}
230
235
237 bool HasUnorderedReductions) const {
238 // If the loop will not be vectorized, don't interleave the loop.
239 // Let regular unroll to unroll the loop, which saves the overflow
240 // check and memory check cost.
241 if (VF.isScalar())
242 return 1;
243
244 if (ST->isAtom())
245 return 1;
246
247 // Sandybridge and Haswell have multiple execution ports and pipelined
248 // vector units.
249 if (ST->hasAVX())
250 return 4;
251
252 return 2;
253}
254
256 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
258 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
259
260 // vXi8 multiplications are always promoted to vXi16.
261 // Sub-128-bit types can be extended/packed more efficiently.
262 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
263 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
264 Type *WideVecTy =
266 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
268 CostKind) +
269 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
271 CostKind) +
272 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
273 }
274
275 // Legalize the type.
276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
277
278 int ISD = TLI->InstructionOpcodeToISD(Opcode);
279 assert(ISD && "Invalid opcode");
280
281 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
282 (LT.second.getScalarType() == MVT::i32 ||
283 LT.second.getScalarType() == MVT::i64)) {
284 // Check if the operands can be represented as a smaller datatype.
285 bool Op1Signed = false, Op2Signed = false;
286 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
287 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
288 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
289 bool SignedMode = Op1Signed || Op2Signed;
290
291 // If both vXi32 are representable as i15 and at least one is constant,
292 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
293 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
294 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
295 LT.second.getScalarType() == MVT::i32) {
296 bool Op1Constant =
297 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
298 bool Op2Constant =
299 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
300 bool Op1Sext = isa<SExtInst>(Args[0]) &&
301 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
302 bool Op2Sext = isa<SExtInst>(Args[1]) &&
303 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
304
305 bool IsZeroExtended = !Op1Signed || !Op2Signed;
306 bool IsConstant = Op1Constant || Op2Constant;
307 bool IsSext = Op1Sext || Op2Sext;
308 if (IsConstant || IsZeroExtended || IsSext)
309 LT.second =
310 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
311 }
312
313 // Check if the vXi32 operands can be shrunk into a smaller datatype.
314 // This should match the codegen from reduceVMULWidth.
315 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
316 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
317 if (OpMinSize <= 7)
318 return LT.first * 3; // pmullw/sext
319 if (!SignedMode && OpMinSize <= 8)
320 return LT.first * 3; // pmullw/zext
321 if (OpMinSize <= 15)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 if (!SignedMode && OpMinSize <= 16)
324 return LT.first * 5; // pmullw/pmulhw/pshuf
325 }
326
327 // If both vXi64 are representable as (unsigned) i32, then we can perform
328 // the multiply with a single PMULUDQ instruction.
329 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
330 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
331 ISD = X86ISD::PMULUDQ;
332 }
333
334 // Vector multiply by pow2 will be simplified to shifts.
335 // Vector multiply by -pow2 will be simplified to shifts/negates.
336 if (ISD == ISD::MUL && Op2Info.isConstant() &&
337 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
339 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
340 Op1Info.getNoProps(), Op2Info.getNoProps());
341 if (Op2Info.isNegatedPowerOf2())
342 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
343 return Cost;
344 }
345
346 // On X86, vector signed division by a constant power-of-two is
347 // normally expanded to the sequence SRA + SRL + ADD + SRA.
348 // The OperandValue properties may not be the same as that of the previous
349 // operation; conservatively assume OP_None.
350 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
351 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
353 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
358 Op1Info.getNoProps(), Op2Info.getNoProps());
359
360 if (ISD == ISD::SREM) {
361 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
362 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
365 Op2Info.getNoProps());
366 }
367
368 return Cost;
369 }
370
371 // Vector unsigned division/remainder will be simplified to shifts/masks.
372 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
373 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
374 if (ISD == ISD::UDIV)
375 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
376 Op1Info.getNoProps(), Op2Info.getNoProps());
377 // UREM
378 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
379 Op1Info.getNoProps(), Op2Info.getNoProps());
380 }
381
382 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
383 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
391 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
392 };
393
394 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
395 if (const auto *Entry =
396 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
397 if (auto KindCost = Entry->Cost[CostKind])
398 return LT.first * *KindCost;
399
400 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
401 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
402 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
403 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
404 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
405 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
406 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
407 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
408 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
409 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
410
411 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
412 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
413 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
415 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
416 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
417 };
418
419 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
420 if (const auto *Entry =
421 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
422 if (auto KindCost = Entry->Cost[CostKind])
423 return LT.first * *KindCost;
424
425 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
426 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
427 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
428 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
429
430 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
431 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
432 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
433
434 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
435 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
436 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
437 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
438 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
439 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
440
441 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
442 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
443 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
444 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
445 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
446 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
447 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
448
449 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
450 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
451 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
452 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
453 };
454
455 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
456 if (const auto *Entry =
457 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
458 if (auto KindCost = Entry->Cost[CostKind])
459 return LT.first * *KindCost;
460
461 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
462 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
463 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
464 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
465 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
466 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
467 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
468
469 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
470 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
471 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
472 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
473 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
474 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
475
476 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
477 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
478 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
479 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
480 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
481 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
482
483 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
484 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
485 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
486 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
487 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
488 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
489
490 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
491 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
492 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
493 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
494 };
495
496 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
497 if (const auto *Entry =
498 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
499 if (auto KindCost = Entry->Cost[CostKind])
500 return LT.first * *KindCost;
501
502 static const CostKindTblEntry AVXUniformConstCostTable[] = {
503 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
504 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
505 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
506 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
507 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
508 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
509
510 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
511 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
512 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
513 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
514 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
515 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
516
517 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
518 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
519 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
520 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
521 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
522 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
523
524 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
525 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
526 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
527 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
528 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
529 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
530
531 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
532 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
533 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
534 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
535 };
536
537 // XOP has faster vXi8 shifts.
538 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
539 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
540 if (const auto *Entry =
541 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
542 if (auto KindCost = Entry->Cost[CostKind])
543 return LT.first * *KindCost;
544
545 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
546 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
547 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
548 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
549
550 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
551 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
552 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
553
554 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
555 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
556 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
557
558 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
559 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
560 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
561
562 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
563 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
564 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
565 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
566 };
567
568 // XOP has faster vXi8 shifts.
569 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
570 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
571 if (const auto *Entry =
572 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
573 if (auto KindCost = Entry->Cost[CostKind])
574 return LT.first * *KindCost;
575
576 static const CostKindTblEntry AVX512BWConstCostTable[] = {
577 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
580 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
581
582 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
583 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
584 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
585 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
586 };
587
588 if (Op2Info.isConstant() && ST->hasBWI())
589 if (const auto *Entry =
590 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
591 if (auto KindCost = Entry->Cost[CostKind])
592 return LT.first * *KindCost;
593
594 static const CostKindTblEntry AVX512ConstCostTable[] = {
595 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
598 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
599
600 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
601 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
602 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
603 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
604
605 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
606 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
607 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
608 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
609 };
610
611 if (Op2Info.isConstant() && ST->hasAVX512())
612 if (const auto *Entry =
613 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
614 if (auto KindCost = Entry->Cost[CostKind])
615 return LT.first * *KindCost;
616
617 static const CostKindTblEntry AVX2ConstCostTable[] = {
618 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
621 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
622
623 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
624 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
625 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
626 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
627
628 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
629 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
630 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
631 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
632 };
633
634 if (Op2Info.isConstant() && ST->hasAVX2())
635 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
636 if (auto KindCost = Entry->Cost[CostKind])
637 return LT.first * *KindCost;
638
639 static const CostKindTblEntry AVXConstCostTable[] = {
640 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
643 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
644
645 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
646 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
647 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
648 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
649
650 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
651 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
652 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
653 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
654 };
655
656 if (Op2Info.isConstant() && ST->hasAVX())
657 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
658 if (auto KindCost = Entry->Cost[CostKind])
659 return LT.first * *KindCost;
660
661 static const CostKindTblEntry SSE41ConstCostTable[] = {
662 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
663 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
664 };
665
666 if (Op2Info.isConstant() && ST->hasSSE41())
667 if (const auto *Entry =
668 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
669 if (auto KindCost = Entry->Cost[CostKind])
670 return LT.first * *KindCost;
671
672 static const CostKindTblEntry SSE2ConstCostTable[] = {
673 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
676 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
677
678 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
679 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
680 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
681 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
682
683 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
684 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
685 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
686 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
687 };
688
689 if (Op2Info.isConstant() && ST->hasSSE2())
690 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
691 if (auto KindCost = Entry->Cost[CostKind])
692 return LT.first * *KindCost;
693
694 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
695 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
696 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
697 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
698 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
699 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
700 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
701 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
702 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
703 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
704
705 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
706 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
707 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
708 };
709
710 if (ST->hasBWI() && Op2Info.isUniform())
711 if (const auto *Entry =
712 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
713 if (auto KindCost = Entry->Cost[CostKind])
714 return LT.first * *KindCost;
715
716 static const CostKindTblEntry AVX512UniformCostTable[] = {
717 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
718 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
719 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
720
721 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
722 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
723 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
724
725 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
726 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
727 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
728 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
729 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
730 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
731 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
732 };
733
734 if (ST->hasAVX512() && Op2Info.isUniform())
735 if (const auto *Entry =
736 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
737 if (auto KindCost = Entry->Cost[CostKind])
738 return LT.first * *KindCost;
739
740 static const CostKindTblEntry AVX2UniformCostTable[] = {
741 // Uniform splats are cheaper for the following instructions.
742 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
743 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
744 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
745 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
746 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
747 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
748
749 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
750 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
751 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
752 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
753 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
754 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
755
756 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
757 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
758 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
759 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
760 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
761 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
762
763 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
764 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
765 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
766 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
767 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
768 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
769 };
770
771 if (ST->hasAVX2() && Op2Info.isUniform())
772 if (const auto *Entry =
773 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
774 if (auto KindCost = Entry->Cost[CostKind])
775 return LT.first * *KindCost;
776
777 static const CostKindTblEntry AVXUniformCostTable[] = {
778 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
779 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
780 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
781 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
782 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
783 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
784
785 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
786 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
787 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
788 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
789 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
790 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
791
792 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
793 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
794 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
795 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
796 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
797 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
798
799 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
800 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
801 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
802 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
803 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
804 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
805 };
806
807 // XOP has faster vXi8 shifts.
808 if (ST->hasAVX() && Op2Info.isUniform() &&
809 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
810 if (const auto *Entry =
811 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
812 if (auto KindCost = Entry->Cost[CostKind])
813 return LT.first * *KindCost;
814
815 static const CostKindTblEntry SSE2UniformCostTable[] = {
816 // Uniform splats are cheaper for the following instructions.
817 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
818 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
819 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
820
821 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
822 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
823 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
824
825 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
826 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
827 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
828
829 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
830 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
831 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
832 };
833
834 if (ST->hasSSE2() && Op2Info.isUniform() &&
835 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
836 if (const auto *Entry =
837 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
838 if (auto KindCost = Entry->Cost[CostKind])
839 return LT.first * *KindCost;
840
841 static const CostKindTblEntry AVX512DQCostTable[] = {
842 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
843 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
844 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
845 };
846
847 // Look for AVX512DQ lowering tricks for custom cases.
848 if (ST->hasDQI())
849 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
850 if (auto KindCost = Entry->Cost[CostKind])
851 return LT.first * *KindCost;
852
853 static const CostKindTblEntry AVX512BWCostTable[] = {
854 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
855 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
856 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
857 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
858 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
859 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
860 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
861 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
862 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
863
864 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
865 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
866 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
867 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
868 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
869 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
870 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
871 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
872 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
873
874 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
875 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
876
877 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
878 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
879 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
880 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
881
882 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
883 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
884
885 { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
886 { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
887 { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
888 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
889
890 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
891 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
892 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
893 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
894 };
895
896 // Look for AVX512BW lowering tricks for custom cases.
897 if (ST->hasBWI())
898 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
899 if (auto KindCost = Entry->Cost[CostKind])
900 return LT.first * *KindCost;
901
902 static const CostKindTblEntry AVX512CostTable[] = {
903 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
904 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
905 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
906
907 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence.
908 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
909 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
910
911 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
912 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
913 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
914 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
915 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
916 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
917 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
918 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
919 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
920
921 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
922 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
923 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
924 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
925 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
926 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
927 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
928 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
929 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
930
931 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
932 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
933
934 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
935 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
936
937 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
938 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
939 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
940 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
941
942 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
943 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
944 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
945 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
946
947 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
948 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
949 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
950 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
951
952 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
953 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
954 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
955 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
956 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
957
958 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
959
960 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
961 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
962 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
963 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
964 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
965 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
966 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
967 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
968 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
969
970 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
971 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
972 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
973 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
974
975 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
976 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
977 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
978 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
979 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
980 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
981 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
982 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
983 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
984
985 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
986 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
987 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
988 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
989 };
990
991 if (ST->hasAVX512())
992 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
993 if (auto KindCost = Entry->Cost[CostKind])
994 return LT.first * *KindCost;
995
996 static const CostKindTblEntry AVX2ShiftCostTable[] = {
997 // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them
998 // custom in order to detect the cases where the shift amount is a scalar one.
999 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1000 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1001 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
1002 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1003 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1004 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
1005 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
1006 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
1007 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
1008 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
1009 };
1010
1011 if (ST->hasAVX512()) {
1012 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
1013 // On AVX512, a packed v32i16 shift left by a constant build_vector
1014 // is lowered into a vector multiply (vpmullw).
1015 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1016 Op1Info.getNoProps(), Op2Info.getNoProps());
1017 }
1018
1019 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
1020 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
1021 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
1022 Op2Info.isConstant())
1023 // On AVX2, a packed v16i16 shift left by a constant build_vector
1024 // is lowered into a vector multiply (vpmullw).
1025 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1026 Op1Info.getNoProps(), Op2Info.getNoProps());
1027
1028 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
1029 if (auto KindCost = Entry->Cost[CostKind])
1030 return LT.first * *KindCost;
1031 }
1032
1033 static const CostKindTblEntry XOPShiftCostTable[] = {
1034 // 128bit shifts take 1cy, but right shifts require negation beforehand.
1035 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1036 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1037 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1038 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1039 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1040 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1041 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1042 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1043 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1044 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1045 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1046 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1047 // 256bit shifts require splitting if AVX2 didn't catch them above.
1048 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1049 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1050 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1051 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1052 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1053 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1054 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1055 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1056 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1057 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1058 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1059 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1060 };
1061
1062 // Look for XOP lowering tricks.
1063 if (ST->hasXOP()) {
1064 // If the right shift is constant then we'll fold the negation so
1065 // it's as cheap as a left shift.
1066 int ShiftISD = ISD;
1067 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1068 ShiftISD = ISD::SHL;
1069 if (const auto *Entry =
1070 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1071 if (auto KindCost = Entry->Cost[CostKind])
1072 return LT.first * *KindCost;
1073 }
1074
1075 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1076 MVT VT = LT.second;
1077 // Vector shift left by non uniform constant can be lowered
1078 // into vector multiply.
1079 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1080 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1081 ISD = ISD::MUL;
1082 }
1083
1084 static const CostKindTblEntry GLMCostTable[] = {
1085 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1086 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1087 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1088 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1089 };
1090
1091 if (ST->useGLMDivSqrtCosts())
1092 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1093 if (auto KindCost = Entry->Cost[CostKind])
1094 return LT.first * *KindCost;
1095
1096 static const CostKindTblEntry SLMCostTable[] = {
1097 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1098 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1099 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1100 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1101 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1102 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1103 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1104 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1105 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1106 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1107 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1108 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1109 // v2i64/v4i64 mul is custom lowered as a series of long:
1110 // multiplies(3), shifts(3) and adds(2)
1111 // slm muldq version throughput is 2 and addq throughput 4
1112 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1113 // 2X4 (addq throughput) = 17
1114 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1115 // slm addq\subq throughput is 4
1116 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1117 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1118 };
1119
1120 if (ST->useSLMArithCosts())
1121 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1122 if (auto KindCost = Entry->Cost[CostKind])
1123 return LT.first * *KindCost;
1124
1125 static const CostKindTblEntry AVX2CostTable[] = {
1126 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1127 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1128 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsllvd/pack sequence.
1129 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsllvd/pack sequence.
1130
1131 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1132 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1133 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1134 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1135
1136 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1137 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1138 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1139 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1140 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1141 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1142
1143 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1144 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1145 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1146 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1147 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1148 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1149 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1150 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1151
1152 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1153 { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
1154 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1155 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1156 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1157 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1158 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1159
1160 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1161
1162 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1163 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1164
1165 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1166 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1167 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1168 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1169 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1170 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1171
1172 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1173 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1174 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1175 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1176 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1177 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1178
1179 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1180 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1181 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1182 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1183 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1184 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1185
1186 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1187 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1188 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1189 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1190 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1191 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1192 };
1193
1194 // Look for AVX2 lowering tricks for custom cases.
1195 if (ST->hasAVX2())
1196 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1197 if (auto KindCost = Entry->Cost[CostKind])
1198 return LT.first * *KindCost;
1199
1200 static const CostKindTblEntry AVX1CostTable[] = {
1201 // We don't have to scalarize unsupported ops. We can issue two half-sized
1202 // operations and we only need to extract the upper YMM half.
1203 // Two ops + 1 extract + 1 insert = 4.
1204 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1205 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1206 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1207 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1208 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1209 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1210
1211 { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
1212
1213 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1214 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1215 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1216 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1217
1218 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1219 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1220 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1221 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1222
1223 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1224 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1225 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1226 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1227
1228 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1229 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1230 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1231 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1232 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1233 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1234 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1235 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1236 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1237 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1238
1239 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1240 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1241 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1242 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1243 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1244 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1245 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1246 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1247
1248 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1249 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1250 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1251 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1252 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1253 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1254 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1255 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1256
1257 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1258 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1259 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1260 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1261 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1262 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1263 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1264 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1265
1266 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1267 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1268
1269 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1271 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1272 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1273 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1274 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1275
1276 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1277 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1278 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1279 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1280 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1281 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1282
1283 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1284 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1285 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1286 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1287 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1288 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1289
1290 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1291 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1292 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1293 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1294 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1295 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1296 };
1297
1298 if (ST->hasAVX())
1299 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1300 if (auto KindCost = Entry->Cost[CostKind])
1301 return LT.first * *KindCost;
1302
1303 static const CostKindTblEntry SSE42CostTable[] = {
1304 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308
1309 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1313
1314 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1317 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1318
1319 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1321 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1322 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1323
1324 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1325 };
1326
1327 if (ST->hasSSE42())
1328 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1329 if (auto KindCost = Entry->Cost[CostKind])
1330 return LT.first * *KindCost;
1331
1332 static const CostKindTblEntry SSE41CostTable[] = {
1333 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1334 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1335 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1336
1337 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1338 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1339 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1340 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1341
1342 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1343 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1344 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1345 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1346
1347 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1348 };
1349
1350 if (ST->hasSSE41())
1351 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1352 if (auto KindCost = Entry->Cost[CostKind])
1353 return LT.first * *KindCost;
1354
1355 static const CostKindTblEntry SSSE3CostTable[] = {
1356 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1357 };
1358
1359 if (ST->hasSSSE3())
1360 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1361 if (auto KindCost = Entry->Cost[CostKind])
1362 return LT.first * *KindCost;
1363
1364 static const CostKindTblEntry SSE2CostTable[] = {
1365 // We don't correctly identify costs of casts because they are marked as
1366 // custom.
1367 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1368 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1369 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1370 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1371
1372 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1373 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1374 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1375 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1376
1377 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1378 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1379 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1380 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1381
1382 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1384 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1385 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1386
1387 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1389 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1390 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1391
1392 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1394 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1395 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1396
1397 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1398 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1399
1400 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1401 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1402 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1403 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1404
1405 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1406
1407 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411
1412 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416
1417 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1419 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420
1421 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1423 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424
1425 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1426 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1427 };
1428
1429 if (ST->hasSSE2())
1430 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1431 if (auto KindCost = Entry->Cost[CostKind])
1432 return LT.first * *KindCost;
1433
1434 static const CostKindTblEntry SSE1CostTable[] = {
1435 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1436 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1437
1438 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1439 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1440
1441 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1442 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443
1444 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1445 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1446
1447 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1448 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1449 };
1450
1451 if (ST->hasSSE1())
1452 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1457 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1458 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1459 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1460 };
1461
1462 if (ST->is64Bit())
1463 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1464 if (auto KindCost = Entry->Cost[CostKind])
1465 return LT.first * *KindCost;
1466
1467 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1468 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1469 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1470 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1471
1472 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1473 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1474 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1475
1476 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1477 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1478 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1479
1480 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1481 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1482 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1483 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1484 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1485 };
1486
1487 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1488 if (auto KindCost = Entry->Cost[CostKind])
1489 return LT.first * *KindCost;
1490
1491 // It is not a good idea to vectorize division. We have to scalarize it and
1492 // in the process we will often end up having to spill regular
1493 // registers. The overhead of division is going to dominate most kernels
1494 // anyways so try hard to prevent vectorization of division - it is
1495 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1496 // to hide "20 cycles" for each lane.
1497 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1498 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1499 ISD == ISD::UREM)) {
1500 InstructionCost ScalarCost =
1501 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1502 Op1Info.getNoProps(), Op2Info.getNoProps());
1503 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1504 }
1505
1506 // Handle some basic single instruction code size cases.
1507 if (CostKind == TTI::TCK_CodeSize) {
1508 switch (ISD) {
1509 case ISD::FADD:
1510 case ISD::FSUB:
1511 case ISD::FMUL:
1512 case ISD::FDIV:
1513 case ISD::FNEG:
1514 case ISD::AND:
1515 case ISD::OR:
1516 case ISD::XOR:
1517 return LT.first;
1518 break;
1519 }
1520 }
1521
1522 // Fallback to the default implementation.
1523 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1524 Args, CxtI);
1525}
1526
1529 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1531 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1532 return TTI::TCC_Basic;
1534}
1535
1537 VectorType *DstTy, VectorType *SrcTy,
1538 ArrayRef<int> Mask,
1540 int Index, VectorType *SubTp,
1542 const Instruction *CxtI) const {
1543 assert((Mask.empty() || DstTy->isScalableTy() ||
1544 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1545 "Expected the Mask to match the return size if given");
1546 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1547 "Expected the same scalar types");
1548
1549 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1550 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1551 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1552
1553 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1554
1555 // If all args are constant then this will be constant folded away.
1556 if (!Args.empty() &&
1557 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1558 return TTI::TCC_Free;
1559
1560 // Recognize a basic concat_vector shuffle.
1561 if (Kind == TTI::SK_PermuteTwoSrc &&
1562 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1563 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1567 CostKind, Mask.size() / 2, SrcTy);
1568
1569 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1570 if (Kind == TTI::SK_Transpose)
1571 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1572 Kind = TTI::SK_PermuteTwoSrc;
1573
1574 if (Kind == TTI::SK_Broadcast) {
1575 // For Broadcasts we are splatting the first element from the first input
1576 // register, so only need to reference that input and all the output
1577 // registers are the same.
1578 LT.first = 1;
1579
1580 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1581 // If a load has multiple uses, but every use is one of a small set of
1582 // operations that SLP can rewrite into a single vector lane, codegen can
1583 // fold it into the free broadcast.
1584 using namespace PatternMatch;
1585 auto IsBroadcastLoadFoldUser = [&](const User *U) {
1586 if (isa<InsertElementInst>(U) && U->getOperand(1) == Args[0])
1587 return true;
1588 if (U->getType()->isVectorTy())
1589 return false;
1590 // Terminators (return/branch/switch/indirectbr/resume/invoke EH)
1591 // and phis carry the value across control flow.
1592 if (const auto *I = dyn_cast<Instruction>(U))
1593 if (I->isTerminator() ||
1595 return false;
1596 // Only pure calls can be folded.
1597 if (const auto *CB = dyn_cast<CallBase>(U))
1598 return CB->doesNotAccessMemory() && !CB->mayHaveSideEffects();
1599 return true;
1600 };
1601 auto IsFoldableSLPBroadcastLoad = [&]() {
1602 if (!match(Args[0], m_Load(m_Value())))
1603 return false;
1604 auto *FVT = dyn_cast<FixedVectorType>(DstTy);
1605 if (!FVT)
1606 return false;
1607 // getNumUses() counts each Use, matching the per-lane broadcast
1608 // accounting (a use like `op %x, %x` consumes two broadcast lanes).
1609 if (Args[0]->getNumUses() != FVT->getNumElements())
1610 return false;
1611 return all_of(Args[0]->users(), IsBroadcastLoadFoldUser);
1612 };
1613 if (!Args.empty() &&
1614 (match(Args[0], m_OneUse(m_Load(m_Value()))) ||
1615 IsFoldableSLPBroadcastLoad()) &&
1616 (ST->hasAVX2() ||
1617 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1618 return TTI::TCC_Free;
1619 }
1620
1621 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1622 // permutation.
1623 // Attempt to detect a shuffle mask with a single defined element.
1624 bool IsInLaneShuffle = false;
1625 bool IsSingleElementMask = false;
1626 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1627 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1628 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1629 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1630 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1631 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1632 if ((Mask.size() % NumLanes) == 0) {
1633 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1634 return P.value() == PoisonMaskElem ||
1635 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1636 (P.index() / NumEltsPerLane);
1637 });
1638 IsSingleElementMask =
1639 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1640 return M == PoisonMaskElem;
1641 }));
1642 }
1643 }
1644
1645 // Treat <X x bfloat> shuffles as <X x half>.
1646 if (LT.second.isVectorOf(MVT::bf16))
1647 LT.second = LT.second.changeVectorElementType(MVT::f16);
1648
1649 // Subvector extractions are free if they start at the beginning of a
1650 // vector and cheap if the subvectors are aligned.
1651 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1652 int NumElts = LT.second.getVectorNumElements();
1653 if ((Index % NumElts) == 0)
1654 return TTI::TCC_Free;
1655 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1656 if (SubLT.second.isVector()) {
1657 int NumSubElts = SubLT.second.getVectorNumElements();
1658 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1659 return SubLT.first;
1660 // Handle some cases for widening legalization. For now we only handle
1661 // cases where the original subvector was naturally aligned and evenly
1662 // fit in its legalized subvector type.
1663 // FIXME: Remove some of the alignment restrictions.
1664 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1665 // vectors.
1666 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1667 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1668 (NumSubElts % OrigSubElts) == 0 &&
1669 LT.second.getVectorElementType() ==
1670 SubLT.second.getVectorElementType() &&
1671 LT.second.getVectorElementType().getSizeInBits() ==
1672 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1673 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1674 "Unexpected number of elements!");
1675 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1676 LT.second.getVectorNumElements());
1677 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1678 SubLT.second.getVectorNumElements());
1679 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1680 InstructionCost ExtractCost =
1682 ExtractIndex, SubTy);
1683
1684 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1685 // if we have SSSE3 we can use pshufb.
1686 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1687 return ExtractCost + 1; // pshufd or pshufb
1688
1689 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1690 "Unexpected vector size");
1691
1692 return ExtractCost + 2; // worst case pshufhw + pshufd
1693 }
1694 }
1695 // If the extract subvector is not optimal, treat it as single op shuffle.
1697 }
1698
1699 // Subvector insertions are cheap if the subvectors are aligned.
1700 // Note that in general, the insertion starting at the beginning of a vector
1701 // isn't free, because we need to preserve the rest of the wide vector,
1702 // but if the destination vector legalizes to the same width as the subvector
1703 // then the insertion will simplify to a (free) register copy.
1704 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1705 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1706 int NumElts = DstLT.second.getVectorNumElements();
1707 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1708 if (SubLT.second.isVector()) {
1709 int NumSubElts = SubLT.second.getVectorNumElements();
1710 bool MatchingTypes =
1711 NumElts == NumSubElts &&
1712 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1713 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1714 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1715 }
1716
1717 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1718 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1719 // v1f32 (legalised to f32) into a v4f32.
1720 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1721 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1722 return 1;
1723
1724 // If the insertion is the lowest subvector then it will be blended
1725 // otherwise treat it like a 2-op shuffle.
1726 Kind =
1727 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1728 }
1729
1730 // Handle some common (illegal) sub-vector types as they are often very cheap
1731 // to shuffle even on targets without PSHUFB.
1732 EVT VT = TLI->getValueType(DL, SrcTy);
1733 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1734 !ST->hasSSSE3()) {
1735 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1736 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1737 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1738 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1739 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1740 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1741
1742 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1743 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1744 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1745 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1746
1747 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1748 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1749 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1750 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1751
1752 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1753 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1754 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1755 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1756 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1757
1758 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1759 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1760 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1761 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1762 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1763 };
1764
1765 if (ST->hasSSE2())
1766 if (const auto *Entry =
1767 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1768 if (auto KindCost = Entry->Cost[CostKind])
1769 return LT.first * *KindCost;
1770 }
1771
1772 // We are going to permute multiple sources and the result will be in multiple
1773 // destinations. Provide an accurate cost only for splits where the element
1774 // type remains the same.
1775 if (LT.first != 1) {
1776 MVT LegalVT = LT.second;
1777 if (LegalVT.isVector() &&
1778 LegalVT.getVectorElementType().getSizeInBits() ==
1779 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1780 LegalVT.getVectorNumElements() <
1781 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1782 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1783 unsigned LegalVTSize = LegalVT.getStoreSize();
1784 // Number of source vectors after legalization:
1785 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1786 // Number of destination vectors after legalization:
1787 InstructionCost NumOfDests = LT.first;
1788
1789 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1790 LegalVT.getVectorNumElements());
1791
1792 if (!Mask.empty() && NumOfDests.isValid()) {
1793 // Try to perform better estimation of the permutation.
1794 // 1. Split the source/destination vectors into real registers.
1795 // 2. Do the mask analysis to identify which real registers are
1796 // permuted. If more than one source register is used to build the
1797 // destination register, the cost for this destination register
1798 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1799 // source register is used, build mask and calculate the cost as a cost
1800 // of PermuteSingleSrc.
1801 // Also, for the single register permute we try to identify if the
1802 // destination register is just a copy of the source register or the
1803 // copy of the previous destination register (the cost is
1804 // TTI::TCC_Basic). If the source register is just reused, the cost for
1805 // this operation is TTI::TCC_Free.
1806 NumOfDests =
1808 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1809 .first;
1810 unsigned E = NumOfDests.getValue();
1811 unsigned NormalizedVF =
1812 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1813 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1814 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1815 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1816 copy(Mask, NormalizedMask.begin());
1817 unsigned PrevSrcReg = 0;
1818 ArrayRef<int> PrevRegMask;
1821 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1822 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1823 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1824 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1825 // Check if the previous register can be just copied to the next
1826 // one.
1827 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1828 PrevRegMask != RegMask)
1829 Cost +=
1831 SingleOpTy, RegMask, CostKind, 0, nullptr);
1832 else
1833 // Just a copy of previous destination register.
1835 return;
1836 }
1837 if (SrcReg != DestReg &&
1838 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1839 // Just a copy of the source register.
1841 }
1842 PrevSrcReg = SrcReg;
1843 PrevRegMask = RegMask;
1844 },
1845 [this, SingleOpTy, CostKind,
1846 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1847 unsigned /*Unused*/, bool /*Unused*/) {
1849 SingleOpTy, RegMask, CostKind, 0, nullptr);
1850 });
1851 return Cost;
1852 }
1853
1854 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1855 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1856 SingleOpTy, {}, CostKind, 0,
1857 nullptr);
1858 }
1859
1860 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1861 SubTp);
1862 }
1863
1864 // If we're just moving a single element around (probably as an alternative to
1865 // extracting it), we can assume this is cheap.
1866 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1867 return TTI::TCC_Basic;
1868
1869 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1870 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1871 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1872 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1873 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1874 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1875 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1876 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1877 };
1878
1879 if (ST->hasVBMI())
1880 if (const auto *Entry =
1881 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1882 if (auto KindCost = Entry->Cost[CostKind])
1883 return LT.first * *KindCost;
1884
1885 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1886 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1887 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1888 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1889
1890 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1891 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1892 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1893 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1894 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1895
1896 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1897 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1898 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1899 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1900 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1901
1902 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1903 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1904 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1905 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1906 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1907
1908 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1909 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1910
1911 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1912 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1913 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1914 };
1915
1916 if (ST->hasBWI())
1917 if (const auto *Entry =
1918 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1919 if (auto KindCost = Entry->Cost[CostKind])
1920 return LT.first * *KindCost;
1921
1922 static const CostKindTblEntry AVX512InLaneShuffleTbl[] = {
1923 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } },
1924 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } },
1925 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } },
1926 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } },
1927 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } },
1928 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } },
1929 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } },
1930 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } },
1931 };
1932
1933 if (IsInLaneShuffle && ST->hasAVX512())
1934 if (const auto *Entry =
1935 CostTableLookup(AVX512InLaneShuffleTbl, Kind, LT.second))
1936 if (auto KindCost = Entry->Cost[CostKind])
1937 return LT.first * *KindCost;
1938
1939 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1940 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1941 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1942 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1943 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1944 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1945 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1946 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1947 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1948 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1949 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1950 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1951 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1952 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1953 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1954
1955 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1956 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1957 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1958 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1959 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1960 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1961 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1962
1963 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1964 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1965 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1966 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1967 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1968 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1969 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1970 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1971 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1972 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1973 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1974
1975 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1976 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1977 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1978 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1979 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1980 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1981 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1982 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1983 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1984 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1985 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1986 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1987 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1988
1989 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 2, 3, 1, 1 } }, // vpermt2pd
1990 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 2, 3, 1, 1 } }, // vpermt2ps
1991 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 2, 3, 1, 1 } }, // vpermt2q
1992 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 2, 3, 1, 1 } }, // vpermt2d
1993 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 3, 1, 1 } }, // vpermt2pd
1994 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 3, 1, 1 } }, // vpermt2ps
1995 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 3, 1, 1 } }, // vpermt2q
1996 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 3, 1, 1 } }, // vpermt2d
1997 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } },
1998 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } },
1999 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } },
2000 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } },
2001
2002 // FIXME: This just applies the type legalization cost rules above
2003 // assuming these completely split.
2004 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
2005 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
2006 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
2007 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
2008 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
2009 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
2010
2011 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
2012 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
2013 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
2014 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
2015 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
2016 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
2017 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
2018 };
2019
2020 if (ST->hasAVX512())
2021 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
2022 if (auto KindCost = Entry->Cost[CostKind])
2023 return LT.first * *KindCost;
2024
2025 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
2026 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
2027 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
2028 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
2029
2030 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2031 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2032
2033 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2034 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2035 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2036 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
2037 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2038 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2039 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
2040 };
2041
2042 if (IsInLaneShuffle && ST->hasAVX2())
2043 if (const auto *Entry =
2044 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
2045 if (auto KindCost = Entry->Cost[CostKind])
2046 return LT.first * *KindCost;
2047
2048 static const CostKindTblEntry AVX2ShuffleTbl[] = {
2049 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
2050 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
2051 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2052 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2053 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2054 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2055 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2056 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2057 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2058 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2059
2060 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2061 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2062 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2063 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2064 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2065 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2066 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2067
2068 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2069 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2070 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2071
2072 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2073 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2074 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2075 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2076 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2077
2078 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2079 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2080 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2081 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2082 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2083 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2084 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2085
2086 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2087 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2088 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2089 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2090 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2091 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2092 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2093 };
2094
2095 if (ST->hasAVX2())
2096 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2097 if (auto KindCost = Entry->Cost[CostKind])
2098 return LT.first * *KindCost;
2099
2100 static const CostKindTblEntry XOPShuffleTbl[] = {
2101 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2102 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2103 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2104 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2105 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2106 // + vinsertf128
2107 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2108 // + vinsertf128
2109
2110 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2111 // + vinsertf128
2112
2113 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2114 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2115 // + vinsertf128
2116 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2117 };
2118
2119 if (ST->hasXOP())
2120 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2121 if (auto KindCost = Entry->Cost[CostKind])
2122 return LT.first * *KindCost;
2123
2124 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2125 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2126 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2127 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2128 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2129
2130 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2131 // + vpor + vinsertf128
2132 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2133 // + vpor + vinsertf128
2134 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2135 // + vpor + vinsertf128
2136
2137 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2138 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2139
2140 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2141 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2142 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2143 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2144 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2145 // + 2*vpor + vinsertf128
2146 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2147 // + 2*vpor + vinsertf128
2148 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2149 // + 2*vpor + vinsertf128
2150 };
2151
2152 if (IsInLaneShuffle && ST->hasAVX())
2153 if (const auto *Entry =
2154 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2155 if (auto KindCost = Entry->Cost[CostKind])
2156 return LT.first * *KindCost;
2157
2158 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2159 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2160 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2161 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2162 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2163 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2164 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2165 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2166
2167 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2168 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2169 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2170 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2171 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2172 // + vinsertf128
2173 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2174 // + vinsertf128
2175 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2176 // + vinsertf128
2177
2178 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2179 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2180 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2181 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2182 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2183 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2184 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2185
2186 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2187 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2188 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2189 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2190 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2191 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2192 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2193
2194 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2195 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2196 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2197 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2198 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2199 // + 2*por + vinsertf128
2200 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2201 // + 2*por + vinsertf128
2202 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2203 // + 2*por + vinsertf128
2204
2205 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2206 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2207 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2208 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2209 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2210 // + 4*por + vinsertf128
2211 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2212 // + 4*por + vinsertf128
2213 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2214 // + 4*por + vinsertf128
2215 };
2216
2217 if (ST->hasAVX())
2218 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2219 if (auto KindCost = Entry->Cost[CostKind])
2220 return LT.first * *KindCost;
2221
2222 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2223 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2224 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2225 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2226 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2227 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2228 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2229 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2230 };
2231
2232 if (ST->hasSSE41())
2233 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2234 if (auto KindCost = Entry->Cost[CostKind])
2235 return LT.first * *KindCost;
2236
2237 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2238 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2239 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2240 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2241
2242 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2243 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2244 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2245
2246 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2247 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2248 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2249
2250 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2251 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2252 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2253 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2254 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2255
2256 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2257 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2258 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2259
2260 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2261 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2262 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2263 };
2264
2265 if (ST->hasSSSE3())
2266 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2267 if (auto KindCost = Entry->Cost[CostKind])
2268 return LT.first * *KindCost;
2269
2270 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2271 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2272 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2273 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2274 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2275 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2276 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2277
2278 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2279 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2280 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2281 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2282 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2283 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2284 // + 2*pshufd + 2*unpck + packus
2285
2286 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2287 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2288 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2289 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2290 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2291 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2292
2293 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2294 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2295 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2296 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2297 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2298 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2299
2300 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2301 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2302 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2303 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2304 // + pshufd/unpck
2305 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2306 // + pshufd/unpck
2307 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2308 // + 2*pshufd + 2*unpck + 2*packus
2309
2310 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2311 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2312 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2313 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2314 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2315 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2316 };
2317
2318 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2319 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2320 };
2321
2322 if (ST->hasSSE2()) {
2323 bool IsLoad =
2324 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2325 if (ST->hasSSE3() && IsLoad)
2326 if (const auto *Entry =
2327 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2328 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2329 LT.second.getVectorElementCount()) &&
2330 "Table entry missing from isLegalBroadcastLoad()");
2331 return LT.first * Entry->Cost;
2332 }
2333
2334 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2335 if (auto KindCost = Entry->Cost[CostKind])
2336 return LT.first * *KindCost;
2337 }
2338
2339 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2340 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2341 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2342 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2343 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2344 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2345 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2346 };
2347
2348 if (ST->hasSSE1()) {
2349 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2350 // SHUFPS: both pairs must come from the same source register.
2351 auto MatchSHUFPS = [](int X, int Y) {
2352 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2353 };
2354 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2355 return 1;
2356 }
2357 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2358 if (auto KindCost = Entry->Cost[CostKind])
2359 return LT.first * *KindCost;
2360 }
2361
2362 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2363 SubTp);
2364}
2365
2367 Type *Src,
2370 const Instruction *I) const {
2371 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2372 assert(ISD && "Invalid opcode");
2373
2374 // The cost tables include both specific, custom (non-legal) src/dst type
2375 // conversions and generic, legalized types. We test for customs first, before
2376 // falling back to legalization.
2377 // FIXME: Need a better design of the cost table to handle non-simple types of
2378 // potential massive combinations (elem_num x src_type x dst_type).
2379 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2380 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2381 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2382
2383 // Mask sign extend has an instruction.
2384 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2386 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2387 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2388 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2389 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2397 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2398 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2399 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2400 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2401
2402 // Mask zero extend is a sext + shift.
2403 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2405 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2406 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2407 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2408 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2409 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2410 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2411 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2412 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2413 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2414 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2415 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2416 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2417 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2418 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2419 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2420
2421 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2422 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2423 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2424 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2425 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2426 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2427 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2428 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2429 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2430 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2431 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2432 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2433 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2434 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2435 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2436 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2437 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2438
2439 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2440 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2441 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2442 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2443 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2444 };
2445
2446 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2447 // Mask sign extend has an instruction.
2448 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2450 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2452 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2453 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2454 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2455 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2456
2457 // Mask zero extend is a sext + shift.
2458 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2459 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2460 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2461 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2462 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2463 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2464 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2465 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2466
2467 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2468 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2469 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2470 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2471 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2472 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2473 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2474 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2475
2476 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2477 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2478
2479 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2480 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2481
2482 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2483 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2484
2485 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2486 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2487 };
2488
2489 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2490 // 256-bit wide vectors.
2491
2492 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2493 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2494 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2495 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2496 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2497 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2498 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2499 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2500
2501 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2502 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2503 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2504 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2505 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2506 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2507 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2508 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2509 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2510 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2511 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2512 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2513 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2514 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2515 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2516 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2517 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2518 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2519 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2520 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2521 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2522 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2523 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2524 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2525 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2526 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2527 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2528 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2529 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2530 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2531 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2532 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2533 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2534 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2535
2536 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2537 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2538 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2539
2540 // Sign extend is zmm vpternlogd+vptruncdb.
2541 // Zero extend is zmm broadcast load+vptruncdw.
2542 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2544 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2546 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2547 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2548 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2549 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2550
2551 // Sign extend is zmm vpternlogd+vptruncdw.
2552 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2553 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2555 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2556 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2557 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2558 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2559 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2560 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2561
2562 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2563 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2564 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2565 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2566 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2567 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2568 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2569 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2570 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2571 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2572
2573 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2574 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2575 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2576 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2577
2578 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2579 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2581 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2583 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2585 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2587 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2588
2589 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2590 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2591
2592 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2593 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2594 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2595 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2596 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2597 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2598 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2599 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2600
2601 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2602 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2603 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2604 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2605 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2606 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2607 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2608 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2609 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2610 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2611
2612 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2613 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2614 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2615 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2616 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2617 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2618 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2619 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2620 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2621 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2622 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2623
2624 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2625 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2626 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2627 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2628 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2629 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2630 };
2631
// Conversion cost table named for AVX512BW+VL targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
// NOTE(review): rows whose element counts differ (e.g. v16i8 <- v2i1) presumably
// describe types after legalization has widened them — confirm.
2632 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2633 // Mask sign extend has an instruction.
2634 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2635 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2643 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2644 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2645 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2647 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2649 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2650 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2651
2652 // Mask zero extend is a sext + shift.
2653 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2655 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2657 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2659 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2661 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2664 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2665 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2666 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2667 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2668 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2669 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2670
// Truncation of i8/i16 element vectors down to mask (i1) vectors.
2671 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2680 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2681 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2682 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2683 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2684 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2685 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2686 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2687 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2688
// Element truncation from 16-bit to 8-bit lanes.
2689 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2690 };
2691
// Conversion cost table named for AVX512DQ+VL targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
2692 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2693 // Mask sign extend has an instruction.
2694 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2695 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2696 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2697 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2698 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2699 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2700 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2701 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2702
2703 // Mask zero extend is a sext + shift.
2704 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2705 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2706 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2708 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2710 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2712
// Truncation of i32/i64 element vectors down to mask (i1) vectors.
2713 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2714 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2715 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2716 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2717 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2718 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2719 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2720 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2721
// 64-bit integer <-> floating point conversions, all costed at 1.
2722 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2723 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2724 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2725 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2726
2727 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2728 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2729 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2730 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2731
2732 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2733 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2734 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2735 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2736
2737 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2738 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2739 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2740 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2741 };
2742
// Conversion cost table named for AVX512 targets with VL.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
// Trailing comments name the instruction sequence each cost was derived from.
2743 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2744 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2745 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2746 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2747 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2748 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2749 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2750 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2751 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2752 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2753 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2754 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2755 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2756 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2757 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2758 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2759 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2760 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2761 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2762
2763 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2764 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2765 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2766 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2767 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2768 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2769 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2770 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2771 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2772 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2773
2774 // sign extend is vpcmpeq+maskedmove+vpmovdw
2775 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2776 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2777 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2778 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2779 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2780 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2781 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2782 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2783 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2784
2785 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2786 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2787 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2788 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2789 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2790 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2791 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2792 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2793
2794 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2795 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2796 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2797 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2798
// Integer element widening to 256-bit destinations, all costed at 1.
2799 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2803 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2804 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2807 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2808 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2809 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2810 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2811
2812 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2813 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2814 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2815 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2816
2817 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2820 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2822 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2823 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2824 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2825 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2826 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2827 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2828 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2829 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2832 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2833 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2834
2835 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2838 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2839 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2840 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2841 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2842 };
2843
// Conversion cost table named for AVX2 targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
2844 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
// Extension from i1 vectors.
2845 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2846 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2847 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2848 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2849 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2850 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2851
// Integer element widening.
2852 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2853 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2854 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2855 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2856 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2857 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2858 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2859 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2860 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2861 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2862 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2863 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2864 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2865 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2866
2867 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2868
// Integer element narrowing.
2869 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2870 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2871 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2872 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2873 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2874 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2875 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2876 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2877 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2878 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2879 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2880 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2881
2882 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2883 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2884
2885 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2886 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2887 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2888 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2889
2890 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2891 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2892 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2894 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2895 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2896 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2897 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2898
2899 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2906
2907 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2917 };
2918
// Conversion cost table named for AVX targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
2919 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
// Extension from i1 vectors.
2920 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2921 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2922 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2923 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2924 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2925 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2926
// Integer element widening.
2927 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2928 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2929 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2930 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2931 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2932 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2933 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2934 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2935 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2936 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2937 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2938 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2939
// Truncation to i1 vectors.
2940 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2941 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2942 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2943 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2944 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2945
// Integer element narrowing.
2946 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2947 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2948 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2949 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2950 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2951 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2952 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2953 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2954
2955 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2956 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2957 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2958 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2959 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2960 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2961 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2962 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2963 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2964 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2965 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2966 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2967
2968 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2969 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2970 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2971 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2972 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2973 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2974 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2975 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2976 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2977 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2978 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2979 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2980 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2981 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2982 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2983 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2984 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2985
2986 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2987 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2988 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2989 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2990 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2991 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2992 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2993 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2994 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2995 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2996 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2997
2998 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2999 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
3000 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
3001 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
3002 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
3003 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
3004 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
3005 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
3006 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
3007 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3008 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
3009 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
3010 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
3011
3012 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
3013 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
3014 };
3015
// Conversion cost table named for SSE4.1 targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
3016 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
// Integer element widening, all costed at 1.
3017 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3018 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
3019 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3020 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
3021 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3022 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3023 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3024 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
3025 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3026 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3027 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3028 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3029
3030 // These truncates end up widening elements.
3031 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
3032 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
3033 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
3034
3035 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
3036 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
3037 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
3038
3039 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3046 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3047 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
3048 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
3049 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
3050
3051 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3060 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3061 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3062 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3063 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3064 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3065
3066 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3071 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3072 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3073 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3074 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3075 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3076
3077 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3082 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3083 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3084 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3085 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3086 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3087 };
3088
// Conversion cost table named for SSE2 targets.
// NOTE(review): the lookup site for this table is outside this view — confirm
// which subtarget features actually gate it.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
3089 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3090 // These are somewhat magic numbers justified by comparing the
3091 // output of llvm-mca for our various supported scheduler models
3092 // and basing it off the worst case scenario.
3093 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3094 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3095 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3096 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3097 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3098 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3099 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3100 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3101 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3102 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3103 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3104 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3105
3106 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3107 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3108 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3109 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3110 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3111 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3112 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3113 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3114 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3115 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3116 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3117 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3118 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3119
3120 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3121 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3122 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3123 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3124 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3125 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3126 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3127 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3128 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3129 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3130
3131 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3132 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3133 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3134 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3135 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3136 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3137 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3138 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3139 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3140 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3141
// Integer element widening; sign extension is costed at or above the matching
// zero extension in every pair below.
3142 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3143 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3144 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3145 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3146 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3147 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3148 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3149 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3150 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3151 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3152 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3153 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3154
3155 // These truncates are really widening elements.
3156 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3157 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3158 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3159 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3160 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3161 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3162
// Integer element narrowing.
3163 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3164 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3165 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3166 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3167 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3168 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3169 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3170 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3171 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3172 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3173 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3174 };
3175
// Cost table for conversions between half precision (f16) and f32/f64.
// NOTE(review): the lookup site and the feature that gates this table are
// outside this view — confirm there.
// Each row is { ISD opcode, destination MVT, source MVT, { costs } }, matching
// the (ISD, Dst, Src) argument order of the ConvertCostTableLookup calls; the
// cost array is indexed by cost kind at lookup (Entry->Cost[CostKind]).
// NOTE(review): the four costs presumably follow TargetCostKind order — confirm
// against the TypeConversionCostKindTblEntry definition.
3176 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3177 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3178 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3179 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3180 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3181 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3182 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3183 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3184 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3185 };
3186
3187 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3188 EVT SrcTy = TLI->getValueType(DL, Src);
3189 EVT DstTy = TLI->getValueType(DL, Dst);
3190
3191 // If we're sign-extending a vector comparison result back to the comparison
3192 // width, this will be free without AVX512 (or for 8/16-bit types without
3193 // BWI).
3194 if (!ST->hasAVX512() || (!ST->hasBWI() && DstTy.getScalarSizeInBits() < 32)) {
3195 if (I && Opcode == Instruction::CastOps::SExt &&
3196 SrcTy.isFixedLengthVectorOf(MVT::i1)) {
3197 if (auto *CmpI = dyn_cast<CmpInst>(I->getOperand(0))) {
3198 Type *CmpTy = CmpI->getOperand(0)->getType();
3199 if (CmpTy->getScalarSizeInBits() == DstTy.getScalarSizeInBits())
3200 return TTI::TCC_Free;
3201 }
3202 }
3203 }
3204
3205 // The function getSimpleVT only handles simple value types.
3206 if (SrcTy.isSimple() && DstTy.isSimple()) {
3207 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3208 MVT SimpleDstTy = DstTy.getSimpleVT();
3209
3210 if (ST->useAVX512Regs()) {
3211 if (ST->hasBWI())
3212 if (const auto *Entry = ConvertCostTableLookup(
3213 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3214 if (auto KindCost = Entry->Cost[CostKind])
3215 return *KindCost;
3216
3217 if (ST->hasDQI())
3218 if (const auto *Entry = ConvertCostTableLookup(
3219 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3220 if (auto KindCost = Entry->Cost[CostKind])
3221 return *KindCost;
3222
3223 if (ST->hasAVX512())
3224 if (const auto *Entry = ConvertCostTableLookup(
3225 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3226 if (auto KindCost = Entry->Cost[CostKind])
3227 return *KindCost;
3228 }
3229
3230 if (ST->hasBWI())
3231 if (const auto *Entry = ConvertCostTableLookup(
3232 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3233 if (auto KindCost = Entry->Cost[CostKind])
3234 return *KindCost;
3235
3236 if (ST->hasDQI())
3237 if (const auto *Entry = ConvertCostTableLookup(
3238 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3239 if (auto KindCost = Entry->Cost[CostKind])
3240 return *KindCost;
3241
3242 if (ST->hasAVX512())
3243 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3244 SimpleDstTy, SimpleSrcTy))
3245 if (auto KindCost = Entry->Cost[CostKind])
3246 return *KindCost;
3247
3248 if (ST->hasAVX2()) {
3249 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3250 SimpleDstTy, SimpleSrcTy))
3251 if (auto KindCost = Entry->Cost[CostKind])
3252 return *KindCost;
3253 }
3254
3255 if (ST->hasAVX()) {
3256 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3257 SimpleDstTy, SimpleSrcTy))
3258 if (auto KindCost = Entry->Cost[CostKind])
3259 return *KindCost;
3260 }
3261
3262 if (ST->hasF16C()) {
3263 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3264 SimpleDstTy, SimpleSrcTy))
3265 if (auto KindCost = Entry->Cost[CostKind])
3266 return *KindCost;
3267 }
3268
3269 if (ST->hasSSE41()) {
3270 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3271 SimpleDstTy, SimpleSrcTy))
3272 if (auto KindCost = Entry->Cost[CostKind])
3273 return *KindCost;
3274 }
3275
3276 if (ST->hasSSE2()) {
3277 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3278 SimpleDstTy, SimpleSrcTy))
3279 if (auto KindCost = Entry->Cost[CostKind])
3280 return *KindCost;
3281 }
3282
3283 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3284 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3285 // fp16 conversions not covered by any table entries require a libcall.
3286 // Return a large (arbitrary) number to model this.
3287 return InstructionCost(64);
3288 }
3289 }
3290
3291 // Fall back to legalized types.
3292 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3293 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3294
3295 // If we're truncating to the same legalized type - just assume it's free.
3296 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3297 return TTI::TCC_Free;
3298
3299 if (ST->useAVX512Regs()) {
3300 if (ST->hasBWI())
3301 if (const auto *Entry = ConvertCostTableLookup(
3302 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3303 if (auto KindCost = Entry->Cost[CostKind])
3304 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3305
3306 if (ST->hasDQI())
3307 if (const auto *Entry = ConvertCostTableLookup(
3308 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3309 if (auto KindCost = Entry->Cost[CostKind])
3310 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3311
3312 if (ST->hasAVX512())
3313 if (const auto *Entry = ConvertCostTableLookup(
3314 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3315 if (auto KindCost = Entry->Cost[CostKind])
3316 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3317 }
3318
3319 if (ST->hasBWI())
3320 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3321 LTDest.second, LTSrc.second))
3322 if (auto KindCost = Entry->Cost[CostKind])
3323 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3324
3325 if (ST->hasDQI())
3326 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3327 LTDest.second, LTSrc.second))
3328 if (auto KindCost = Entry->Cost[CostKind])
3329 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3330
3331 if (ST->hasAVX512())
3332 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3333 LTDest.second, LTSrc.second))
3334 if (auto KindCost = Entry->Cost[CostKind])
3335 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3336
3337 if (ST->hasAVX2())
3338 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3339 LTDest.second, LTSrc.second))
3340 if (auto KindCost = Entry->Cost[CostKind])
3341 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3342
3343 if (ST->hasAVX())
3344 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3345 LTDest.second, LTSrc.second))
3346 if (auto KindCost = Entry->Cost[CostKind])
3347 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3348
3349 if (ST->hasF16C()) {
3350 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3351 LTDest.second, LTSrc.second))
3352 if (auto KindCost = Entry->Cost[CostKind])
3353 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3354 }
3355
3356 if (ST->hasSSE41())
3357 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3358 LTDest.second, LTSrc.second))
3359 if (auto KindCost = Entry->Cost[CostKind])
3360 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3361
3362 if (ST->hasSSE2())
3363 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3364 LTDest.second, LTSrc.second))
3365 if (auto KindCost = Entry->Cost[CostKind])
3366 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3367
3368 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3369 // sitofp.
3370 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3371 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3372 Type *ExtSrc = Src->getWithNewBitWidth(32);
3373 unsigned ExtOpc =
3374 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3375
3376 // For scalar loads the extend would be free.
3377 InstructionCost ExtCost = 0;
3378 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3379 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3380
3381 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3383 }
3384
3385 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3386 // i32.
3387 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3388 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3389 Type *TruncDst = Dst->getWithNewBitWidth(32);
3390 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3391 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3393 }
3394
3395 // TODO: Allow non-throughput costs that aren't binary.
3396 auto AdjustCost = [&CostKind](InstructionCost Cost,
3399 return Cost == 0 ? 0 : N;
3400 return Cost * N;
3401 };
3402 return AdjustCost(
3403 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3404}
3405
// Cost estimate for a compare or select (Opcode) on ValTy.
// Types that are neither integer nor floating point (scalar or vector) are
// handed straight to BaseT. Otherwise the type is legalized, some vector
// compare predicates add an ExtraCost, and the per-feature tables below are
// searched in order; the first hit that has a cost for CostKind returns
// LT.first * (ExtraCost + table cost).
// NOTE(review): this listing lacks the line that opens the declaration (the
// recursive calls below show the name is getCmpSelInstrCost) and the parameter
// line that declares CostKind and Op1Info - confirm against the upstream file.
3407 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3409 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3410 // Early out if this type isn't scalar/vector integer/float.
3411 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3412 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3413 Op1Info, Op2Info, I);
3414
3415 // Legalize the type.
// LT.first is used below as the multiplier applied to every table cost and
// LT.second is the legalized type used as the table key.
3416 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3417
3418 MVT MTy = LT.second;
3419
3420 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3421 assert(ISD && "Invalid opcode");
3422
// ExtraCost is added to the table cost before scaling by LT.first; it stays 0
// for selects and for compares that need no predicate fix-up.
3423 InstructionCost ExtraCost = 0;
3424 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3425 // Some vector comparison predicates cost extra instructions.
3426 // TODO: Adjust ExtraCost based on CostKind?
3427 // TODO: Should we invert this and assume worst case cmp costs
3428 // and reduce for particular predicates?
// The fix-up is only charged for vector types when none of these holds: XOP
// (without AVX2, or on a 128-bit vector), AVX512 with elements of at least
// 32 bits, or BWI.
3429 if (MTy.isVector() &&
3430 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3431 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3432 ST->hasBWI())) {
3433 // Fallback to I if a specific predicate wasn't specified.
3434 CmpInst::Predicate Pred = VecPred;
// NOTE(review): the second half of this condition is absent from this listing
// (presumably the matching BAD_FCMP_PREDICATE test) - confirm upstream.
3435 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3437 Pred = cast<CmpInst>(I)->getPredicate();
3438
// A constant second operand lowers the extra cost in several arms below.
3439 bool CmpWithConstant = false;
3440 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3441 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3442
// NOTE(review): every `case` label of this switch is absent from this listing;
// only the arm bodies and the `default` label remain. The notes on each arm
// infer the predicates from the expansion comments - confirm upstream.
3443 switch (Pred) {
// Arm 1: inverted equality compare (presumably the integer "not equal"
// predicate).
3445 // xor(cmpeq(x,y),-1)
3446 ExtraCost = CmpWithConstant ? 0 : 1;
3447 break;
// Arm 2: inverted signed greater-than (presumably the signed >= / <=
// predicates).
3450 // xor(cmpgt(x,y),-1)
3451 ExtraCost = CmpWithConstant ? 0 : 1;
3452 break;
// Arm 3: unsigned compare done by flipping sign bits or via pmaxu (presumably
// the unsigned < / > predicates).
3455 // cmpgt(xor(x,signbit),xor(y,signbit))
3456 // xor(cmpeq(pmaxu(x,y),x),-1)
3457 ExtraCost = CmpWithConstant ? 1 : 2;
3458 break;
// Arm 4: presumably the unsigned <= / >= predicates. One extra op when the
// psubus/pminu forms apply (SSE4.1 with 32-bit elements, or SSE2 with
// elements narrower than 32 bits); otherwise the sign-flip expansion.
3461 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3462 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3463 // cmpeq(psubus(x,y),0)
3464 // cmpeq(pminu(x,y),x)
3465 ExtraCost = 1;
3466 } else {
3467 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3468 ExtraCost = CmpWithConstant ? 2 : 3;
3469 }
3470 break;
3473 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3474 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
// The result is the sum of two recursive compare costs plus one Or on CondTy.
// NOTE(review): the argument line carrying the predicate and CostKind of each
// recursive call is absent from this listing - confirm upstream.
3475 if (CondTy && !ST->hasAVX())
3476 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3478 Op1Info, Op2Info) +
3479 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3481 Op1Info, Op2Info) +
3482 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3483
3484 break;
// Last labelled arm: presumably reached when no usable predicate is known.
3487 // Assume worst case scenario and add the maximum extra cost.
3488 ExtraCost = 3;
3489 break;
3490 default:
3491 break;
3492 }
3493 }
3494 }
3495
// Cost tables keyed by (ISD opcode, legalized type). Each entry's four costs
// are indexed by CostKind (see Entry->Cost[CostKind] in the lookups below);
// presumably ordered throughput, latency, code size, size+latency - TODO
// confirm against CostKindTblEntry.
3496 static const CostKindTblEntry SLMCostTbl[] = {
3497 // slm pcmpeq/pcmpgt throughput is 2
3498 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3499 // slm pblendvb/blendvpd/blendvps throughput is 4
3500 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3501 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3502 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3503 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3504 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3505 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3506 };
3507
3508 static const CostKindTblEntry AVX512BWCostTbl[] = {
3509 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3510 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3511 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3512 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3513
3514 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3515 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3516 };
3517
3518 static const CostKindTblEntry AVX512CostTbl[] = {
3519 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3520 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3521 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3522 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3523
3524 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3525 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3526 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3527 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3528 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3529 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3530 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3531
3532 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3533 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3534 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3535 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3536 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3537 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3538 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3539 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3540 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3541 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3542 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3543 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3544 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3545 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3546
3547 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3548 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3549 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3550 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3551 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3552 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3553 };
3554
3555 static const CostKindTblEntry AVX2CostTbl[] = {
3556 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3557 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3558 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3559 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3560 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3561 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3562
3563 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3564 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3565 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3566 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3567
3568 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3569 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3570 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3571 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3572 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3573 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3574 };
3575
3576 static const CostKindTblEntry XOPCostTbl[] = {
3577 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3578 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3579 };
3580
3581 static const CostKindTblEntry AVX1CostTbl[] = {
3582 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3583 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3584 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3585 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3586 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3587 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3588
3589 // AVX1 does not support 8-wide integer compare.
3590 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3591 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3592 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3593 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3594
3595 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3596 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3597 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3598 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3599 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3600 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3601 };
3602
3603 static const CostKindTblEntry SSE42CostTbl[] = {
3604 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3605 };
3606
3607 static const CostKindTblEntry SSE41CostTbl[] = {
3608 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3609 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3610
3611 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3612 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3613 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3614 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3615 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3616 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3617 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3618 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3619 };
3620
3621 static const CostKindTblEntry SSE2CostTbl[] = {
3622 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3623 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3624
3625 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3626 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3627 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3628 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3629
3630 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3631 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3632 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3633 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3634 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3635 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3636 };
3637
3638 static const CostKindTblEntry SSE1CostTbl[] = {
3639 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3640 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3641
3642 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3643 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3644 };
3645
// Table lookups, tried in this fixed order: SLM, AVX512BW, AVX512, AVX2, XOP,
// AVX1, SSE4.2, SSE4.1, SSE2, SSE1. A table is consulted only when its
// subtarget check passes. A lookup that finds no entry, or an entry with no
// cost for CostKind, falls through to the next table.
3646 if (ST->useSLMArithCosts())
3647 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3648 if (auto KindCost = Entry->Cost[CostKind])
3649 return LT.first * (ExtraCost + *KindCost);
3650
3651 if (ST->hasBWI())
3652 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3653 if (auto KindCost = Entry->Cost[CostKind])
3654 return LT.first * (ExtraCost + *KindCost);
3655
3656 if (ST->hasAVX512())
3657 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3658 if (auto KindCost = Entry->Cost[CostKind])
3659 return LT.first * (ExtraCost + *KindCost);
3660
3661 if (ST->hasAVX2())
3662 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3663 if (auto KindCost = Entry->Cost[CostKind])
3664 return LT.first * (ExtraCost + *KindCost);
3665
3666 if (ST->hasXOP())
3667 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3668 if (auto KindCost = Entry->Cost[CostKind])
3669 return LT.first * (ExtraCost + *KindCost);
3670
3671 if (ST->hasAVX())
3672 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3673 if (auto KindCost = Entry->Cost[CostKind])
3674 return LT.first * (ExtraCost + *KindCost);
3675
3676 if (ST->hasSSE42())
3677 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3678 if (auto KindCost = Entry->Cost[CostKind])
3679 return LT.first * (ExtraCost + *KindCost);
3680
3681 if (ST->hasSSE41())
3682 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3683 if (auto KindCost = Entry->Cost[CostKind])
3684 return LT.first * (ExtraCost + *KindCost);
3685
3686 if (ST->hasSSE2())
3687 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3688 if (auto KindCost = Entry->Cost[CostKind])
3689 return LT.first * (ExtraCost + *KindCost);
3690
3691 if (ST->hasSSE1())
3692 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3693 if (auto KindCost = Entry->Cost[CostKind])
3694 return LT.first * (ExtraCost + *KindCost);
3695
// No table produced a cost. ExtraCost is not applied on either path below.
3696 // Assume a 3cy latency for fp select ops.
3697 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3698 if (ValTy->getScalarType()->isFloatingPointTy())
3699 return 3;
3700
// Everything else defers to the target-independent implementation.
3701 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3702 Op1Info, Op2Info, I);
3703}
3704
3706
3710 // Costs should match the codegen from:
3711 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3712 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3713 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3714 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3715 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3716
3717 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3718 // specialized in these tables yet.
3719 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3720 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3721 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3722 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3723 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3724 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3725 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3726 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3727 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3728 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3729 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3730 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3731 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3732 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3733 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3734 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3735 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3736 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3737 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3738 };
3739 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3740 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3741 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3742 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3743 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3744 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3745 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3746 };
3747 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3748 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3749 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3750 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3751 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3752 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3753 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3754 };
3755 static const CostKindTblEntry AVX512CDCostTbl[] = {
3756 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3757 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3758 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3759 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3760 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3761 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3762 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3763 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3764 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3765 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3766 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3767 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3768
3769 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3770 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3771 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3772 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3773 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3774 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3775 };
3776 static const CostKindTblEntry AVX512BWCostTbl[] = {
3777 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3778 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3779 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3780 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3781 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3782 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3783 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3784 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3785 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3786 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3787 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3788 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3789 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3790 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3791 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3792 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3793 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3794 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3795 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3796 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3797 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3798 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3799 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3800 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3801 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3802 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3803 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3804 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3805 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3806 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3807 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3808 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3809 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3810 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3811 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3812 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3813 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3814 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3815 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3816 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3817 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3818 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3819 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3820 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3821 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3822 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3823 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3824 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3825 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3826 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3827 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3828 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3829 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3830 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3831 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3832 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3833 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3834 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3835 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3836 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3837 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3838 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3839 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3840 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3841 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3842 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3843 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3844 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3845 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3846 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3847 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3848 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3849 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3850 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3851 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3852 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3853 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3854 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3855 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3856 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3857 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3858 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3859 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3860 };
3861 static const CostKindTblEntry AVX512CostTbl[] = {
3862 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3863 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3864 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3865 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3866 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3867 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3868 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3869 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3870 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3871 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3872 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3873 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3874 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3875 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3876 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3877 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3878 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3879 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3880 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3881 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3882 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3883 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3884 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3885 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3886 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3887 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3888 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3889 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3890 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3891 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3892 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3893 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3894 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3895 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3896 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3897 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3898 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3899 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3900 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3901 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3902 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3903 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3904 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3905 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3906 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3907 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3908 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3909 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3910 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3911 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3912 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3913 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3914 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3915 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3916 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3917 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3918 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3919 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3920 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3921 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3922 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3923 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3924 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3925 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3926 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3927 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3928 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3929 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3930 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3931 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3932 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3933 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3934 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3935 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3936 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3937 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3938 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3939 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3940 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3941 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3942 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3943 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3944 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3945 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3946 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3947 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3948 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3949 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3950 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3951 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3952 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3953 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3954 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3955 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3956 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3957 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3958 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3959 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3960 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3961 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3962 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3963 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3964 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3965 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3966 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3967 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3968 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3969 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3970 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3971 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3972 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3973 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3974 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3975 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3976 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3977 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3978 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3979 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3980 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3981 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3982 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3983 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3984 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3985 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3986 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3987 };
3988 static const CostKindTblEntry XOPCostTbl[] = { // XOP costs: BITREVERSE (vector and scalar) and vector rotates; 4 costs per row (presumably RecipThroughput, Latency, CodeSize, SizeAndLatency - confirm against CostKindCosts)
3989 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3990 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3991 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3992 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3993 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3994 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3995 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3996 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3997 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3998 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3999 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
4000 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
4001 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)); the ROTR rows below carry larger last-two costs than ROTL, consistent with the extra SUB.
4002 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
4003 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
4004 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
4005 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
4006 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
4007 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
4008 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
4009 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
4010 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
4011 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
4012 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
4013 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
4014 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
4015 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
4016 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
4017 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
4018 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } }, // VROTLI rows repeat the ROTL costs for the same type
4019 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
4020 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
4021 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
4022 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
4023 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
4024 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
4025 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
4026 };
4027 static const CostKindTblEntry AVX2CostTbl[] = { // AVX2 costs for 128-bit and 256-bit integer ops, plus FMAXNUM/FSQRT up to 256 bits; 4 costs per row (presumably RecipThroughput, Latency, CodeSize, SizeAndLatency - confirm against CostKindCosts)
4028 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4029 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4030 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
4031 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
4032 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
4033 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
4034 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
4035 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
4036 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
4037 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
4038 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
4039 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
4040 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
4041 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
4042 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
4043 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
4044 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
4045 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
4046 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
4047 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
4048 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
4049 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
4050 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
4051 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
4052 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
4053 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
4054 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
4055 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
4056 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
4057 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
4058 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
4059 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
4060 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
4061 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
4062 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
4063 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
4064 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
4065 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4066 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4067 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4068 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4069 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4070 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4071 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4072 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4073 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4074 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4075 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4076 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4077 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4078 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4079 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4080 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4081 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4082 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4083 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4084 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4085 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4086 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4087 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4088 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4089 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4090 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4091 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4092 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4093 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4094 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4095 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4096 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4097 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4098 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4099 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4100 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4101 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4102 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4103 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4104 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4105 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4106 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4107 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4108 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4109 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4110 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4111 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4112 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4113 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4114 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4115 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4116 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4117 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4118 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4119 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4120 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4121 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4122 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4123 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4124 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4125 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4126 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4127 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4128 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4129 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4130 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4131 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4132 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4133 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4134 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4135 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4136 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4137 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4138 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4139 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4140 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4141 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4142 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4143 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4144 };
4145 static const CostKindTblEntry AVX1CostTbl[] = { // AVX1 costs; most 256-bit integer rows are modelled as 2 x 128-bit ops plus extract/insert (see per-row notes). Keep one row per (opcode, type): lookup is expected to return the first match - confirm against CostTableLookup.
4146 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4147 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4148 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4149 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4150 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4152 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4154 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4155 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4156 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4157 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4158 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4159 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4160 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4161 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4162 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4163 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4164 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4165 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4166 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4167 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4168 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4169 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4170 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4172 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4173 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4174 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4176 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4177 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4178 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4180 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4181 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4182 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4183 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4184 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4185 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4186 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4187 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4188 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4189 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4190 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4191 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4192 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4193 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4194 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4195 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4196 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4197 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4198 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4199 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4200 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4201 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4202 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4203 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4204 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4205 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4206 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4207 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4208 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4209 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4210 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4211 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4212 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4213 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4214 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4215 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4216 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4217 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4218 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4219 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4220 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4221 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4222 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4223 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4224 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4225 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4226 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4227 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4228 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4229 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4230 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4231 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4232 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4233 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4234 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4235 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4236 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4237 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4238 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4239 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4240 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4241 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4243 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4244 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4245 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4246 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4247 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4248 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4249 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4250 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4251 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4252 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4253 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4254 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4255 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4256 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4257 };
4258 static const CostKindTblEntry GFNICostTbl[] = { // GFNI costs: BITREVERSE for scalar and vector integer types, and VROTLI for byte vectors, all annotated as gf2p8affineqb-based
4259 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4260 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4261 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4262 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4263 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4264 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4265 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4266 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4267 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4268 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4269 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4270 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4271 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4272 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4273 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4274 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4275 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4276 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4277 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4278 };
4279 static const CostKindTblEntry GLMCostTbl[] = { // FSQRT-only costs for scalar and 128-bit FP types (GLM presumably = Goldmont - confirm where the table is selected)
4280 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4281 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4282 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4283 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4284 };
4285 static const CostKindTblEntry SLMCostTbl[] = { // 128-bit BSWAP and scalar/128-bit FSQRT costs (SLM presumably = Silvermont - confirm where the table is selected)
4286 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4287 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4288 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4289 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4290 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4291 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4292 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4293 };
4294 static const CostKindTblEntry SSE42CostTbl[] = { // SSE4.2 costs: FMAXNUM for f32/f64 scalar and 128-bit vectors, FSQRT for f32 and v4f32 only
4295 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4296 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4297 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4298 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4299 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4300 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4301 };
4302 static const CostKindTblEntry SSE41CostTbl[] = { // SSE4.1 costs for 128-bit integer ABS, saturating add/sub, min/max and overflow-multiply; only a subset of element widths is listed per opcode
4303 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4304 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4305 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4306 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4307 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4308 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4309 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4310 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4311 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4312 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4313 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4314 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4315 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4316 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4317 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4318 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4319 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4320 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4321 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4322 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4323 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4324 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4325 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4326 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4327 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4328 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4329 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4330 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4331 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4332 };
4333 static const CostKindTblEntry SSSE3CostTbl[] = { // SSSE3 costs for 128-bit integer ABS, BITREVERSE, BSWAP and bit counting (CTLZ/CTPOP/CTTZ)
4334 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4335 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4336 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4337 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4338 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4339 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4340 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4341 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4342 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4343 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4344 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4345 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4346 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4347 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4348 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4349 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4350 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4351 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4352 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4353 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4354 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4355 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4356 };
4357 static const CostKindTblEntry SSE2CostTbl[] = { // SSE2 costs for 128-bit integer ops (all four element widths for most opcodes) plus f64/v2f64 FMAXNUM and FSQRT
4358 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4359 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4360 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4361 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4362 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4363 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4364 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4365 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4366 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4367 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4368 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4369 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4370 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4371 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4372 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4373 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4374 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4375 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4376 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4377 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4378 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4379 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4380 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4381 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4382 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4383 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4384 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4385 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4386 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4387 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4388 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4389 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4390 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4391 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4392 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4393 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4394 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4395 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4396 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4397 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4398 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4399 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4400 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4401 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4402 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4403 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4404 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4405 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4406 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4407 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4408 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4409 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4410 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4411 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4412 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4413 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4414 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4415 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4416 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4417 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4418 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4419 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4420 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4421 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4422 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4423 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4424 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4425 };
4426 static const CostKindTblEntry SSE1CostTbl[] = { // SSE1 costs: single-precision FMAXNUM and FSQRT only (f32 and v4f32)
4427 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4428 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4429 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4430 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4431 };
4432 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets: i64 CTTZ costs 1 for every cost kind
4433 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4434 };
4435 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets: CTTZ for i32/i16/i8; the narrower types carry a higher first cost (2 vs 1)
4436 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4437 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4438 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4439 };
4440 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets: i64 CTLZ costs 1 for every cost kind
4441 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4442 };
4443 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets: CTLZ for i32/i16/i8; the narrower types carry a higher first cost (2 vs 1)
4444 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4445 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4446 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4447 };
4448 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets: i64 CTPOP as a single popcnt
4449 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4450 };
4451 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets: CTPOP for i32/i16/i8; the narrow types add a zext, reflected in their last two costs
4452 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4453 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4454 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4455 };
4456 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets: baseline scalar i64 costs, plus CTLZ/CTTZ rows for i32/i16/i8
4457 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4458 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4459 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4460 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4461 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4462 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4463 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4464 { ISD::CTLZ_ZERO_POISON,MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4465 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4466 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4467 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4468 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4469 { ISD::CTTZ_ZERO_POISON,MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4470 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4471 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4472 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4473 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4474 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4475 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4476 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4477 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4478 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4479 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4480 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4481 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4482 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4483 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4484 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4485 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4486 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4487 };
4488 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4489 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4490 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4491 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4492 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4493 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4494 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4495 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4496 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4497 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4498 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4499 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4500 { ISD::CTLZ_ZERO_POISON,MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4501 { ISD::CTLZ_ZERO_POISON,MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4502 { ISD::CTLZ_ZERO_POISON,MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4503 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4504 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4505 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4506 { ISD::CTTZ_ZERO_POISON,MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4507 { ISD::CTTZ_ZERO_POISON,MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4508 { ISD::CTTZ_ZERO_POISON,MVT::i8, { 2, 2, 1, 2 } }, // BSF
4509 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4510 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4511 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4512 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4513 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4514 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4515 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4516 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4517 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4518 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4519 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4520 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4521 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4522 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4523 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4524 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4525 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4526 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4527 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4528 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4529 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4530 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4531 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4532 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4533 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4534 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4535 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4536 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4537 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4538 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4539 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4540 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4541 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4542 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4543 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4544 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4545 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4546 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4547 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4548 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4549 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4550 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4551 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4552 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4553 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4554 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4555 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4556 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4557 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4558 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4559 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4560 };
4561
4562 Type *RetTy = ICA.getReturnType();
4563 Type *OpTy = RetTy;
4564 Intrinsic::ID IID = ICA.getID();
4565 unsigned ISD = ISD::DELETED_NODE;
4566 switch (IID) {
4567 default:
4568 break;
4569 case Intrinsic::abs:
4570 ISD = ISD::ABS;
4571 break;
4572 case Intrinsic::bitreverse:
4574 break;
4575 case Intrinsic::bswap:
4576 ISD = ISD::BSWAP;
4577 break;
4578 case Intrinsic::ctlz:
4579 ISD = ISD::CTLZ;
4580 break;
4581 case Intrinsic::ctpop:
4582 ISD = ISD::CTPOP;
4583 break;
4584 case Intrinsic::cttz:
4585 ISD = ISD::CTTZ;
4586 break;
4587 case Intrinsic::fshl:
4588 ISD = ISD::FSHL;
4589 if (!ICA.isTypeBasedOnly()) {
4590 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4591 if (Args[0] == Args[1]) {
4592 ISD = ISD::ROTL;
4593 // Handle uniform constant rotation amounts.
4594 // TODO: Handle funnel-shift cases.
4595 const APInt *Amt;
4596 if (Args[2] &&
4598 ISD = X86ISD::VROTLI;
4599 }
4600 }
4601 break;
4602 case Intrinsic::fshr:
4603 // FSHR has same costs so don't duplicate.
4604 ISD = ISD::FSHL;
4605 if (!ICA.isTypeBasedOnly()) {
4606 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4607 if (Args[0] == Args[1]) {
4608 ISD = ISD::ROTR;
4609 // Handle uniform constant rotation amount.
4610 // TODO: Handle funnel-shift cases.
4611 const APInt *Amt;
4612 if (Args[2] &&
4614 ISD = X86ISD::VROTLI;
4615 }
4616 }
4617 break;
4618 case Intrinsic::lrint:
4619 case Intrinsic::llrint: {
4620 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4621 // have the same costs as the CVTTP2SI (fptosi) instructions
4622 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4623 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4625 }
4626 case Intrinsic::maxnum:
4627 case Intrinsic::minnum:
4628 // FMINNUM has same costs so don't duplicate.
4629 ISD = ISD::FMAXNUM;
4630 break;
4631 case Intrinsic::sadd_sat:
4632 ISD = ISD::SADDSAT;
4633 break;
4634 case Intrinsic::smax:
4635 ISD = ISD::SMAX;
4636 break;
4637 case Intrinsic::smin:
4638 ISD = ISD::SMIN;
4639 break;
4640 case Intrinsic::ssub_sat:
4641 ISD = ISD::SSUBSAT;
4642 break;
4643 case Intrinsic::uadd_sat:
4644 ISD = ISD::UADDSAT;
4645 break;
4646 case Intrinsic::umax:
4647 ISD = ISD::UMAX;
4648 break;
4649 case Intrinsic::umin:
4650 ISD = ISD::UMIN;
4651 break;
4652 case Intrinsic::usub_sat:
4653 ISD = ISD::USUBSAT;
4654 break;
4655 case Intrinsic::sqrt:
4656 ISD = ISD::FSQRT;
4657 break;
4658 case Intrinsic::sadd_with_overflow:
4659 case Intrinsic::ssub_with_overflow:
4660 // SSUBO has same costs so don't duplicate.
4661 ISD = ISD::SADDO;
4662 OpTy = RetTy->getContainedType(0);
4663 break;
4664 case Intrinsic::uadd_with_overflow:
4665 case Intrinsic::usub_with_overflow:
4666 // USUBO has same costs so don't duplicate.
4667 ISD = ISD::UADDO;
4668 OpTy = RetTy->getContainedType(0);
4669 break;
4670 case Intrinsic::smul_with_overflow:
4671 ISD = ISD::SMULO;
4672 OpTy = RetTy->getContainedType(0);
4673 break;
4674 case Intrinsic::umul_with_overflow:
4675 ISD = ISD::UMULO;
4676 OpTy = RetTy->getContainedType(0);
4677 break;
4678 }
4679
4680 if (ISD != ISD::DELETED_NODE) {
4681 auto adjustTableCost = [&](int ISD, unsigned Cost,
4682 std::pair<InstructionCost, MVT> LT,
4684 InstructionCost LegalizationCost = LT.first;
4685 MVT MTy = LT.second;
4686
4687 // If there are no NANs to deal with, then these are reduced to a
4688 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4689 // assume is used in the non-fast case.
4690 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4691 if (FMF.noNaNs())
4692 return LegalizationCost * 1;
4693 }
4694
4695 // For cases where some ops can be folded into a load/store, assume free.
4696 if (MTy.isScalarInteger()) {
4697 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4698 if (const Instruction *II = ICA.getInst()) {
4699 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4700 return TTI::TCC_Free;
4701 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4702 if (LI->hasOneUse())
4703 return TTI::TCC_Free;
4704 }
4705 }
4706 }
4707 }
4708
4709 return LegalizationCost * (int)Cost;
4710 };
4711
4712 // Legalize the type.
4713 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4714 MVT MTy = LT.second;
4715
4716 // Without BMI/LZCNT see if we're only looking for a *_ZERO_POISON cost.
4717 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4718 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4719 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4720 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4721 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4722 if (Cst->isAllOnesValue())
4723 ISD =
4725 }
4726
4727 // FSQRT is a single instruction.
4729 return LT.first;
4730
4731 if (ST->useGLMDivSqrtCosts())
4732 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4733 if (auto KindCost = Entry->Cost[CostKind])
4734 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4735
4736 if (ST->useSLMArithCosts())
4737 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4738 if (auto KindCost = Entry->Cost[CostKind])
4739 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4740
4741 if (ST->hasVBMI2())
4742 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4743 if (auto KindCost = Entry->Cost[CostKind])
4744 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4745
4746 if (ST->hasBITALG())
4747 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4748 if (auto KindCost = Entry->Cost[CostKind])
4749 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4750
4751 if (ST->hasVPOPCNTDQ())
4752 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4753 if (auto KindCost = Entry->Cost[CostKind])
4754 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4755
4756 if (ST->hasGFNI())
4757 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4758 if (auto KindCost = Entry->Cost[CostKind])
4759 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4760
4761 if (ST->hasCDI())
4762 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4763 if (auto KindCost = Entry->Cost[CostKind])
4764 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4765
4766 if (ST->hasBWI())
4767 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4768 if (auto KindCost = Entry->Cost[CostKind])
4769 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4770
4771 if (ST->hasAVX512())
4772 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4773 if (auto KindCost = Entry->Cost[CostKind])
4774 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4775
4776 if (ST->hasXOP())
4777 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4778 if (auto KindCost = Entry->Cost[CostKind])
4779 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4780
4781 if (ST->hasAVX2())
4782 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4783 if (auto KindCost = Entry->Cost[CostKind])
4784 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4785
4786 if (ST->hasAVX())
4787 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4788 if (auto KindCost = Entry->Cost[CostKind])
4789 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4790
4791 if (ST->hasSSE42())
4792 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4793 if (auto KindCost = Entry->Cost[CostKind])
4794 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4795
4796 if (ST->hasSSE41())
4797 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4798 if (auto KindCost = Entry->Cost[CostKind])
4799 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4800
4801 if (ST->hasSSSE3())
4802 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4803 if (auto KindCost = Entry->Cost[CostKind])
4804 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4805
4806 if (ST->hasSSE2())
4807 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4808 if (auto KindCost = Entry->Cost[CostKind])
4809 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4810
4811 if (ST->hasSSE1())
4812 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4813 if (auto KindCost = Entry->Cost[CostKind])
4814 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4815
4816 if (ST->hasBMI()) {
4817 if (ST->is64Bit())
4818 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4819 if (auto KindCost = Entry->Cost[CostKind])
4820 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4821
4822 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4823 if (auto KindCost = Entry->Cost[CostKind])
4824 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4825 }
4826
4827 if (ST->hasLZCNT()) {
4828 if (ST->is64Bit())
4829 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4830 if (auto KindCost = Entry->Cost[CostKind])
4831 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4832
4833 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4834 if (auto KindCost = Entry->Cost[CostKind])
4835 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4836 }
4837
4838 if (ST->hasPOPCNT()) {
4839 if (ST->is64Bit())
4840 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4841 if (auto KindCost = Entry->Cost[CostKind])
4842 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4843
4844 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4845 if (auto KindCost = Entry->Cost[CostKind])
4846 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4847 }
4848
4849 if (ST->is64Bit())
4850 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4851 if (auto KindCost = Entry->Cost[CostKind])
4852 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4853
4854 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4855 if (auto KindCost = Entry->Cost[CostKind])
4856 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4857
4858 // Without arg data, we need to compute the expanded costs of custom lowered
4859 // intrinsics to prevent use of the (very low) default costs.
4860 if (ICA.isTypeBasedOnly() &&
4861 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4862 Type *CondTy = RetTy->getWithNewBitWidth(1);
4864 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4865 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4866 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4867 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4868 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4869 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4871 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4873 return Cost;
4874 }
4875 }
4876
4878}
4879
// Cost of inserting or extracting a single vector element on X86.
// (Presumably X86TTIImpl::getVectorInstrCost: the declarator line is not part
// of this listing; the name is taken from the BaseT fallback call at the end.)
//
// Opcode   - instruction opcode; only Instruction::ExtractElement and
//            Instruction::InsertElement get X86-specific handling, anything
//            else goes to BaseT::getVectorInstrCost.
// Val      - the vector type being accessed (asserted to be a vector type).
// CostKind - cost kind forwarded to the memory-op and shuffle cost queries.
// Index    - element index, or -1U when the index is not an immediate.
// Op0, Op1 - optional operand values (may be null); only inspected to
//            recognise cheap insert-into-index-0 patterns.
// VIC      - forwarded unchanged to BaseT::getVectorInstrCost.
//
// NOTE(review): listing lines 4880, 4893, 4981 and 4983 are absent from this
// capture; the comments below describe only the statements that are visible.
4881 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4882 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Extract costs consulted when the subtarget reports useSLMArithCosts(); the
// lookup below is keyed by the legalized scalar element type.
4883 static const CostTblEntry SLMCostTbl[] = {
4884 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4885 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4886 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4887 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4888 };
4889
4890 assert(Val->isVectorTy() && "This must be a vector type");
4891 auto *VT = cast<VectorType>(Val);
4892 if (VT->isScalableTy())
4894
4895 Type *ScalarType = Val->getScalarType();
// Extra cost for moving a 128-bit subvector; it is only raised below when the
// element lives outside the first 128-bit chunk of a wider legal vector.
4896 InstructionCost RegisterFileMoveCost = 0;
4897
4898 // Non-immediate extraction/insertion can be handled as a sequence of
4899 // aliased loads+stores via the stack.
4900 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4901 Opcode == Instruction::InsertElement)) {
4902 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4903 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4904
4905 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4906 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4907 Align VecAlign = DL.getPrefTypeAlign(Val);
4908 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4909
4910 // Extract - store vector to stack, load scalar.
4911 if (Opcode == Instruction::ExtractElement) {
4912 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4913 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4914 CostKind);
4915 }
4916 // Insert - store vector to stack, store scalar, load vector.
4917 if (Opcode == Instruction::InsertElement) {
4918 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4919 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4920 CostKind) +
4921 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4922 }
4923 }
4924
// Immediate-index insert/extract: every path inside this block returns.
4925 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4926 Opcode == Instruction::InsertElement)) {
4927 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4928 if (Opcode == Instruction::ExtractElement &&
4929 ScalarType->getScalarSizeInBits() == 1 &&
4930 cast<FixedVectorType>(Val)->getNumElements() > 1)
4931 return 1;
4932
4933 // Legalize the type.
4934 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4935
4936 // This type is legalized to a scalar type.
4937 if (!LT.second.isVector())
4938 return TTI::TCC_Free;
4939
4940 // The type may be split. Normalize the index to the new type.
4941 unsigned SizeInBits = LT.second.getSizeInBits();
4942 unsigned NumElts = LT.second.getVectorNumElements();
4943 unsigned SubNumElts = NumElts;
4944 Index = Index % NumElts;
4945
4946 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4947 // For inserts, we also need to insert the subvector back.
4948 if (SizeInBits > 128) {
4949 assert((SizeInBits % 128) == 0 && "Illegal vector");
4950 unsigned NumSubVecs = SizeInBits / 128;
4951 SubNumElts = NumElts / NumSubVecs;
4952 if (SubNumElts <= Index) {
// Insert pays for subvector extract + re-insert (2); extract only for the
// subvector extract (1). Index then becomes relative to the 128-bit chunk.
4953 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4954 Index %= SubNumElts;
4955 }
4956 }
4957
4958 MVT MScalarTy = LT.second.getScalarType();
// Evaluated lazily: the lambda captures Index/Opcode by reference and is
// called both inside the Index == 0 handling and after the SLM table lookup.
4959 auto IsCheapPInsrPExtrInsertPS = [&]() {
4960 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4961 // Inserting f32 into index0 is just movss.
4962 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4963 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4964 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4965 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4966 Opcode == Instruction::InsertElement) ||
4967 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4968 Opcode == Instruction::InsertElement);
4969 };
4970
4971 if (Index == 0) {
4972 // Floating point scalars are already located in index #0.
4973 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4974 // true for all.
4975 if (ScalarType->isFloatingPointTy() &&
4976 (Opcode != Instruction::InsertElement || !Op0 ||
4977 isa<UndefValue>(Op0)))
4978 return RegisterFileMoveCost;
4979
4980 if (Opcode == Instruction::InsertElement &&
4982 // Consider the gather cost to be cheap.
4984 return RegisterFileMoveCost;
4985 if (!IsCheapPInsrPExtrInsertPS()) {
4986 // mov constant-to-GPR + movd/movq GPR -> XMM.
4987 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4988 return 2 + RegisterFileMoveCost;
4989 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4990 return 1 + RegisterFileMoveCost;
4991 }
4992 }
4993
4994 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4995 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4996 return 1 + RegisterFileMoveCost;
4997 }
4998
4999 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5000 assert(ISD && "Unexpected vector opcode");
5001 if (ST->useSLMArithCosts())
5002 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
5003 return Entry->Cost + RegisterFileMoveCost;
5004
5005 // Consider cheap cases.
5006 if (IsCheapPInsrPExtrInsertPS())
5007 return 1 + RegisterFileMoveCost;
5008
5009 // For extractions we just need to shuffle the element to index 0, which
5010 // should be very cheap (assume cost = 1). For insertions we need to shuffle
5011 // the element to its destination. In both cases we must handle the
5012 // subvector move(s).
5013 // If the vector type is already less than 128-bits then don't reduce it.
5014 // TODO: Under what circumstances should we shuffle using the full width?
5015 InstructionCost ShuffleCost = 1;
5016 if (Opcode == Instruction::InsertElement) {
5017 auto *SubTy = cast<VectorType>(Val);
// Note: this EVT deliberately-or-not shadows the VectorType* named VT above.
5018 EVT VT = TLI->getValueType(DL, Val);
5019 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
5020 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
5021 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
5022 CostKind, 0, SubTy);
5023 }
// Integer elements add one unit on top of the shuffle; floating-point
// elements add nothing.
5024 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
5025 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
5026 }
5027
// Any other opcode: defer to the generic implementation.
5028 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
5029 VIC) +
5030 RegisterFileMoveCost;
5031}
5032
// Estimate the cost of building a fixed vector from scalars (Insert) and/or
// pulling scalars out of it (Extract), for the elements set in DemandedElts.
// (Presumably X86TTIImpl::getScalarizationOverhead: the declarator line is
// not part of this listing; the name is taken from the BaseT calls below.)
//
// Ty           - vector type; cast to FixedVectorType, and its element count
//                must equal DemandedElts' bit width (asserted).
// DemandedElts - one bit per element of Ty; set bits are the elements that
//                are inserted and/or extracted.
// Insert       - include the cost of inserting the demanded elements.
// Extract      - include the cost of extracting the demanded elements.
// CostKind     - cost kind forwarded to the nested cost queries.
// ForPoisonSrc - SLPVectorizer hint about a poison input (see below); when it
//                is false and Insert is set, each demanded element is costed
//                as an individual InsertElement and the function returns.
// VL           - optional values being inserted; may be empty, otherwise it
//                is indexed by element number.
// VIC          - not referenced on any visible line (likely consumed on an
//                absent line -- TODO confirm).
//
// NOTE(review): listing lines 5033, 5044, 5067, 5069, 5119, 5137, 5155, 5197
// and 5199 are absent from this capture. In particular the declaration of
// `Cost` and the heads of several `Cost += ...` calls are not visible; the
// comments below describe only what is shown.
5034 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
5035 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
5036 TTI::VectorInstrContext VIC) const {
5037 assert(DemandedElts.getBitWidth() ==
5038 cast<FixedVectorType>(Ty)->getNumElements() &&
5039 "Vector size mismatch");
5040
5041 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5042 MVT MScalarTy = LT.second.getScalarType();
5043 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
5045
// Costs are reasoned about per 128-bit lane of the legalized vector type.
5046 constexpr unsigned LaneBitWidth = 128;
5047 assert((LegalVectorBitWidth < LaneBitWidth ||
5048 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
5049 "Illegal vector");
5050
// LT.first doubles as the number of legal vectors Ty is split into.
5051 const int NumLegalVectors = LT.first.getValue();
5052 assert(NumLegalVectors >= 0 && "Negative cost!");
5053
5054 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
5055 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
5056 // a special heuristic regarding poison input which is passed here in
5057 // ForPoisonSrc.
5058 if (Insert && !ForPoisonSrc) {
5059 // This is nearly identical to BaseT::getScalarizationOverhead(), except
5060 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
5061 // Constant::getNullValue()), which makes the X86TTIImpl
5062 // getVectorInstrCost() return 0 instead of 1.
5063 for (unsigned I : seq(DemandedElts.getBitWidth())) {
5064 if (!DemandedElts[I])
5065 continue;
5066 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
5068 VL.empty() ? nullptr : VL[I],
5070 }
5071 return Cost;
5072 }
5073
5074 if (Insert) {
// Same "direct insert is cheap" element-type/feature test that
// getVectorInstrCost applies for pinsr*/insertps.
5075 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5076 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5077 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5078 // For types we can insert directly, insertion into 128-bit sub vectors is
5079 // cheap, followed by a cheap chain of concatenations.
5080 if (LegalVectorBitWidth <= LaneBitWidth) {
5081 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5082 /*Extract*/ false, CostKind);
5083 } else {
5084 // In each 128-lane, if at least one index is demanded but not all
5085 // indices are demanded and this 128-lane is not the first 128-lane of
5086 // the legalized-vector, then this 128-lane needs an extracti128; if in
5087 // each 128-lane, there is at least one demanded index, this 128-lane
5088 // needs an inserti128.
5089
5090 // The following cases will help you build a better understanding:
5091 // Assume we insert several elements into a v8i32 vector in avx2,
5092 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
5093 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
5094 // inserti128.
5095 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
5096 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5097 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5098 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5099 unsigned NumLegalElts =
5100 LT.second.getVectorNumElements() * NumLegalVectors;
5101 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5102 "Vector has been legalized to smaller element count");
5103 assert((NumLegalElts % NumLanesTotal) == 0 &&
5104 "Unexpected elts per lane");
5105 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5106
// Zero-extend the demanded mask so it also covers legalization padding.
5107 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5108 auto *LaneTy =
5109 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5110
// Pass 1: per 128-bit lane, cost the element inserts (and, for partially
// demanded lanes, the statement on the absent line 5119).
5111 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5112 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5113 NumEltsPerLane, NumEltsPerLane * I);
5114 if (LaneEltMask.isZero())
5115 continue;
5116 // FIXME: we don't need to extract if all non-demanded elements
5117 // are legalization-inserted padding.
5118 if (!LaneEltMask.isAllOnes())
5120 CostKind, I * NumEltsPerLane, LaneTy);
5121 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5122 /*Extract*/ false, CostKind);
5123 }
5124
// Pass 2: one bit per lane that has any demanded element, and one bit per
// legal vector whose lanes are all affected.
5125 APInt AffectedLanes =
5126 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5127 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5128 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5129 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5130 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5131 unsigned I = NumLegalLanes * LegalVec + Lane;
5132 // No need to insert unaffected lane; or lane 0 of each legal vector
5133 // iff ALL lanes of that vector were affected and will be inserted.
5134 if (!AffectedLanes[I] ||
5135 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5136 continue;
5138 CostKind, I * NumEltsPerLane, LaneTy);
5139 }
5140 }
5141 }
5142 } else if (LT.second.isVector()) {
5143 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5144 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5145 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5146 // considered cheap.
5147 if (Ty->isIntOrIntVectorTy())
5148 Cost += DemandedElts.popcount();
5149
5150 // Get the smaller of the legalized or original pow2-extended number of
5151 // vector elements, which represents the number of unpacks we'll end up
5152 // performing.
5153 unsigned NumElts = LT.second.getVectorNumElements();
5154 unsigned Pow2Elts =
5156 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5157 }
5158 }
5159
5160 if (Extract) {
5161 // vXi1 can be efficiently extracted with MOVMSK.
5162 // TODO: AVX512 predicate mask handling.
5163 // NOTE: This doesn't work well for roundtrip scalarization.
5164 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
// One unit per MaxElts elements (32 with AVX2, otherwise 16), rounded up.
// This path returns the MOVMSK count directly rather than adding to Cost.
5165 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5166 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5167 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5168 return MOVMSKCost;
5169 }
5170
5171 if (LT.second.isVector()) {
5172 unsigned NumLegalElts =
5173 LT.second.getVectorNumElements() * NumLegalVectors;
5174 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5175 "Vector has been legalized to smaller element count");
5176
5177 // If we're extracting elements from a 128-bit subvector lane,
5178 // we only need to extract each lane once, not for every element.
5179 if (LegalVectorBitWidth > LaneBitWidth) {
5180 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5181 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5182 assert((NumLegalElts % NumLanesTotal) == 0 &&
5183 "Unexpected elts per lane");
5184 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5185
5186 // Add cost for each demanded 128-bit subvector extraction.
5187 // Luckily this is a lot easier than for insertion.
5188 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5189 auto *LaneTy =
5190 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5191
5192 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5193 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5194 NumEltsPerLane, I * NumEltsPerLane);
5195 if (LaneEltMask.isZero())
5196 continue;
5198 I * NumEltsPerLane, LaneTy);
5200 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5201 }
5202
5203 return Cost;
5204 }
5205 }
5206
5207 // Fallback to default extraction.
5208 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5209 Extract, CostKind);
5210 }
5211
5212 return Cost;
5213}
5214
// Cost of a replication shuffle that turns a VF-element source vector into a
// (VF * ReplicationFactor)-element destination vector.
//
// EltTy             - element type; only its bit width matters, it is
//                     replaced by an integer type of the same width.
// ReplicationFactor - multiplier applied to VF to get the destination size.
// VF                - number of source elements.
// DemandedDstElts   - mask over destination elements; destination vectors
//                     with no demanded element are not charged.
// CostKind          - used below; its declaration sits on a line that is not
//                     part of this listing.
//
// Only AVX512 subtargets are modelled here; everything else, unsupported
// element widths, and types that do not legalize to vectors fall back to
// BaseT::getReplicationShuffleCost via `bailout`.
//
// NOTE(review): listing lines 5215, 5218, 5286 and 5290 are absent from this
// capture (return type, end of the parameter list, and the trailing arguments
// of both getCastInstrCost calls).
5216X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5217 int VF, const APInt &DemandedDstElts,
5219 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5220 // We don't differentiate element types here, only element bit width.
5221 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5222
// Generic fallback; note it is invoked with the integer-normalized EltTy.
5223 auto bailout = [&]() {
5224 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5225 DemandedDstElts, CostKind);
5226 };
5227
5228 // For now, only deal with AVX512 cases.
5229 if (!ST->hasAVX512())
5230 return bailout();
5231
5232 // Do we have a native shuffle for this element type, or should we promote?
5233 unsigned PromEltTyBits = EltTyBits;
5234 switch (EltTyBits) {
5235 case 32:
5236 case 64:
5237 break; // AVX512F.
5238 case 16:
5239 if (!ST->hasBWI())
5240 PromEltTyBits = 32; // promote to i32, AVX512F.
5241 break; // AVX512BW
5242 case 8:
5243 if (!ST->hasVBMI())
5244 PromEltTyBits = 32; // promote to i32, AVX512F.
5245 break; // AVX512VBMI
5246 case 1:
5247 // There is no support for shuffling i1 elements. We *must* promote.
5248 if (ST->hasBWI()) {
5249 if (ST->hasVBMI())
5250 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5251 else
5252 PromEltTyBits = 16; // promote to i16, AVX512BW.
5253 break;
5254 }
5255 PromEltTyBits = 32; // promote to i32, AVX512F.
5256 break;
5257 default:
5258 return bailout();
5259 }
5260 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5261
5262 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5263 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5264
5265 int NumDstElements = VF * ReplicationFactor;
5266 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5267 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5268
5269 // Legalize the types.
5270 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5271 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5272 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5273 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5274 // They should have legalized into vector types.
5275 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5276 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5277 return bailout();
5278
5279 if (PromEltTyBits != EltTyBits) {
5280 // If we have to perform the shuffle with wider elt type than our data type,
5281 // then we will first need to anyext (we don't care about the new bits)
5282 // the source elements, and then truncate Dst elements.
// NOTE(review): the extension is costed as Instruction::SExt even though the
// comment above describes an anyext -- presumably a stand-in; confirm.
5283 InstructionCost PromotionCost;
5284 PromotionCost += getCastInstrCost(
5285 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5287 PromotionCost +=
5288 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5289 /*Src=*/PromDstVecTy,
// Recurse once with the promoted element type; on that call PromEltTyBits
// equals EltTyBits, so this branch is not taken again.
5291 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5292 ReplicationFactor, VF,
5293 DemandedDstElts, CostKind);
5294 }
5295
5296 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5297 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5298 "We expect that the legalization doesn't affect the element width, "
5299 "doesn't coalesce/split elements.");
5300
// Number of legal destination vectors needed to hold all Dst elements.
5301 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5302 unsigned NumDstVectors =
5303 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5304
5305 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5306
5307 // Not all the produced Dst elements may be demanded. In our case,
5308 // given that a single Dst vector is formed by a single shuffle,
5309 // if all elements that will form a single Dst vector aren't demanded,
5310 // then we won't need to do that shuffle, so adjust the cost accordingly.
5311 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5312 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5313 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5314
// Final cost: one single-source permute per demanded destination vector.
5315 InstructionCost SingleShuffleCost =
5316 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5317 /*Mask=*/{}, CostKind,
5318 /*Index=*/0, /*SubTp=*/nullptr);
5319 return NumDstVectorsDemanded * SingleShuffleCost;
5320}
5321
5323 Align Alignment,
5324 unsigned AddressSpace,
5326 TTI::OperandValueInfo OpInfo,
5327 const Instruction *I) const {
// Estimates the cost of a plain load or store of type Src.
//  - Load latency queries and types that map to MVT::Other are delegated to
//    the base implementation.
//  - Non-vector (or scalar-legalized) types cost one unit per legalized part.
//  - Fixed vectors are costed by walking the elements with progressively
//    halved operation sizes, adding subvector insert/extract overhead.
5328 // FIXME: Load latency isn't handled here
5329 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
5330 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5331 CostKind, OpInfo, I);
5332
5333 // TODO: Handle other cost kinds.
// Simplified estimate: a store whose address comes from a GEP with at least
// one non-constant index costs 2 * TCC_Basic; anything else costs TCC_Basic.
5335 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5336 // Store instruction with index and scale costs 2 Uops.
5337 // Check the preceding GEP to identify non-const indices.
5338 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5339 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5340 return TTI::TCC_Basic * 2;
5341 }
5342 }
5343 return TTI::TCC_Basic;
5344 }
5345
5346 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5347 "Invalid Opcode");
5348 // Type legalization can't handle structs
5349 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5350 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5351 CostKind, OpInfo, I);
5352
5353 // Legalize the type.
5354 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5355
// VTy is null when Src is not a fixed-width vector type.
5356 auto *VTy = dyn_cast<FixedVectorType>(Src);
5357
5359
5360 // Add a cost for constant load to vector.
5361 if (Opcode == Instruction::Store && OpInfo.isConstant())
5362 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5363 /*AddressSpace=*/0, CostKind, OpInfo);
5364
5365 // Handle the simple case of non-vectors.
5366 // NOTE: this assumes that legalization never creates vector from scalars!
5367 if (!VTy || !LT.second.isVector()) {
5368 // Each load/store unit costs 1.
// Only floating-point legalized types keep the Cost accumulated so far;
// for every other type it is dropped and just LT.first is returned.
5369 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5370 }
5371
5372 bool IsLoad = Opcode == Instruction::Load;
5373
5374 Type *EltTy = VTy->getElementType();
5375
5376 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5377
5378 // Source of truth: how many elements were there in the original IR vector?
5379 const unsigned SrcNumElt = VTy->getNumElements();
5380
5381 // How far have we gotten?
5382 int NumEltRemaining = SrcNumElt;
5383 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5384 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5385
// Widest single memory operation considered: the size of the legalized type.
5386 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5387
5388 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5389 const unsigned XMMBits = 128;
5390 if (XMMBits % EltTyBits != 0)
5391 // Vector size must be a multiple of the element size. I.e. no padding.
5392 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5393 CostKind, OpInfo, I);
5394 const int NumEltPerXMM = XMMBits / EltTyBits;
5395
5396 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5397
// Outer loop: try the widest legal op size first and halve it each time the
// inner loop can no longer make progress with the current size.
// SubVecEltsLeft tracks how many elements of the current subvector are still
// unprocessed.
5398 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5399 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5400 // How many elements would a single op deal with at once?
5401 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5402 // Vector size must be a multiple of the element size. I.e. no padding.
5403 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5404 CostKind, OpInfo, I);
5405 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5406
5407 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5408 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5409 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5410 "Unless we haven't halved the op size yet, "
5411 "we have less than two op's sized units of work left.");
5412
// Ops narrower than an XMM register are still modeled on a full XMM vector.
5413 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5414 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5415 : XMMVecTy;
5416
5417 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5418 "After halving sizes, the vector elt count is no longer a multiple "
5419 "of number of elements per operation?");
// View the current vector as a vector of integers that are each as wide as
// one memory op, so a single op corresponds to a single (coalesced) element.
5420 auto *CoalescedVecTy =
5421 CurrNumEltPerOp == 1
5422 ? CurrVecTy
5424 IntegerType::get(Src->getContext(),
5425 EltTyBits * CurrNumEltPerOp),
5426 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5427 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5428 DL.getTypeSizeInBits(CurrVecTy) &&
5429 "coalesciing elements doesn't change vector width.");
5430
// Inner loop: issue as many ops of the current size as the remaining element
// count (and alignment, for loads) allows.
5431 while (NumEltRemaining > 0) {
5432 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5433
5434 // Can we use this vector size, as per the remaining element count?
5435 // Iff the vector is naturally aligned, we can do a wide load regardless.
5436 if (NumEltRemaining < CurrNumEltPerOp &&
5437 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5438 break; // Try smaller vector size.
5439
5440 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5441 // as a proxy for a double-pumped AVX memory interface such as on
5442 // Sandybridge.
5443 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5444 // will be scalarized.
5445 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5446 Cost += 2;
5447 else if (CurrOpSizeBytes < 4)
5448 Cost += 2;
5449 else
5450 Cost += 1;
5451
5452 // If we're loading a uniform value, then we don't need to split the load,
5453 // loading just a single (widest) vector can be reused by all splits.
5454 if (IsLoad && OpInfo.isUniform())
5455 return Cost;
5456
// True when the elements processed so far fill a whole number of legalized
// vectors, i.e. the next op starts a new legalized vector.
5457 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5458
5459 // If we have fully processed the previous reg, we need to replenish it.
5460 if (SubVecEltsLeft == 0) {
5461 SubVecEltsLeft += CurrVecTy->getNumElements();
5462 // And that's free only for the 0'th subvector of a legalized vector.
5463 if (!Is0thSubVec)
5464 Cost +=
5467 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5468 }
5469
5470 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5471 // for smaller widths (32/16/8) we have to insert/extract them separately.
5472 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5473 // but let's pretend that it is also true for 16/8 bit wide ops...)
5474 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5475 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5476 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5477 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
// Demand exactly the one coalesced element this op reads or writes.
5478 APInt DemandedElts =
5479 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5480 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5481 assert(DemandedElts.popcount() == 1 && "Inserting single value");
// Loads pay for an insert, stores pay for an extract.
5482 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5483 !IsLoad, CostKind);
5484 }
5485
5486 SubVecEltsLeft -= CurrNumEltPerOp;
5487 NumEltRemaining -= CurrNumEltPerOp;
// Update the known alignment to account for the bytes just consumed.
5488 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5489 }
5490 }
5491
5492 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5493
5494 return Cost;
5495}
5496
// Dispatch on the memory intrinsic ID carried by MICA: gather/scatter are
// costed by getGatherScatterOpCost, masked load/store by
// getMaskedMemoryOpCost.
5500 switch (MICA.getID()) {
5501 case Intrinsic::masked_scatter:
5502 case Intrinsic::masked_gather:
5503 return getGatherScatterOpCost(MICA, CostKind);
5504 case Intrinsic::masked_load:
5505 case Intrinsic::masked_store:
5506 return getMaskedMemoryOpCost(MICA, CostKind);
5507 }
5509}
5510
// Cost of the masked load/store described by MICA.
//  - Non-fixed-vector data types fall back to the unmasked memory op cost.
//  - If the masked op is not legal for the type, the cost of a scalarized
//    expansion is returned.
//  - Otherwise the cost is derived from type legalization, plus shuffles for
//    promotion/widening of data and mask.
// Anything other than masked_load is treated as a store.
5514 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5515 : Instruction::Store;
5516 Type *SrcTy = MICA.getDataType();
5517 Align Alignment = MICA.getAlignment();
5518 unsigned AddressSpace = MICA.getAddressSpace();
5519
5520 bool IsLoad = (Instruction::Load == Opcode);
5521 bool IsStore = (Instruction::Store == Opcode);
5522
5523 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5524 if (!SrcVTy)
5525 // To calculate scalar take the regular cost, without mask
5526 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5527
5528 unsigned NumElem = SrcVTy->getNumElements();
// The mask is modeled as a vector of i8 with one element per data element.
5529 auto *MaskTy =
5530 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5531 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5532 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5533 // Scalarization
// Scalarized cost = per-element scalar memory ops + splitting the data
// vector + extracting every mask element + a compare and branch per element.
5534 APInt DemandedElts = APInt::getAllOnes(NumElem);
5536 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5537 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5538 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5540 InstructionCost BranchCost = getCFInstrCost(Instruction::CondBr, CostKind);
5541 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5543 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5544 InstructionCost MemopCost =
5545 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5546 Alignment, AddressSpace, CostKind);
5547 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5548 }
5549
5550 // Legalize the type.
5551 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5552 auto VT = TLI->getValueType(DL, SrcVTy);
5554 MVT Ty = LT.second;
5555 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5556 // APX masked load/store for scalar is cheap.
5557 return Cost + LT.first;
5558
// Legalized type differs from the IR type but keeps the element count:
// the elements were promoted.
5559 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5560 LT.second.getVectorNumElements() == NumElem)
5561 // Promotion requires extend/truncate for data and a shuffle for mask.
5562 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5563 0, nullptr) +
5564 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5565 0, nullptr);
5566
// The legalized registers hold more elements than the IR vector has:
// the mask has to be widened to cover the extra lanes.
5567 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5568 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5569 (unsigned)LT.first.getValue() *
5570 Ty.getVectorNumElements());
5571 // Expanding requires fill mask with zeroes
5572 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5573 CostKind, 0, MaskTy);
5574 }
5575
5576 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5577 if (!ST->hasAVX512())
5578 return Cost + LT.first * (IsLoad ? 2 : 8);
5579
5580 // AVX-512 masked load/store is cheaper
5581 return Cost + LT.first;
5582}
5583
5585 ArrayRef<const Value *> Ptrs, const Value *Base,
5586 const TTI::PointersChainInfo &Info, Type *AccessTy,
// Cost of computing a chain of pointers. When all pointers share a base and
// have a known stride, only the base address computation is charged;
// every other case is handled by the generic implementation.
5588 if (Info.isSameBase() && Info.isKnownStride()) {
5589 // If all the pointers have known stride all the differences are translated
5590 // into constants. X86 memory addressing allows encoding it into
5591 // displacement. So we just need to take the base GEP cost.
5592 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5593 SmallVector<const Value *> Indices(BaseGEP->indices());
5594 return getGEPCost(BaseGEP->getSourceElementType(),
5595 BaseGEP->getPointerOperand(), Indices, nullptr,
5596 CostKind);
5597 }
// Base is not a GEP instruction: the whole chain is treated as free.
5598 return TTI::TCC_Free;
5599 }
5600 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5601}
5602
5605 const SCEV *Ptr,
// Cost of the address computation for an access through PtrTy.
// Only vector pointer types on subtargets without AVX2, and only when
// ScalarEvolution is available, get X86-specific treatment:
//  - a non-strided access costs NumVectorInstToHideOverhead,
//  - a strided access without a constant step costs 1.
// Everything else is delegated to the base implementation.
5607 // Address computations in vectorized code with non-consecutive addresses will
5608 // likely result in more instructions compared to scalar code where the
5609 // computation can more often be merged into the index mode. The resulting
5610 // extra micro-ops can significantly decrease throughput.
5611 const unsigned NumVectorInstToHideOverhead = 10;
5612
5613 // Cost modeling of Strided Access Computation is hidden by the indexing
5614 // modes of X86 regardless of the stride value. We don't believe that there
5615 // is a difference between constant strided access in general and constant
5616 // strided value which is less than or equal to 64.
5617 // Even in the case of (loop invariant) stride whose value is not known at
5618 // compile time, the address computation will not incur more than one extra
5619 // ADD instruction.
5620 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5621 // TODO: AVX2 is the current cut-off because we don't have correct
5622 // interleaving costs for prior ISA's.
5623 if (!BaseT::isStridedAccess(Ptr))
5624 return NumVectorInstToHideOverhead;
5625 if (!BaseT::getConstantStrideStep(SE, Ptr))
5626 return 1;
5627 }
5628
5629 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5630}
5631
5634 std::optional<FastMathFlags> FMF,
5637 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5638
// Cost of reducing vector ValTy with the arithmetic operation Opcode.
// Lookup order:
//  1. per-feature tables keyed on the un-legalized simple type,
//  2. vXi8 multiply special case (performed as vXi16),
//  3. per-feature tables keyed on the legalized type (plus split cost),
//  4. i1 element (bool) reductions,
//  5. a generic halving model for power-of-2 element counts.
5639 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5640 // and use it as the cost.
5641
5642 static const CostTblEntry SLMCostTbl[] = {
5643 { ISD::FADD, MVT::v2f64, 3 },
5644 { ISD::ADD, MVT::v2i64, 5 },
5645 };
5646
5647 static const CostTblEntry SSE2CostTbl[] = {
5648 { ISD::FADD, MVT::v2f64, 2 },
5649 { ISD::FADD, MVT::v2f32, 2 },
5650 { ISD::FADD, MVT::v4f32, 4 },
5651 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5652 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5653 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5654 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5655 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5656 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5657 { ISD::ADD, MVT::v2i8, 2 },
5658 { ISD::ADD, MVT::v4i8, 2 },
5659 { ISD::ADD, MVT::v8i8, 2 },
5660 { ISD::ADD, MVT::v16i8, 3 },
5661 };
5662
5663 static const CostTblEntry AVX1CostTbl[] = {
5664 { ISD::FADD, MVT::v4f64, 3 },
5665 { ISD::FADD, MVT::v4f32, 3 },
5666 { ISD::FADD, MVT::v8f32, 4 },
5667 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5668 { ISD::ADD, MVT::v4i64, 3 },
5669 { ISD::ADD, MVT::v8i32, 5 },
5670 { ISD::ADD, MVT::v16i16, 5 },
5671 { ISD::ADD, MVT::v32i8, 4 },
5672 };
5673
5674 static const CostTblEntry AVX512FCostTbl[] = {
5675 { ISD::FADD, MVT::v8f64, 4 },
5676 { ISD::FADD, MVT::v16f32, 5 },
5677 { ISD::ADD, MVT::v8i64, 4 },
5678 { ISD::ADD, MVT::v16i32, 6 },
5679 };
5680
5681 static const CostTblEntry AVX512BWCostTbl[] = {
5682 { ISD::ADD, MVT::v32i16, 7 },
5683 { ISD::ADD, MVT::v64i8, 4 },
5684 };
5685
5686 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5687 assert(ISD && "Invalid opcode");
5688
5689 // Before legalizing the type, give a chance to look up illegal narrow types
5690 // in the table.
5691 // FIXME: Is there a better way to do this?
5692 EVT VT = TLI->getValueType(DL, ValTy);
5693 if (VT.isSimple()) {
5694 MVT MTy = VT.getSimpleVT();
5695 if (ST->useSLMArithCosts())
5696 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5697 return Entry->Cost;
5698
5699 if (ST->hasBWI())
5700 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5701 return Entry->Cost;
5702
5703 if (ST->hasAVX512())
5704 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
5705 return Entry->Cost;
5706
5707 if (ST->hasAVX())
5708 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5709 return Entry->Cost;
5710
5711 if (ST->hasSSE2())
5712 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5713 return Entry->Cost;
5714 }
5715
5716 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5717
5718 MVT MTy = LT.second;
5719
5720 auto *ValVTy = cast<FixedVectorType>(ValTy);
5721
5722 // Special case: vXi8 mul reductions are performed as vXi16.
5723 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5724 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5725 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
// Cost = zero-extend to the i16 vector + the same reduction on that vector.
5726 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5728 CostKind) +
5729 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5730 }
5731
5732 InstructionCost ArithmeticCost = 0;
5733 if (LT.first != 1 && MTy.isVector() &&
5734 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5735 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5736 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5737 MTy.getVectorNumElements());
5738 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5739 ArithmeticCost *= LT.first - 1;
5740 }
5741
// Table lookups on the legalized type. Unlike the pre-legalization lookup
// above, only the SLM, AVX1 and SSE2 tables are consulted here.
5742 if (ST->useSLMArithCosts())
5743 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5744 return ArithmeticCost + Entry->Cost;
5745
5746 if (ST->hasAVX())
5747 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5748 return ArithmeticCost + Entry->Cost;
5749
5750 if (ST->hasSSE2())
5751 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5752 return ArithmeticCost + Entry->Cost;
5753
5754 // FIXME: These assume a naive kshift+binop lowering, which is probably
5755 // conservative in most cases.
5756 static const CostTblEntry AVX512BoolReduction[] = {
5757 { ISD::AND, MVT::v2i1, 3 },
5758 { ISD::AND, MVT::v4i1, 5 },
5759 { ISD::AND, MVT::v8i1, 7 },
5760 { ISD::AND, MVT::v16i1, 9 },
5761 { ISD::AND, MVT::v32i1, 11 },
5762 { ISD::AND, MVT::v64i1, 13 },
5763 { ISD::OR, MVT::v2i1, 3 },
5764 { ISD::OR, MVT::v4i1, 5 },
5765 { ISD::OR, MVT::v8i1, 7 },
5766 { ISD::OR, MVT::v16i1, 9 },
5767 { ISD::OR, MVT::v32i1, 11 },
5768 { ISD::OR, MVT::v64i1, 13 },
5769 };
5770
5771 static const CostTblEntry AVX2BoolReduction[] = {
5772 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5773 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5774 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5775 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5776 };
5777
5778 static const CostTblEntry AVX1BoolReduction[] = {
5779 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5780 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5781 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5782 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5783 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5784 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5785 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5786 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5787 };
5788
5789 static const CostTblEntry SSE2BoolReduction[] = {
5790 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5791 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5792 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5793 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5794 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5795 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5796 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5797 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5798 };
5799
5800 // Handle bool allof/anyof patterns.
5801 if (ValVTy->getElementType()->isIntegerTy(1)) {
5802 if (ISD == ISD::ADD) {
5803 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5804 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5805 ValVTy->getNumElements());
5806 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5807 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5809 CostKind) +
5811 }
5812
// This local shadows the outer ArithmeticCost and is computed the same way.
5813 InstructionCost ArithmeticCost = 0;
5814 if (LT.first != 1 && MTy.isVector() &&
5815 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5816 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5817 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5818 MTy.getVectorNumElements());
5819 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5820 ArithmeticCost *= LT.first - 1;
5821 }
5822
5823 if (ST->hasAVX512())
5824 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5825 return ArithmeticCost + Entry->Cost;
5826 if (ST->hasAVX2())
5827 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5828 return ArithmeticCost + Entry->Cost;
5829 if (ST->hasAVX())
5830 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5831 return ArithmeticCost + Entry->Cost;
5832 if (ST->hasSSE2())
5833 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5834 return ArithmeticCost + Entry->Cost;
5835
5836 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5837 }
5838
5839 unsigned NumVecElts = ValVTy->getNumElements();
5840 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5841
5842 // Special case power of 2 reductions where the scalar type isn't changed
5843 // by type legalization.
5844 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5845 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5846
5847 InstructionCost ReductionCost = 0;
5848
5849 auto *Ty = ValVTy;
5850 if (LT.first != 1 && MTy.isVector() &&
5851 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5852 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5853 Ty = FixedVectorType::get(ValVTy->getElementType(),
5854 MTy.getVectorNumElements());
5855 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5856 ReductionCost *= LT.first - 1;
5857 NumVecElts = MTy.getVectorNumElements();
5858 }
5859
5860 // Now handle reduction with the legal type, taking into account size changes
5861 // at each level.
// Each iteration halves the element count and adds the cost of bringing the
// upper half down (extract/shuffle/shift) plus one arithmetic op.
5862 while (NumVecElts > 1) {
5863 // Determine the size of the remaining vector we need to reduce.
5864 unsigned Size = NumVecElts * ScalarSize;
5865 NumVecElts /= 2;
5866 // If we're reducing from 256/512 bits, use an extract_subvector.
5867 if (Size > 128) {
5868 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5869 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5870 CostKind, NumVecElts, SubTy);
5871 Ty = SubTy;
5872 } else if (Size == 128) {
5873 // Reducing from 128 bits is a permute of v2f64/v2i64.
5874 FixedVectorType *ShufTy;
// NOTE(review): ValVTy is a FixedVectorType (see the cast above). If
// isFloatingPointTy() is true only for scalar FP types, the FP shuffle type
// is never selected here - confirm whether isFPOrFPVectorTy() was intended.
5875 if (ValVTy->isFloatingPointTy())
5876 ShufTy =
5877 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5878 else
5879 ShufTy =
5880 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5881 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5882 {}, CostKind, 0, nullptr);
5883 } else if (Size == 64) {
5884 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5885 FixedVectorType *ShufTy;
// NOTE(review): same isFloatingPointTy()-on-a-vector question as above.
5886 if (ValVTy->isFloatingPointTy())
5887 ShufTy =
5888 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5889 else
5890 ShufTy =
5891 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5892 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5893 {}, CostKind, 0, nullptr);
5894 } else {
5895 // Reducing from smaller size is a shift by immediate.
// The shift is modeled on a 128-bit vector of Size-bit integers.
5896 auto *ShiftTy = FixedVectorType::get(
5897 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5898 ReductionCost += getArithmeticInstrCost(
5899 Instruction::LShr, ShiftTy, CostKind,
5902 }
5903
5904 // Add the arithmetic op for this level.
5905 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5906 }
5907
5908 // Add the final extract element to the cost.
5909 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5910 CostKind, 0, nullptr, nullptr,
5912}
5913
5916 FastMathFlags FMF) const {
// Model the min/max as a call to intrinsic IID with two operands of type Ty
// (returning Ty) and the given fast-math flags, and return that call's
// intrinsic cost.
5917 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5918 return getIntrinsicInstrCost(ICA, CostKind);
5919}
5920
5923 FastMathFlags FMF,
// Cost of a min/max reduction of vector ValTy using intrinsic IID.
// Lookup order: per-feature tables keyed on the un-legalized simple type,
// then the same tables keyed on the legalized type (plus split cost), then a
// generic halving model for power-of-2 element counts.
5925 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5926
5927 MVT MTy = LT.second;
5928
// Only the "min" opcodes are used as table keys: umin/umax map to UMIN and
// all other integer IDs to SMIN; minnum/maxnum map to FMINNUM and all other
// FP IDs to FMINIMUM.
5930 if (ValTy->isIntOrIntVectorTy()) {
5931 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5932 : ISD::SMIN;
5933 } else {
5934 assert(ValTy->isFPOrFPVectorTy() &&
5935 "Expected float point or integer vector type.");
5936 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5937 ? ISD::FMINNUM
5938 : ISD::FMINIMUM;
5939 }
5940
5941 // We use llvm-mca across all supported CPUs to measure the cost stats.
// Each entry holds four costs; the one used is selected by indexing with
// CostKind (see the Entry->Cost[CostKind] lookups below).
5942 static const CostKindTblEntry SSE2CostTbl[] = {
5943 {ISD::SMIN, MVT::v2i64, {3, 4, 5, 6}},
5944 {ISD::UMIN, MVT::v2i64, {3, 4, 5, 6}},
5945 {ISD::SMIN, MVT::v2i32, {2, 2, 5, 6}},
5946 {ISD::UMIN, MVT::v2i32, {2, 2, 5, 6}},
5947 {ISD::SMIN, MVT::v4i32, {3, 7,11,12}},
5948 {ISD::UMIN, MVT::v4i32, {4, 7,14,15}},
5949 {ISD::SMIN, MVT::v2i16, {2, 3, 4, 4}},
5950 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 6}},
5951 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5952 {ISD::UMIN, MVT::v4i16, {3, 5, 8, 10}},
5953 {ISD::SMIN, MVT::v8i16, {3, 8, 8, 8}},
5954 {ISD::UMIN, MVT::v8i16, {4, 8,12,14}},
5955 {ISD::SMIN, MVT::v2i8, {2, 3, 5, 6}},
5956 {ISD::UMIN, MVT::v2i8, {2, 3, 4, 4}},
5957 {ISD::SMIN, MVT::v4i8, {4, 6,12,13}},
5958 {ISD::UMIN, MVT::v4i8, {3, 6, 7, 7}},
5959 {ISD::SMIN, MVT::v8i8, {5, 9,18,19}},
5960 {ISD::UMIN, MVT::v8i8, {4, 8, 9, 9}},
5961 {ISD::SMIN, MVT::v16i8, {7,13,24,25}},
5962 {ISD::UMIN, MVT::v16i8, {3,10,11,11}},
5963 };
5964
5965 static const CostKindTblEntry SSE41CostTbl[] = {
5966 {ISD::SMIN, MVT::v2i64, {3, 4, 4, 6}},
5967 {ISD::UMIN, MVT::v2i64, {3, 4, 4, 6}},
5968 {ISD::SMIN, MVT::v2i32, {2, 2, 3, 3}},
5969 {ISD::UMIN, MVT::v2i32, {2, 2, 3, 3}},
5970 {ISD::SMIN, MVT::v4i32, {3, 4, 5, 5}},
5971 {ISD::UMIN, MVT::v4i32, {3, 4, 5, 5}},
5972 {ISD::UMIN, MVT::v2i16, {2, 3, 4, 4}},
5973 {ISD::SMIN, MVT::v4i16, {3, 5, 6, 6}},
5974 {ISD::UMIN, MVT::v4i16, {3, 5, 6, 6}},
5975 {ISD::SMIN, MVT::v8i16, {2, 8, 4, 5}},
5976 {ISD::UMIN, MVT::v8i16, {2, 5, 2, 2}},
5977 {ISD::SMIN, MVT::v2i8, {2, 3, 4, 4}},
5978 {ISD::SMIN, MVT::v4i8, {3, 6, 7, 7}},
5979 {ISD::SMIN, MVT::v8i8, {4, 8, 9, 9}},
5980 {ISD::SMIN, MVT::v16i8, {3,10, 7, 8}},
5981 {ISD::UMIN, MVT::v16i8, {3, 8, 5, 5}},
5982 };
5983
5984 static const CostKindTblEntry AVX1CostTbl[] = {
5985 {ISD::SMIN, MVT::v4i64, {5,11, 7,10}},
5986 {ISD::UMIN, MVT::v4i64, {6,12,10,13}},
5987 {ISD::SMIN, MVT::v8i32, {4, 9, 7, 7}},
5988 {ISD::UMIN, MVT::v8i32, {4, 9, 7, 7}},
5989 {ISD::SMIN, MVT::v16i16, {3,15, 6, 7}},
5990 {ISD::UMIN, MVT::v16i16, {2, 9, 4, 4}},
5991 {ISD::SMIN, MVT::v32i8, {4,17, 8, 9}},
5992 {ISD::UMIN, MVT::v32i8, {3,11, 6, 6}},
5993 };
5994
5995 static const CostKindTblEntry AVX2CostTbl[] = {
5996 {ISD::SMIN, MVT::v4i64, {4,11, 7,10}},
5997 {ISD::UMIN, MVT::v4i64, {4,12,10,13}},
5998 {ISD::SMIN, MVT::v2i32, {1, 2, 3, 3}},
5999 {ISD::UMIN, MVT::v2i32, {1, 2, 3, 3}},
6000 {ISD::UMIN, MVT::v4i32, {2, 4, 5, 5}},
6001 {ISD::SMIN, MVT::v4i32, {2, 4, 5, 5}},
6002 {ISD::SMIN, MVT::v8i32, {3, 9, 7, 7}},
6003 {ISD::UMIN, MVT::v8i32, {3, 9, 7, 7}},
6004 {ISD::SMIN, MVT::v4i16, {2, 4, 5, 5}},
6005 {ISD::UMIN, MVT::v4i16, {2, 4, 5, 5}},
6006 {ISD::SMIN, MVT::v16i16, {2,15, 6, 7}},
6007 {ISD::SMIN, MVT::v8i8, {3, 6, 7, 7}},
6008 {ISD::UMIN, MVT::v8i8, {3, 6, 7, 7}},
6009 {ISD::SMIN, MVT::v32i8, {3,17, 8, 9}},
6010 };
6011
6012 static const CostKindTblEntry AVX512FCostTbl[] = {
6013 {ISD::SMIN, MVT::v2i64, {2, 4, 3, 3}},
6014 {ISD::UMIN, MVT::v2i64, {2, 4, 3, 3}},
6015 {ISD::SMIN, MVT::v4i64, {3,10, 5, 5}},
6016 {ISD::UMIN, MVT::v4i64, {3,10, 5, 5}},
6017 {ISD::SMIN, MVT::v8i64, {5,16, 7, 7}},
6018 {ISD::UMIN, MVT::v8i64, {5,16, 7, 7}},
6019 {ISD::SMIN, MVT::v16i32, {4,12, 9, 9}},
6020 {ISD::UMIN, MVT::v16i32, {4,12, 9, 9}},
6021 };
6022
6023 static const CostKindTblEntry AVX512BWCostTbl[] = {
6024 {ISD::SMIN, MVT::v2i16, {1, 2, 3, 3}},
6025 {ISD::UMIN, MVT::v2i16, {1, 2, 3, 3}},
6026 {ISD::SMIN, MVT::v32i16, {2,19, 8, 9}},
6027 {ISD::UMIN, MVT::v32i16, {2,12, 6, 6}},
6028 {ISD::SMIN, MVT::v2i8, {1, 2, 3, 3}},
6029 {ISD::UMIN, MVT::v2i8, {1, 2, 3, 3}},
6030 {ISD::SMIN, MVT::v4i8, {2, 4, 5, 5}},
6031 {ISD::UMIN, MVT::v4i8, {2, 4, 5, 5}},
6032 {ISD::SMIN, MVT::v16i8, {2,10, 6, 7}},
6033 {ISD::UMIN, MVT::v16i8, {2, 6, 4, 4}},
6034 {ISD::SMIN, MVT::v32i8, {2,17, 8, 9}},
6035 {ISD::UMIN, MVT::v32i8, {2,10, 6, 6}},
6036 {ISD::SMIN, MVT::v64i8, {2,21,10,11}},
6037 {ISD::UMIN, MVT::v64i8, {2,14, 8, 8}},
6038 };
6039
6040 // Before legalizing the type, give a chance to look up illegal narrow types
6041 // in the table.
6042 // FIXME: Is there a better way to do this?
6043 EVT VT = TLI->getValueType(DL, ValTy);
6044 if (VT.isSimple()) {
// This MTy (the un-legalized simple type) shadows the legalized MTy above.
6045 MVT MTy = VT.getSimpleVT();
6046 if (ST->hasBWI())
6047 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6048 if (auto KindCost = Entry->Cost[CostKind])
6049 return *KindCost;
6050
6051 if (ST->hasAVX512())
6052 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6053 if (auto KindCost = Entry->Cost[CostKind])
6054 return *KindCost;
6055
6056 if (ST->hasAVX2())
6057 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6058 if (auto KindCost = Entry->Cost[CostKind])
6059 return *KindCost;
6060
6061 if (ST->hasAVX())
6062 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6063 if (auto KindCost = Entry->Cost[CostKind])
6064 return *KindCost;
6065
6066 if (ST->hasSSE41())
6067 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6068 if (auto KindCost = Entry->Cost[CostKind])
6069 return *KindCost;
6070
6071 if (ST->hasSSE2())
6072 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6073 if (auto KindCost = Entry->Cost[CostKind])
6074 return *KindCost;
6075 }
6076
6077 auto *ValVTy = cast<FixedVectorType>(ValTy);
6078 unsigned NumVecElts = ValVTy->getNumElements();
6079
6080 auto *Ty = ValVTy;
6081 InstructionCost MinMaxCost = 0;
6082 if (LT.first != 1 && MTy.isVector() &&
6083 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
6084 // Type needs to be split. We need LT.first - 1 min/max ops.
6085 Ty = FixedVectorType::get(ValVTy->getElementType(),
6086 MTy.getVectorNumElements());
6087 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
6088 MinMaxCost *= LT.first - 1;
6089 NumVecElts = MTy.getVectorNumElements();
6090 }
6091
// Same table lookups as above, now keyed on the legalized type and with the
// split cost added.
6092 if (ST->hasBWI())
6093 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
6094 if (auto KindCost = Entry->Cost[CostKind])
6095 return MinMaxCost + *KindCost;
6096
6097 if (ST->hasAVX512())
6098 if (const auto *Entry = CostTableLookup(AVX512FCostTbl, ISD, MTy))
6099 if (auto KindCost = Entry->Cost[CostKind])
6100 return MinMaxCost + *KindCost;
6101
6102 if (ST->hasAVX2())
6103 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
6104 if (auto KindCost = Entry->Cost[CostKind])
6105 return MinMaxCost + *KindCost;
6106
6107 if (ST->hasAVX())
6108 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
6109 if (auto KindCost = Entry->Cost[CostKind])
6110 return MinMaxCost + *KindCost;
6111
6112 if (ST->hasSSE41())
6113 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
6114 if (auto KindCost = Entry->Cost[CostKind])
6115 return MinMaxCost + *KindCost;
6116
6117 if (ST->hasSSE2())
6118 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
6119 if (auto KindCost = Entry->Cost[CostKind])
6120 return MinMaxCost + *KindCost;
6121
6122 unsigned ScalarSize = ValTy->getScalarSizeInBits();
6123
6124 // Special case power of 2 reductions where the scalar type isn't changed
6125 // by type legalization.
6126 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
6127 ScalarSize != MTy.getScalarSizeInBits())
6128 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
6129
6130 // Now handle reduction with the legal type, taking into account size changes
6131 // at each level.
// Each iteration halves the element count and adds the cost of bringing the
// upper half down (extract/shuffle/shift) plus one min/max op.
6132 while (NumVecElts > 1) {
6133 // Determine the size of the remaining vector we need to reduce.
6134 unsigned Size = NumVecElts * ScalarSize;
6135 NumVecElts /= 2;
6136 // If we're reducing from 256/512 bits, use an extract_subvector.
6137 if (Size > 128) {
6138 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
6139 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
6140 CostKind, NumVecElts, SubTy);
6141 Ty = SubTy;
6142 } else if (Size == 128) {
6143 // Reducing from 128 bits is a permute of v2f64/v2i64.
6144 VectorType *ShufTy;
// NOTE(review): ValTy is a vector type here (it was cast to FixedVectorType
// above). If isFloatingPointTy() is true only for scalar FP types, the FP
// shuffle type is never selected - confirm whether isFPOrFPVectorTy() was
// intended.
6145 if (ValTy->isFloatingPointTy())
6146 ShufTy =
6148 else
6149 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
6150 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6151 CostKind, 0, nullptr);
6152 } else if (Size == 64) {
6153 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
6154 FixedVectorType *ShufTy;
// NOTE(review): same isFloatingPointTy()-on-a-vector question as above.
6155 if (ValTy->isFloatingPointTy())
6156 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
6157 else
6158 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
6159 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
6160 CostKind, 0, nullptr);
6161 } else {
6162 // Reducing from smaller size is a shift by immediate.
6163 auto *ShiftTy = FixedVectorType::get(
6164 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
// NOTE(review): this shift is always costed with TCK_RecipThroughput, while
// every other cost query in this function passes CostKind - confirm whether
// that is intentional.
6165 MinMaxCost += getArithmeticInstrCost(
6166 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
6169 }
6170
6171 // Add the arithmetic op for this level.
6172 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
6173 }
6174
6175 // Add the final extract element to the cost.
6176 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
6177 CostKind, 0, nullptr, nullptr,
6179}
6180
6181/// Calculate the cost of materializing a 64-bit value. This helper
6182/// method might only calculate a fraction of a larger immediate. Therefore it
6183/// is valid to return a cost of ZERO.
6185 if (Val == 0)
6186 return TTI::TCC_Free;
6187
6188 if (isInt<32>(Val))
6189 return TTI::TCC_Basic;
6190
6191 return 2 * TTI::TCC_Basic;
6192}
6193
6196 assert(Ty->isIntegerTy());
6197
6198 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6199 if (BitSize == 0)
6200 return ~0U;
6201
6202 // Never hoist constants larger than 128bit, because this might lead to
6203 // incorrect code generation or assertions in codegen.
6204 // Fixme: Create a cost model for types larger than i128 once the codegen
6205 // issues have been fixed.
6206 if (BitSize > 128)
6207 return TTI::TCC_Free;
6208
6209 if (Imm == 0)
6210 return TTI::TCC_Free;
6211
6212 // Sign-extend all constants to a multiple of 64-bit.
6213 APInt ImmVal = Imm;
6214 if (BitSize % 64 != 0)
6215 ImmVal = Imm.sext(alignTo(BitSize, 64));
6216
6217 // Split the constant into 64-bit chunks and calculate the cost for each
6218 // chunk.
6220 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6221 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6222 int64_t Val = Tmp.getSExtValue();
6223 Cost += getIntImmCost(Val);
6224 }
6225 // We need at least one instruction to materialize the constant.
6226 return std::max<InstructionCost>(1, Cost);
6227}
6228
6230 const APInt &Imm, Type *Ty,
6232 Instruction *Inst) const {
6233 assert(Ty->isIntegerTy());
6234
6235 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6236 unsigned ImmBitWidth = Imm.getBitWidth();
6237
6238 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6239 // here, so that constant hoisting will ignore this constant.
6240 if (BitSize == 0)
6241 return TTI::TCC_Free;
6242
6243 unsigned ImmIdx = ~0U;
6244 switch (Opcode) {
6245 default:
6246 return TTI::TCC_Free;
6247 case Instruction::GetElementPtr:
6248 // Always hoist the base address of a GetElementPtr. This prevents the
6249 // creation of new constants for every base constant that gets constant
6250 // folded with the offset.
6251 if (Idx == 0)
6252 return 2 * TTI::TCC_Basic;
6253 return TTI::TCC_Free;
6254 case Instruction::Store:
6255 ImmIdx = 0;
6256 break;
6257 case Instruction::ICmp:
6258 // This is an imperfect hack to prevent constant hoisting of
6259 // compares that might be trying to check if a 64-bit value fits in
6260 // 32-bits. The backend can optimize these cases using a right shift by 32.
6261 // There are other predicates and immediates the backend can use shifts for.
6262 if (Idx == 1 && ImmBitWidth == 64) {
6263 uint64_t ImmVal = Imm.getZExtValue();
6264 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6265 return TTI::TCC_Free;
6266
6267 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6268 if (Cmp->isEquality()) {
6269 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6270 if (Known.countMinTrailingZeros() >= 32)
6271 return TTI::TCC_Free;
6272 }
6273 }
6274 }
6275 ImmIdx = 1;
6276 break;
6277 case Instruction::And:
6278 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6279 // by using a 32-bit operation with implicit zero extension. Detect such
6280 // immediates here as the normal path expects bit 31 to be sign extended.
6281 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6282 return TTI::TCC_Free;
6283 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6284 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6285 Imm.isMask())
6286 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6287 ImmIdx = 1;
6288 break;
6289 case Instruction::Add:
6290 case Instruction::Sub:
6291 // For add/sub, we can use the opposite instruction for INT32_MIN.
6292 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6293 return TTI::TCC_Free;
6294 ImmIdx = 1;
6295 break;
6296 case Instruction::UDiv:
6297 case Instruction::SDiv:
6298 case Instruction::URem:
6299 case Instruction::SRem:
6300 // Division by constant is typically expanded later into a different
6301 // instruction sequence. This completely changes the constants.
6302 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6303 return TTI::TCC_Free;
6304 case Instruction::Mul:
6305 case Instruction::Or:
6306 case Instruction::Xor:
6307 ImmIdx = 1;
6308 break;
6309 // Always return TCC_Free for the shift value of a shift instruction.
6310 case Instruction::Shl:
6311 case Instruction::LShr:
6312 case Instruction::AShr:
6313 if (Idx == 1)
6314 return TTI::TCC_Free;
6315 break;
6316 case Instruction::Trunc:
6317 case Instruction::ZExt:
6318 case Instruction::SExt:
6319 case Instruction::IntToPtr:
6320 case Instruction::PtrToInt:
6321 case Instruction::BitCast:
6322 case Instruction::PHI:
6323 case Instruction::Call:
6324 case Instruction::Select:
6325 case Instruction::Ret:
6326 case Instruction::Load:
6327 break;
6328 }
6329
6330 if (Idx == ImmIdx) {
6331 uint64_t NumConstants = divideCeil(BitSize, 64);
6333 return (Cost <= NumConstants * TTI::TCC_Basic)
6334 ? static_cast<int>(TTI::TCC_Free)
6335 : Cost;
6336 }
6337
6338 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6339}
6340
6343 const APInt &Imm, Type *Ty,
6345 assert(Ty->isIntegerTy());
6346
6347 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6348 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6349 // here, so that constant hoisting will ignore this constant.
6350 if (BitSize == 0)
6351 return TTI::TCC_Free;
6352
6353 switch (IID) {
6354 default:
6355 return TTI::TCC_Free;
6356 case Intrinsic::sadd_with_overflow:
6357 case Intrinsic::uadd_with_overflow:
6358 case Intrinsic::ssub_with_overflow:
6359 case Intrinsic::usub_with_overflow:
6360 case Intrinsic::smul_with_overflow:
6361 case Intrinsic::umul_with_overflow:
6362 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6363 return TTI::TCC_Free;
6364 break;
6365 case Intrinsic::experimental_stackmap:
6366 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6367 return TTI::TCC_Free;
6368 break;
6369 case Intrinsic::experimental_patchpoint_void:
6370 case Intrinsic::experimental_patchpoint:
6371 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6372 return TTI::TCC_Free;
6373 break;
6374 }
6375 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6376}
6377
6380 const Instruction *I) const {
6382 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6383 // Branches are assumed to be predicted.
6384 return TTI::TCC_Free;
6385}
6386
6387int X86TTIImpl::getGatherOverhead() const {
6388 // Some CPUs have more overhead for gather. The specified overhead is relative
6389 // to the Load operation. "2" is the number provided by Intel architects. This
6390 // parameter is used for cost estimation of Gather Op and comparison with
6391 // other alternatives.
6392 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6393 // enable gather with a -march.
6394 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6395 return 2;
6396
6397 return 1024;
6398}
6399
6400int X86TTIImpl::getScatterOverhead() const {
6401 if (ST->hasAVX512())
6402 return 2;
6403
6404 return 1024;
6405}
6406
6407// Return an average cost of Gather / Scatter instruction, maybe improved later.
6408InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6410 Type *SrcVTy, const Value *Ptr,
6411 Align Alignment,
6412 unsigned AddressSpace) const {
6413
6414 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6415 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6416
6417 // Try to reduce index size from 64 bit (default for GEP)
6418 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6419 // operation will use 16 x 64 indices which do not fit in a zmm and needs
6420 // to split. Also check that the base pointer is the same for all lanes,
6421 // and that there's at most one variable index.
6422 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6423 unsigned IndexSize = DL.getPointerSizeInBits();
6424 const GetElementPtrInst *GEP = dyn_cast_or_null<GetElementPtrInst>(Ptr);
6425 if (IndexSize < 64 || !GEP)
6426 return IndexSize;
6427
6428 unsigned NumOfVarIndices = 0;
6429 const Value *Ptrs = GEP->getPointerOperand();
6430 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6431 return IndexSize;
6432 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6433 if (isa<Constant>(GEP->getOperand(I)))
6434 continue;
6435 Type *IndxTy = GEP->getOperand(I)->getType();
6436 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6437 IndxTy = IndexVTy->getElementType();
6438 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6439 !isa<SExtInst>(GEP->getOperand(I))) ||
6440 ++NumOfVarIndices > 1)
6441 return IndexSize; // 64
6442 }
6443 return (unsigned)32;
6444 };
6445
6446 // Trying to reduce IndexSize to 32 bits for vector 16.
6447 // By default the IndexSize is equal to pointer size.
6448 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6449 ? getIndexSizeInBits(Ptr, DL)
6450 : DL.getPointerSizeInBits();
6451
6452 auto *IndexVTy = FixedVectorType::get(
6453 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6454 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6455 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6456 InstructionCost::CostType SplitFactor =
6457 std::max(IdxsLT.first, SrcLT.first).getValue();
6458 if (SplitFactor > 1) {
6459 // Handle splitting of vector of pointers
6460 auto *SplitSrcTy =
6461 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6462 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6463 Alignment, AddressSpace);
6464 }
6465
6466 // If we didn't split, this will be a single gather/scatter instruction.
6468 return 1;
6469
6470 // The gather / scatter cost is given by Intel architects. It is a rough
6471 // number since we are looking at one instruction in a time.
6472 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6473 : getScatterOverhead();
6474 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6475 Alignment, AddressSpace, CostKind);
6476}
6477
6478/// Calculate the cost of Gather / Scatter operation
6482 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
6483 MICA.getID() == Intrinsic::vp_gather;
6484 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
6485 Type *SrcVTy = MICA.getDataType();
6486 const Value *Ptr = MICA.getPointer();
6487 Align Alignment = MICA.getAlignment();
6488 if ((Opcode == Instruction::Load &&
6489 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6491 Align(Alignment)))) ||
6492 (Opcode == Instruction::Store &&
6493 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6495 Align(Alignment)))))
6497
6498 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6499 unsigned AddressSpace = MICA.getAddressSpace();
6500 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6501 AddressSpace);
6502}
6503
6505 const TargetTransformInfo::LSRCost &C2) const {
6506 // X86 specific here are "instruction number 1st priority".
6507 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6508 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6509 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6510 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6511}
6512
6514 return ST->hasMacroFusion() || ST->hasBranchFusion();
6515}
6516
6517static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6518 if (!ST->hasAVX())
6519 return false;
6520
6521 if (ScalarTy->isPointerTy())
6522 return true;
6523
6524 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6525 return true;
6526
6527 if (ScalarTy->isHalfTy() && ST->hasBWI())
6528 return true;
6529
6530 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6531 return true;
6532
6533 if (!ScalarTy->isIntegerTy())
6534 return false;
6535
6536 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6537 return IntWidth == 32 || IntWidth == 64 ||
6538 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6539}
6540
6542 unsigned AddressSpace,
6543 TTI::MaskKind MaskKind) const {
6544 Type *ScalarTy = DataTy->getScalarType();
6545
6546 // The backend can't handle a single element vector w/o CFCMOV.
6547 if (isa<VectorType>(DataTy) &&
6548 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6549 return ST->hasCF() &&
6550 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6551
6552 return isLegalMaskedLoadStore(ScalarTy, ST);
6553}
6554
6556 unsigned AddressSpace,
6557 TTI::MaskKind MaskKind) const {
6558 Type *ScalarTy = DataTy->getScalarType();
6559
6560 // The backend can't handle a single element vector w/o CFCMOV.
6561 if (isa<VectorType>(DataTy) &&
6562 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6563 return ST->hasCF() &&
6564 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6565
6566 return isLegalMaskedLoadStore(ScalarTy, ST);
6567}
6568
6569bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6570 unsigned DataSize = DL.getTypeStoreSize(DataType);
6571 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6572 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6573 // (the equivalent stores only require AVX).
6574 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6575 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6576
6577 return false;
6578}
6579
6580bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6581 unsigned DataSize = DL.getTypeStoreSize(DataType);
6582
6583 // SSE4A supports nontemporal stores of float and double at arbitrary
6584 // alignment.
6585 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6586 return true;
6587
6588 // Besides the SSE4A subtarget exception above, only aligned stores are
6589 // available nontemporaly on any other subtarget. And only stores with a size
6590 // of 4..32 bytes (powers of 2, only) are permitted.
6591 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6592 !isPowerOf2_32(DataSize))
6593 return false;
6594
6595 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6596 // loads require AVX2).
6597 if (DataSize == 32)
6598 return ST->hasAVX();
6599 if (DataSize == 16)
6600 return ST->hasSSE1();
6601 return true;
6602}
6603
6605 ElementCount NumElements) const {
6606 // movddup
6607 return ST->hasSSE3() && !NumElements.isScalable() &&
6608 NumElements.getFixedValue() == 2 &&
6609 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6610}
6611
6612bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6613 if (!isa<VectorType>(DataTy))
6614 return false;
6615
6616 if (!ST->hasAVX512())
6617 return false;
6618
6619 // The backend can't handle a single element vector.
6620 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6621 return false;
6622
6623 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6624
6625 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6626 return true;
6627
6628 if (!ScalarTy->isIntegerTy())
6629 return false;
6630
6631 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6632 return IntWidth == 32 || IntWidth == 64 ||
6633 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6634}
6635
6637 Align Alignment) const {
6638 return isLegalMaskedExpandLoad(DataTy, Alignment);
6639}
6640
6641bool X86TTIImpl::supportsGather() const {
6642 // Some CPUs have better gather performance than others.
6643 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6644 // enable gather with a -march.
6645 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6646}
6647
6649 Align Alignment) const {
6650 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6651 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6652 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6653 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6654 // Check, maybe the gather/scatter instruction is better in the VariableMask
6655 // case.
6656 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6657 return NumElts == 1 ||
6658 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6659}
6660
6662 Align Alignment) const {
6663 Type *ScalarTy = DataTy->getScalarType();
6664 if (ScalarTy->isPointerTy())
6665 return true;
6666
6667 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6668 return true;
6669
6670 if (!ScalarTy->isIntegerTy())
6671 return false;
6672
6673 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6674 return IntWidth == 32 || IntWidth == 64;
6675}
6676
6677bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6678 if (!supportsGather() || !ST->preferGather())
6679 return false;
6680 return isLegalMaskedGatherScatter(DataTy, Alignment);
6681}
6682
6683bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6684 unsigned Opcode1,
6685 const SmallBitVector &OpcodeMask) const {
6686 // ADDSUBPS 4xf32 SSE3
6687 // VADDSUBPS 4xf32 AVX
6688 // VADDSUBPS 8xf32 AVX2
6689 // ADDSUBPD 2xf64 SSE3
6690 // VADDSUBPD 2xf64 AVX
6691 // VADDSUBPD 4xf64 AVX2
6692
6693 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6694 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6695 if (!isPowerOf2_32(NumElements))
6696 return false;
6697 // Check the opcode pattern. We apply the mask on the opcode arguments and
6698 // then check if it is what we expect.
6699 for (int Lane : seq<int>(0, NumElements)) {
6700 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6701 // We expect FSub for even lanes and FAdd for odd lanes.
6702 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6703 return false;
6704 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6705 return false;
6706 }
6707 // Now check that the pattern is supported by the target ISA.
6708 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6709 if (ElemTy->isFloatTy())
6710 return ST->hasSSE3() && NumElements % 4 == 0;
6711 if (ElemTy->isDoubleTy())
6712 return ST->hasSSE3() && NumElements % 2 == 0;
6713 return false;
6714}
6715
6716bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6717 // AVX2 doesn't support scatter
6718 if (!ST->hasAVX512() || !ST->preferScatter())
6719 return false;
6720 return isLegalMaskedGatherScatter(DataType, Alignment);
6721}
6722
6723bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6724 EVT VT = TLI->getValueType(DL, DataType);
6725 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6726}
6727
6729 // FDIV is always expensive, even if it has a very low uop count.
6730 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6731 if (I->getOpcode() == Instruction::FDiv)
6732 return true;
6733
6735}
6736
6737bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6738
6740 const Function *Callee) const {
6741 const TargetMachine &TM = getTLI()->getTargetMachine();
6742
6743 // Work this as a subsetting of subtarget features.
6744 const X86Subtarget &CallerSubtarget = TM.getSubtarget<X86Subtarget>(*Caller);
6745 const X86Subtarget &CalleeSubtarget = TM.getSubtarget<X86Subtarget>(*Callee);
6746 const FeatureBitset &CallerBits = CallerSubtarget.getFeatureBits();
6747 const FeatureBitset &CalleeBits = CalleeSubtarget.getFeatureBits();
6748
6749 // Check whether callee features are a subset of caller features
6750 // (apart from the ignore list).
6751 const FeatureBitset &InlineIgnoreFeatures =
6752 CallerSubtarget.getInlineIgnoreFeatures();
6753 FeatureBitset RealCallerBits = CallerBits & ~InlineIgnoreFeatures;
6754 FeatureBitset RealCalleeBits = CalleeBits & ~InlineIgnoreFeatures;
6755 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6756 return false;
6757
6758 // If the features are not exactly the same (or there is a difference in
6759 // AVX512 register usage), we need to additionally check for calls
6760 // that may become ABI-incompatible as a result of inlining.
6761 if (RealCallerBits == RealCalleeBits &&
6762 CallerSubtarget.useAVX512Regs() == CalleeSubtarget.useAVX512Regs())
6763 return true;
6764
6765 for (const Instruction &I : instructions(Callee)) {
6766 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6767 // Having more target features is fine for inline ASM and intrinsics.
6768 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6769 continue;
6770
6772 for (Value *Arg : CB->args())
6773 Types.push_back(Arg->getType());
6774 if (!CB->getType()->isVoidTy())
6775 Types.push_back(CB->getType());
6776
6777 // Simple types are always ABI compatible.
6778 auto IsSimpleTy = [](Type *Ty) {
6779 return !Ty->isVectorTy() && !Ty->isAggregateType();
6780 };
6781 if (all_of(Types, IsSimpleTy))
6782 continue;
6783
6784 // Do a precise compatibility check.
6785 if (!areTypesABICompatible(Caller, Callee, Types))
6786 return false;
6787 }
6788 }
6789 return true;
6790}
6791
6793 const Function *Callee,
6794 ArrayRef<Type *> Types) const {
6795 const TargetMachine &TM = getTLI()->getTargetMachine();
6796 const TargetLowering *CallerTLI =
6797 TM.getSubtargetImpl(*Caller)->getTargetLowering();
6798 const TargetLowering *CalleeTLI =
6799 TM.getSubtargetImpl(*Callee)->getTargetLowering();
6800
6801 LLVMContext &Ctx = Caller->getContext();
6802 const DataLayout &DL = Caller->getDataLayout();
6803 CallingConv::ID CC = Callee->getCallingConv();
6804 return all_of(Types, [&](Type *Ty) {
6805 SmallVector<EVT> VTs;
6806 ComputeValueVTs(*CallerTLI, DL, Ty, VTs);
6807 return all_of(VTs, [&](EVT VT) {
6808 return CallerTLI->getRegisterTypeForCallingConv(Ctx, CC, VT) ==
6809 CalleeTLI->getRegisterTypeForCallingConv(Ctx, CC, VT);
6810 });
6811 });
6812}
6813
6815X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6817 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6818 Options.NumLoadsPerBlock = 2;
6819 // All GPR and vector loads can be unaligned.
6820 Options.AllowOverlappingLoads = true;
6821 if (IsZeroCmp) {
6822 // Only enable vector loads for equality comparison. Right now the vector
6823 // version is not as fast for three way compare (see #33329).
6824 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6825 if (PreferredWidth >= 512 && ST->hasAVX512())
6826 Options.LoadSizes.push_back(64);
6827 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6828 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6829 }
6830 if (ST->is64Bit()) {
6831 Options.LoadSizes.push_back(8);
6832 }
6833 Options.LoadSizes.push_back(4);
6834 Options.LoadSizes.push_back(2);
6835 Options.LoadSizes.push_back(1);
6836 return Options;
6837}
6838
6840 return supportsGather();
6841}
6842
6844 return false;
6845}
6846
6848 // TODO: We expect this to be beneficial regardless of arch,
6849 // but there are currently some unexplained performance artifacts on Atom.
6850 // As a temporary solution, disable on Atom.
6851 return !(ST->isAtom());
6852}
6853
6855 switch (II->getIntrinsicID()) {
6856 default:
6857 return true;
6858 case Intrinsic::vector_reduce_smax:
6859 case Intrinsic::vector_reduce_smin:
6860 case Intrinsic::vector_reduce_umax:
6861 case Intrinsic::vector_reduce_umin:
6862 return false;
6863 }
6864}
6865
6866// Get estimation for interleaved load/store operations and strided load.
6867// \p Indices contains indices for strided load.
6868// \p Factor - the factor of interleaving.
6869// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6871 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6872 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6873 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6874 bool UseMaskForGaps) const {
6875 // VecTy for interleave memop is <VF*Factor x Elt>.
6876 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6877 // VecTy = <12 x i32>.
6878
6879 // Calculate the number of memory operations (NumOfMemOps), required
6880 // for load/store the VecTy.
6881 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6882 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6883 unsigned LegalVTSize = LegalVT.getStoreSize();
6884 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6885
6886 // Get the cost of one memory operation.
6887 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6888 LegalVT.getVectorNumElements());
6889 InstructionCost MemOpCost;
6890 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6891 if (UseMaskedMemOp) {
6892 unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6893 : Intrinsic::masked_store;
6894 MemOpCost = getMaskedMemoryOpCost(
6895 {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6896 } else
6897 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6898 CostKind);
6899
6900 unsigned VF = VecTy->getNumElements() / Factor;
6901 MVT VT =
6902 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6903
6904 InstructionCost MaskCost;
6905 if (UseMaskedMemOp) {
6906 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6907 for (unsigned Index : Indices) {
6908 assert(Index < Factor && "Invalid index for interleaved memory op");
6909 for (unsigned Elm = 0; Elm < VF; Elm++)
6910 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6911 }
6912
6913 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6914
6915 MaskCost = getReplicationShuffleCost(
6916 I1Type, Factor, VF,
6917 UseMaskForGaps ? DemandedLoadStoreElts
6919 CostKind);
6920
6921 // The Gaps mask is invariant and created outside the loop, therefore the
6922 // cost of creating it is not accounted for here. However if we have both
6923 // a MaskForGaps and some other mask that guards the execution of the
6924 // memory access, we need to account for the cost of And-ing the two masks
6925 // inside the loop.
6926 if (UseMaskForGaps) {
6927 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6928 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6929 }
6930 }
6931
6932 if (Opcode == Instruction::Load) {
6933 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6934 // contain the cost of the optimized shuffle sequence that the
6935 // X86InterleavedAccess pass will generate.
6936 // The cost of loads and stores are computed separately from the table.
6937
6938 // X86InterleavedAccess support only the following interleaved-access group.
// Each entry is {interleave factor, per-member vector type, cost of the
// deinterleaving shuffle sequence}; the table is looked up by (Factor, VT).
6939 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6940 {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6941 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6942 {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6943 };
6944
6945 if (const auto *Entry =
6946 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6947 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6948 //If an entry does not exist, fallback to the default implementation.
6949
6950 // Kind of shuffle depends on number of loaded values.
6951 // If we load the entire data in one register, we can use a 1-src shuffle.
6952 // Otherwise, we'll merge 2 sources in each operation.
6953 TTI::ShuffleKind ShuffleKind =
6954 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6955
6956 InstructionCost ShuffleCost = getShuffleCost(
6957 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6958
6959 unsigned NumOfLoadsInInterleaveGrp =
6960 Indices.size() ? Indices.size() : Factor;
6961 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6962 VecTy->getNumElements() / Factor);
6963 InstructionCost NumOfResults =
6964 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6965
6966 // About a half of the loads may be folded in shuffles when we have only
6967 // one result. If we have more than one result, or the loads are masked,
6968 // we do not fold loads at all.
6969 unsigned NumOfUnfoldedLoads =
6970 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6971
6972 // Get a number of shuffle operations per result.
6973 unsigned NumOfShufflesPerResult =
6974 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6975
6976 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6977 // When we have more than one destination, we need additional instructions
6978 // to keep sources.
6979 InstructionCost NumOfMoves = 0;
6980 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6981 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6982
6983 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6984 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6985 NumOfMoves;
6986
6987 return Cost;
6988 }
6989
6990 // Store.
6991 assert(Opcode == Instruction::Store &&
6992 "Expected Store Instruction at this point");
6993 // X86InterleavedAccess support only the following interleaved-access group.
// Each entry is {interleave factor, per-member vector type, cost of the
// interleaving shuffle sequence}; the table is looked up by (Factor, VT).
6994 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6995 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6996 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6997 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6998
6999 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
7000 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
7001 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
7002 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
7003 };
7004
7005 if (const auto *Entry =
7006 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
7007 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
7008 //If an entry does not exist, fallback to the default implementation.
7009
7010 // There is no strided stores meanwhile. And store can't be folded in
7011 // shuffle.
7012 unsigned NumOfSources = Factor; // The number of values to be merged.
7013 InstructionCost ShuffleCost =
7014 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
7015 CostKind, 0, nullptr);
7016 unsigned NumOfShufflesPerStore = NumOfSources - 1;
7017
7018 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
7019 // We need additional instructions to keep sources.
7020 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
7022 MaskCost +
7023 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
7024 NumOfMoves;
7025 return Cost;
7026}
7027
7029 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
7030 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
7031 bool UseMaskForCond, bool UseMaskForGaps) const {
7032 auto *VecTy = cast<FixedVectorType>(BaseTy);
7033
7034 auto isSupportedOnAVX512 = [&](Type *VecTy) {
7035 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
7036 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
7037 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
7038 return true;
7039 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
7040 return ST->hasBWI();
7041 if (EltTy->isBFloatTy())
7042 return ST->hasBF16();
7043 return false;
7044 };
7045 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
7047 Opcode, VecTy, Factor, Indices, Alignment,
7048 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
7049
7050 if (UseMaskForCond || UseMaskForGaps)
7051 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7052 Alignment, AddressSpace, CostKind,
7053 UseMaskForCond, UseMaskForGaps);
7054
7055 // Get estimation for interleaved load/store operations for SSE-AVX2.
7056 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
7057 // computing the cost using a generic formula as a function of generic
7058 // shuffles. We therefore use a lookup table instead, filled according to
7059 // the instruction sequences that codegen currently generates.
7060
7061 // VecTy for interleave memop is <VF*Factor x Elt>.
7062 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
7063 // VecTy = <12 x i32>.
7064 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
7065
7066 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
7067 // the VF=2, while v2i128 is an unsupported MVT vector type
7068 // (see MachineValueType.h::getVectorVT()).
7069 if (!LegalVT.isVector())
7070 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7071 Alignment, AddressSpace, CostKind);
7072
7073 unsigned VF = VecTy->getNumElements() / Factor;
7074 Type *ScalarTy = VecTy->getElementType();
7075 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
7076 if (!ScalarTy->isIntegerTy())
7077 ScalarTy =
7078 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
7079
7080 // Get the cost of all the memory operations.
7081 // FIXME: discount dead loads.
7082 InstructionCost MemOpCosts =
7083 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
7084
7085 auto *VT = FixedVectorType::get(ScalarTy, VF);
7086 EVT ETy = TLI->getValueType(DL, VT);
7087 if (!ETy.isSimple())
7088 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7089 Alignment, AddressSpace, CostKind);
7090
7091 // TODO: Complete for other data-types and strides.
7092 // Each combination of Stride, element bit width and VF results in a different
7093 // sequence; The cost tables are therefore accessed with:
7094 // Factor (stride) and VectorType=VFxiN.
7095 // The Cost accounts only for the shuffle sequence;
7096 // The cost of the loads/stores is accounted for separately.
7097 //
7098 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
7099 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
7100 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
7101 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
7102 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
7103 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
7104
7105 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
7106 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
7107 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
7108
7109 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
7110 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
7111 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
7112
7113 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
7114 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
7115 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
7116 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
7117
7118 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
7119 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
7120 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
7121 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
7122 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
7123
7124 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
7125 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
7126 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
7127 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
7128 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
7129
7130 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
7131 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
7132 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
7133 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
7134 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
7135
7136 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
7137 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
7138 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
7139 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
7140
7141 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
7142 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
7143 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
7144 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
7145 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
7146
7147 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
7148 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
7149 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
7150 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
7151 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
7152
7153 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
7154 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
7155 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
7156 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
7157 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
7158
7159 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
7160 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
7161 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
7162 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
7163
7164 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
7165 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
7166 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
7167 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
7168 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
7169
7170 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
7171 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
7172 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
7173 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
7174 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
7175
7176 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
7177 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
7178 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
7179 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
7180
7181 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
7182 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
7183 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
7184
7185 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
7186 };
7187
7188 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
7189 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
7190 };
7191
7192 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7193 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7194 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7195
7196 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7197 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7198
7199 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7200 };
7201
7202 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7203 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7204 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7205
7206 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7207 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7208 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7209
7210 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7211 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7212 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7213 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7214
7215 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7216 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7217 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7218 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7219 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7220
7221 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7222 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7223 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7224 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7225 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7226
7227 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7228 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7229 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7230 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7231 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7232
7233 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7234 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7235 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7236 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7237 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7238
7239 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7240 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7241 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7242 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7243
7244 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7245 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7246 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7247 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7248 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7249
7250 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7251 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7252 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7253 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7254 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7255
7256 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7257 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7258 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7259 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7260 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7261
7262 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7263 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7264 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7265 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7266
7267 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7268 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7269 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7270 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7271 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7272
7273 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7274 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7275 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7276 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7277 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7278
7279 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7280 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7281 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7282 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7283
7284 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7285 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7286 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7287 };
7288
7289 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7290 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7291 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7292 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7293
7294 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7295 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7296
7297 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7298 };
7299
7300 if (Opcode == Instruction::Load) {
7301 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7302 MemOpCosts](const CostTblEntry *Entry) {
7303 // NOTE: this is just an approximation!
7304 // It can over- or under-estimate the cost!
7305 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7306 };
7307
7308 if (ST->hasAVX2())
7309 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7310 ETy.getSimpleVT()))
7311 return GetDiscountedCost(Entry);
7312
7313 if (ST->hasSSSE3())
7314 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7315 ETy.getSimpleVT()))
7316 return GetDiscountedCost(Entry);
7317
7318 if (ST->hasSSE2())
7319 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7320 ETy.getSimpleVT()))
7321 return GetDiscountedCost(Entry);
7322 } else {
7323 assert(Opcode == Instruction::Store &&
7324 "Expected Store Instruction at this point");
7325 assert((!Indices.size() || Indices.size() == Factor) &&
7326 "Interleaved store only supports fully-interleaved groups.");
7327 if (ST->hasAVX2())
7328 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7329 ETy.getSimpleVT()))
7330 return MemOpCosts + Entry->Cost;
7331
7332 if (ST->hasSSE2())
7333 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7334 ETy.getSimpleVT()))
7335 return MemOpCosts + Entry->Cost;
7336 }
7337
7338 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7339 Alignment, AddressSpace, CostKind,
7340 UseMaskForCond, UseMaskForGaps);
7341}
7342
7344 StackOffset BaseOffset,
7345 bool HasBaseReg, int64_t Scale,
7346 unsigned AddrSpace) const {
7347 // Scaling factors are not free at all.
7348 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7349 // will take 2 allocations in the out of order engine instead of 1
7350 // for plain addressing mode, i.e. inst (reg1).
7351 // E.g.,
7352 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7353 // Requires two allocations (one for the load, one for the computation)
7354 // whereas:
7355 // vaddps (%rsi), %ymm0, %ymm1
7356 // Requires just 1 allocation, i.e., freeing allocations for other operations
7357 // and having less micro operations to execute.
7358 //
7359 // For some X86 architectures, this is even worse because for instance for
7360 // stores, the complex addressing mode forces the instruction to use the
7361 // "load" ports instead of the dedicated "store" port.
7362 // E.g., on Haswell:
7363 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7364 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7366 AM.BaseGV = BaseGV;
7367 AM.BaseOffs = BaseOffset.getFixed();
7368 AM.HasBaseReg = HasBaseReg;
7369 AM.Scale = Scale;
7370 AM.ScalableOffset = BaseOffset.getScalable();
7371 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7372 // Scale represents reg2 * scale, thus account for 1
7373 // as soon as we use a second register.
7374 return AM.Scale != 0;
7376}
7377
7379 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7380 return 14;
7381}
7382
7384 unsigned Bits = Ty->getScalarSizeInBits();
7385
7386 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7387 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7388 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7389 return false;
7390
7391 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7392 // shifts just as cheap as scalar ones.
7393 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7394 return false;
7395
7396 // AVX512BW has shifts such as vpsllvw.
7397 if (ST->hasBWI() && Bits == 16)
7398 return false;
7399
7400 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7401 // fully general vector.
7402 return true;
7403}
7404
7405unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7406 Type *ScalarValTy, Align Alignment,
7407 unsigned AddrSpace) const {
7408 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7409 return 4;
7410 }
7411 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy, Alignment,
7412 AddrSpace);
7413}
7414
7416 SmallVectorImpl<Use *> &Ops) const {
7417 using namespace llvm::PatternMatch;
7418
7419 if (I->getOpcode() == Instruction::And &&
7420 (ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
7421 for (auto &Op : I->operands()) {
7422 // (and X, (not Y)) -> (andn X, Y)
7423 if (match(Op.get(), m_Not(m_Value())) && !I->getType()->isIntegerTy(8)) {
7424 Ops.push_back(&Op);
7425 return true;
7426 }
7427 // (and X, (splat (not Y))) -> (andn X, (splat Y))
7428 if (match(Op.get(),
7430 m_Value(), m_ZeroMask()))) {
7431 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7432 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7433 Ops.push_back(&Not);
7434 Ops.push_back(&InsertElt);
7435 Ops.push_back(&Op);
7436 return true;
7437 }
7438 }
7439 }
7440
7441 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7442 if (!VTy)
7443 return false;
7444
7445 if (I->getOpcode() == Instruction::Mul &&
7446 VTy->getElementType()->isIntegerTy(64)) {
7447 for (auto &Op : I->operands()) {
7448 // Make sure we are not already sinking this operand
7449 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7450 continue;
7451
7452 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7453 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7454 if (ST->hasSSE41() &&
7455 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7456 m_SpecificInt(32)))) {
7457 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7458 Ops.push_back(&Op);
7459 } else if (ST->hasSSE2() &&
7460 match(Op.get(),
7461 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7462 Ops.push_back(&Op);
7463 }
7464 }
7465
7466 return !Ops.empty();
7467 }
7468
7469 // A uniform shift amount in a vector shift or funnel shift may be much
7470 // cheaper than a generic variable vector shift, so make that pattern visible
7471 // to SDAG by sinking the shuffle instruction next to the shift.
7472 int ShiftAmountOpNum = -1;
7473 if (I->isShift())
7474 ShiftAmountOpNum = 1;
7475 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7476 if (II->getIntrinsicID() == Intrinsic::fshl ||
7477 II->getIntrinsicID() == Intrinsic::fshr)
7478 ShiftAmountOpNum = 2;
7479 }
7480
7481 if (ShiftAmountOpNum == -1)
7482 return false;
7483
7484 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7485 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7486 isVectorShiftByScalarCheap(I->getType())) {
7487 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7488 return true;
7489 }
7490
7491 return false;
7492}
7493
7495 bool HasEGPR = ST->hasEGPR();
7496 const TargetMachine &TM = getTLI()->getTargetMachine();
7497
7498 for (User *U : F.users()) {
7500 if (!CB || CB->getCalledOperand() != &F)
7501 continue;
7502 Function *CallerFunc = CB->getFunction();
7503 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7504 return false;
7505 }
7506
7507 return true;
7508}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file defines a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const TargetLowering * getTargetLowering() const
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:287
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:397
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy, Align Alignment, unsigned AddrSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3040
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
This is an optimization pass for GlobalISel generic memory operations.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:256
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55