//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to a
/// "generic" X86 CPU rather than a specific CPU model. Usually a number
/// corresponds to the CPU on which the feature first appeared. For example,
/// if a lookup below is keyed on Subtarget.hasSSE42() then the cost is based
/// on Nehalem, as that was the first CPU to support that feature level and
/// thus most likely has the worst-case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                    divss     sqrtss    rsqrtss
///   AMD K7           11-16     19        3
///   Piledriver       9-24      13-15     5
///   Jaguar           14        16        2
///   Pentium II,III   18        30        2
///   Nehalem          7-14      7-18      3
///   Haswell          10-13     11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
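///
/// For example (illustrative note, not part of the original header comment),
/// a cost-table entry below such as
///   { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }
/// reads as: reciprocal throughput 1, latency 6, code size 1 and
/// size-and-latency 2 - one value per TargetCostKind, in the order above.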
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
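
// Usage sketch (illustrative, not part of the original file): indexing a
// CostKindCosts value with a TargetCostKind returns the matching field, or
// std::nullopt if that cost was left unspecified (~0U):
//   CostKindCosts C = {/*RTput*/ 1, /*Lat*/ 6, /*CodeSize*/ 1, /*SizeLat*/ 2};
//   if (auto Cost = C[TargetTransformInfo::TCK_Latency])
//     ; // *Cost == 6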

using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //       instructions is inefficient. Once the problem is fixed, we should
  //       call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
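
// Illustrative examples (not in the original source): on an AVX2 subtarget
// with PreferVectorWidth >= 256, RGK_FixedWidthVector yields 256 bits;
// capping the preference at 128 (e.g. via -mprefer-vector-width=128) yields
// 128 bits even though 256-bit registers exist.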

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
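
    // Illustrative example (not in the original source): a v4i32 multiply
    // whose operands are both sign-extended from v4i16 is costed here as a
    // v8i16 multiply, since codegen can select a single PMADDWD.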

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
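
  // For instance (illustrative): 'mul X, 8' is costed as 'shl X, 3', and
  // 'mul X, -8' as 'shl X, 3' plus a subtract-from-zero.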

  // On X86, vector signed division by a power-of-two constant is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }
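
  // Worked example (illustrative, not in the original source): for
  // 'sdiv X, 4' the expansion is
  //   sra t, x, 31   ; sign mask
  //   srl t, t, 30   ; rounding bias for a divisor of 4
  //   add t, x, t    ; bias negative dividends
  //   sra t, t, 2    ; final arithmetic shift
  // hence the 2 * AShr + LShr + Add costing above.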

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
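
  // E.g. (illustrative): 'udiv X, 8' is costed as 'lshr X, 3', and
  // 'urem X, 8' as 'and X, 7'.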

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2,  7,  4,  4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2,  7,  4,  4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2,  7,  4,  4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 3, 9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8,  { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3,  5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 10,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 12,  8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4,  8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4,  7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4,  8,  7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2,  4,  2,  3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2,  4,  2,  3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2,  4,  2,  3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2,  4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2,  4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2,  4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1,  2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1,  4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1,  4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1,  4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1,  4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5,  5,  7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8,  7,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  4,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  {  6,  6,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  {  7,  8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  {  7,  9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  {  1,  3,  1,  2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  {  1,  3,  1,  2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  {  1,  3,  1,  2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, {  3,  7,  5,  7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, {  3,  7,  5,  7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, {  3,  7,  5,  7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  {  1,  3,  1,  2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  {  1,  3,  1,  2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  {  1,  3,  1,  2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  {  3,  7,  5,  7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  {  3,  7,  5,  7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  {  3,  7,  5,  7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  {  1,  3,  1,  2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  {  1,  3,  1,  2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  {  3,  4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  {  3,  7,  4,  6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  {  3,  7,  4,  6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  {  6,  7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, {  9, 10, 6,  9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, {  9, 13, 5,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  {  4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  {  4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  {  6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  {  6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  {  7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4,  5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1,  5, 1,  1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL,  MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA,  MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA,  MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL,  MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL,  MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL,  MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL,  MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL,  MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA,  MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD,  MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD,  MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB,  MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB,  MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,   MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,   MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,   MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,   MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL,  MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,  { 6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::i64,    { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64,  { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // custom in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies(3),
    // shifts(3) and adds(2).
    // SLM muldq throughput is 2, shift throughput is 1 and addq throughput
    // is 4, thus: 3*2 (muldq) + 3*1 (shift) + 2*4 (addq) = 17.
    { ISD::MUL,  MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD,  MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB,  MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,  {  6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  {  6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,  {  5, 18,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,  {  6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  {  8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,  {  4,  5,  5,  5 } }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  {  8,  8,  5,  9 } }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,  {  1,  1,  1,  2 } }, // psubb
    { ISD::ADD,  MVT::v32i8,  {  1,  1,  1,  2 } }, // paddb
    { ISD::SUB,  MVT::v16i16, {  1,  1,  1,  2 } }, // psubw
    { ISD::ADD,  MVT::v16i16, {  1,  1,  1,  2 } }, // paddw
    { ISD::SUB,  MVT::v8i32,  {  1,  1,  1,  2 } }, // psubd
    { ISD::ADD,  MVT::v8i32,  {  1,  1,  1,  2 } }, // paddd
    { ISD::SUB,  MVT::v4i64,  {  1,  1,  1,  2 } }, // psubq
    { ISD::ADD,  MVT::v4i64,  {  1,  1,  1,  2 } }, // paddq

    { ISD::MUL,  MVT::v16i8,  {  5, 18,  6, 12 } }, // extend/pmullw/pack
    { ISD::MUL,  MVT::v32i8,  {  4,  8,  8, 16 } }, // pmaddubsw
    { ISD::MUL,  MVT::v16i16, {  2,  5,  1,  2 } }, // pmullw
    { ISD::MUL,  MVT::v8i32,  {  4, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i32,  {  2, 10,  1,  2 } }, // pmulld
    { ISD::MUL,  MVT::v4i64,  {  6, 10,  8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::v2i64,  {  6, 10,  8,  8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64,  {  1,  1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32,  {  1,  1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,    {  1,  4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,    {  1,  4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64,  {  1,  4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32,  {  1,  4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64,  {  1,  4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32,  {  1,  4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,    {  1,  4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,    {  1,  4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64,  {  1,  4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32,  {  1,  4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64,  {  1,  4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32,  {  1,  4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,    {  1,  5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,    {  1,  5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64,  {  1,  5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32,  {  1,  5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64,  {  1,  5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32,  {  1,  5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,    {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32,  {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32,  { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,    { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64,  { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64,  { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1190
1191 static const CostKindTblEntry AVX1CostTable[] = {
1192 // We don't have to scalarize unsupported ops. We can issue two half-sized
1193 // operations and we only need to extract the upper YMM half.
1194 // Two ops + 1 extract + 1 insert = 4.
1195 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1196 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1197 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1198 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1199 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1200 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1201
1202 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1203 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1204 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1205 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1206
1207 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1208 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1209 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1210 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1211
1212 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1213 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1214 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1215 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1216
1217 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1218 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1219 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1220 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1221 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1222 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1223 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1224 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1225 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1226 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1227
1228 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1229 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1230 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1231 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1232 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1233 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1234 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1235 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1236
1237 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1238 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1239 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1240 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1241 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1242 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1243 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1244 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1245
1246 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1247 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1248 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1249 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1250 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1251 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1253 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1254
1255 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1256 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1257
1258 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1259 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1262 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1263 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1264
1265 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271
1272 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1273 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1276 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1277 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1278
1279 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1280 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1281 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1282 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1283 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1284 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1285 };
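  // In each entry above the four costs are, in order: TCK_RecipThroughput,
  // TCK_Latency, TCK_CodeSize and TCK_SizeAndLatency.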
1286
1287 if (ST->hasAVX())
1288 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1289 if (auto KindCost = Entry->Cost[CostKind])
1290 return LT.first * *KindCost;
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
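  // Entries that list a single value, such as { 1 }, only provide a
  // TCK_RecipThroughput cost; the other cost kinds keep the "unset" sentinel,
  // so the lookup below falls through to later tables or the base
  // implementation for those kinds.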
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480  // It is not a good idea to vectorize division. We have to scalarize it and
1481  // in the process we will often end up having to spill regular
1482  // registers. The overhead of division is going to dominate most kernels
1483  // anyways, so try hard to prevent vectorization of division - it is
1484  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485  // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489     InstructionCost ScalarCost =
1490         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491         Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
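  // Illustrative arithmetic: for sdiv <4 x i32> with a legal v4i32 type,
  // LT.first == 1 and there are 4 lanes, so with a scalar i32 sdiv cost of S
  // the returned cost is 20 * 1 * 4 * S - deliberately large so that the
  // vectorizers keep division scalar.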
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1507 break;
1508 }
1509 }
1510
1511 // Fallback to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
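// A minimal caller-side sketch (illustrative, not part of this file) showing
// how these tables are reached through the generic TTI interface under the
// new pass manager:
//   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   auto *VTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8);
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::Mul, VTy, TargetTransformInfo::TCK_RecipThroughput);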
1515
1516InstructionCost
1517X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518     unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519     TTI::TargetCostKind CostKind) const {
1520   if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521     return TTI::TCC_Basic;
1522   return InstructionCost::getInvalid();
1523}
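// Illustrative use: the SLP vectorizer costs "alternating opcode" bundles
// this way, e.g. an interleaved fadd/fsub pattern on <4 x float> (with
// OpcodeMask marking the fsub lanes), which can lower to a single addsubps.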
1524
1525InstructionCost X86TTIImpl::getShuffleCost(
1526     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535  // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544    return getShuffleCost(TTI::SK_InsertSubvector,
1545                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546                          CostKind, Mask.size() / 2, BaseTp);
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553    // For Broadcasts we are splatting the first element from the first input
1554    // register, so we only need to reference that input and all the output
1555    // registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
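  // Illustrative IR for the free case above - with AVX2 this folds into a
  // single vbroadcastss load:
  //   %ld    = load float, ptr %p
  //   %ins   = insertelement <8 x float> poison, float %ld, i64 0
  //   %splat = shufflevector <8 x float> %ins, <8 x float> poison,
  //                          <8 x i32> zeroinitializer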
1565
1566 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1567 // permutation.
1568 // Attempt to detect a shuffle mask with a single defined element.
1569 bool IsInLaneShuffle = false;
1570 bool IsSingleElementMask = false;
1571 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1572 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1573 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1574 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1575 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1576 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1577 if ((Mask.size() % NumLanes) == 0) {
1578 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1579 return P.value() == PoisonMaskElem ||
1580 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1581 (P.index() / NumEltsPerLane);
1582 });
1583 IsSingleElementMask =
1584 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1585 return M == PoisonMaskElem;
1586 }));
1587 }
1588 }
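  // For example, with two 128-bit lanes of 4 elements each, the v8i32 mask
  // <1,0,3,2,5,4,7,6> is an in-lane shuffle (each element stays within its
  // source lane), while <4,5,6,7,0,1,2,3> crosses lanes and is not.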
1589
1590 // Treat <X x bfloat> shuffles as <X x half>.
1591 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1592 LT.second = LT.second.changeVectorElementType(MVT::f16);
1593
1594 // Subvector extractions are free if they start at the beginning of a
1595 // vector and cheap if the subvectors are aligned.
1596 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1597 int NumElts = LT.second.getVectorNumElements();
1598 if ((Index % NumElts) == 0)
1599 return TTI::TCC_Free;
1600 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1601 if (SubLT.second.isVector()) {
1602 int NumSubElts = SubLT.second.getVectorNumElements();
1603 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1604 return SubLT.first;
1605 // Handle some cases for widening legalization. For now we only handle
1606 // cases where the original subvector was naturally aligned and evenly
1607 // fit in its legalized subvector type.
1608 // FIXME: Remove some of the alignment restrictions.
1609 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1610 // vectors.
1611 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1612 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1613 (NumSubElts % OrigSubElts) == 0 &&
1614 LT.second.getVectorElementType() ==
1615 SubLT.second.getVectorElementType() &&
1616 LT.second.getVectorElementType().getSizeInBits() ==
1617             BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1618        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1619 "Unexpected number of elements!");
1620 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1621 LT.second.getVectorNumElements());
1622 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1623 SubLT.second.getVectorNumElements());
1624 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1625 InstructionCost ExtractCost = getShuffleCost(
1626 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1627
1628 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1629 // if we have SSSE3 we can use pshufb.
1630 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1631 return ExtractCost + 1; // pshufd or pshufb
1632
1633 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1634 "Unexpected vector size");
1635
1636 return ExtractCost + 2; // worst case pshufhw + pshufd
1637 }
1638 }
1639 // If the extract subvector is not optimal, treat it as single op shuffle.
1640    Kind = TTI::SK_PermuteSingleSrc;
1641  }
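  // For example, extracting the low <4 x float> half of a legal <8 x float>
  // (Index == 0) is a free subregister read, while extracting the high half
  // (Index == 4) costs a single shuffle (vextractf128 on AVX).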
1642
1643 // Subvector insertions are cheap if the subvectors are aligned.
1644 // Note that in general, the insertion starting at the beginning of a vector
1645 // isn't free, because we need to preserve the rest of the wide vector,
1646 // but if the destination vector legalizes to the same width as the subvector
1647 // then the insertion will simplify to a (free) register copy.
1648 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1649 int NumElts = LT.second.getVectorNumElements();
1650 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1651 if (SubLT.second.isVector()) {
1652 int NumSubElts = SubLT.second.getVectorNumElements();
1653 bool MatchingTypes =
1654 NumElts == NumSubElts &&
1655 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1656 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1657 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1658 }
1659
1660 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1661 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1662 // v1f32 (legalised to f32) into a v4f32.
1663 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1664 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1665 return 1;
1666
1667 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1668 Kind = TTI::SK_PermuteTwoSrc;
1669 }
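  // For example, inserting <4 x float> into the upper half of a legal
  // <8 x float> (Index == 4) costs a single vinsertf128, while inserting it
  // at Index == 0 of a <4 x float> destination is a free register copy.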
1670
1671 // Handle some common (illegal) sub-vector types as they are often very cheap
1672 // to shuffle even on targets without PSHUFB.
1673 EVT VT = TLI->getValueType(DL, BaseTp);
1674 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1675 !ST->hasSSSE3()) {
1676 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1677 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1678 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1679 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1680 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1681 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1682
1683 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1684 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1685 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1686 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1687
1688 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1689 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1690 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1691 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1692
1693 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1694 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1695 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1696 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1697 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1698
1699 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1700 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1701 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1702 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1703 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1704 };
1705
1706 if (ST->hasSSE2())
1707 if (const auto *Entry =
1708 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1709 if (auto KindCost = Entry->Cost[CostKind])
1710 return LT.first * *KindCost;
1711 }
1712
1713  // We are going to permute multiple sources and the result will be in multiple
1714  // destinations. We provide an accurate cost only for splits where the element
1715  // type remains the same.
1716 if (LT.first != 1) {
1717 MVT LegalVT = LT.second;
1718 if (LegalVT.isVector() &&
1719 LegalVT.getVectorElementType().getSizeInBits() ==
1720             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1721         LegalVT.getVectorNumElements() <
1722 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1723 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1724 unsigned LegalVTSize = LegalVT.getStoreSize();
1725 // Number of source vectors after legalization:
1726 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1727 // Number of destination vectors after legalization:
1728 InstructionCost NumOfDests = LT.first;
1729
1730 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1731 LegalVT.getVectorNumElements());
1732
1733 if (!Mask.empty() && NumOfDests.isValid()) {
1734 // Try to perform better estimation of the permutation.
1735 // 1. Split the source/destination vectors into real registers.
1736        // 2. Do the mask analysis to identify which real registers are
1737        // permuted. If more than one source register is used to build a
1738        // destination register, the cost for this destination register
1739        // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1740        // source register is used, build the mask and calculate the cost as
1741        // a cost of PermuteSingleSrc.
1742        // Also, for the single register permute we try to identify if the
1743        // destination register is just a copy of the source register or a
1744        // copy of the previous destination register (the cost is
1745        // TTI::TCC_Basic). If the source register is just reused, the cost for
1746        // this operation is TTI::TCC_Free.
1747 NumOfDests =
1748            getTypeLegalizationCost(
1749                FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1750 .first;
1751 unsigned E = *NumOfDests.getValue();
1752 unsigned NormalizedVF =
1753 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1754 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1755 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1756 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1757 copy(Mask, NormalizedMask.begin());
1758 unsigned PrevSrcReg = 0;
1759 ArrayRef<int> PrevRegMask;
1760      InstructionCost Cost = 0;
1761      processShuffleMasks(
1762          NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1763 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1764 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1765 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1766 // Check if the previous register can be just copied to the next
1767 // one.
1768 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1769 PrevRegMask != RegMask)
1770                Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1771                                       RegMask, CostKind, 0, nullptr);
1772 else
1773 // Just a copy of previous destination register.
1774                Cost += TTI::TCC_Basic;
1775              return;
1776 }
1777 if (SrcReg != DestReg &&
1778 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1779 // Just a copy of the source register.
1780              Cost += TTI::TCC_Basic;
1781            }
1782 PrevSrcReg = SrcReg;
1783 PrevRegMask = RegMask;
1784 },
1785 [this, SingleOpTy, CostKind,
1786 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1787 unsigned /*Unused*/, bool /*Unused*/) {
1788 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1789 CostKind, 0, nullptr);
1790 });
1791 return Cost;
1792 }
1793
1794 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1795 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1796 {}, CostKind, 0, nullptr);
1797 }
1798
1799 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1800 }
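  // Illustrative case: a single-source permute of <16 x i32> on AVX2
  // legalizes to two v8i32 registers. The mask analysis above then charges
  // each destination register a one- or two-source v8i32 shuffle, or only
  // TCC_Basic/TCC_Free when it turns out to be a plain copy, rather than
  // assuming the worst case for every destination register.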
1801
1802 // If we're just moving a single element around (probably as an alternative to
1803 // extracting it), we can assume this is cheap.
1804 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1805 return TTI::TCC_Basic;
1806
1807 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1808 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1809 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1810 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1811 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1812 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1813 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1814 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1815 };
1816
1817 if (ST->hasVBMI())
1818 if (const auto *Entry =
1819 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1820 if (auto KindCost = Entry->Cost[CostKind])
1821 return LT.first * *KindCost;
1822
1823 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1824 { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1825 { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1826 { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1827
1828 { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1829 { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1830 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1831 { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
1832
1833 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1834 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1835 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1836 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1837 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1838
1839 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1840 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1841 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1842 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1843 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1844
1845 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1846 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1847
1848 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1849 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1850 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1851 };
1852
1853 if (ST->hasBWI())
1854 if (const auto *Entry =
1855 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1856 if (auto KindCost = Entry->Cost[CostKind])
1857 return LT.first * *KindCost;
1858
1859 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1860 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1861 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1862 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1863 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1864 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1865 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1866 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1867
1868 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1869 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1870 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1871 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1872 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1873 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1874 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1875
1876 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1877 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1878 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1879 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1880 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1881 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1882 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1883 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1884 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1885 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1886 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1887
1888 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1889 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1890 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1891 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1892 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1893 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1894 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1895 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1896 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1897 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1898 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1899 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1900 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1901
1902 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1903 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1904 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1905 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1906 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1907 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1908 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1909 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1910 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1911 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1912 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1913 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1914
1915 // FIXME: This just applies the type legalization cost rules above
1916 // assuming these completely split.
1917 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1918 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1919 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1920 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1921 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1922 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1923
1924 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1925 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1926 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1927 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1928 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1929 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1930 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1931 };
1932
1933 if (ST->hasAVX512())
1934 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1935 if (auto KindCost = Entry->Cost[CostKind])
1936 return LT.first * *KindCost;
1937
1938 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1939 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1940 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1941 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1942
1943 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1944 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1945 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1946 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1947 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1948 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1949 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1950 };
1951
1952 if (IsInLaneShuffle && ST->hasAVX2())
1953 if (const auto *Entry =
1954 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1955 if (auto KindCost = Entry->Cost[CostKind])
1956 return LT.first * *KindCost;
1957
1958 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1959 { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd
1960 { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps
1961 { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1962 { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1963 { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1964 { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1965 { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1966
1967 { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
1968 { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
1969 { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
1970 { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
1971 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1972 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1973 { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
1974
1975 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
1976 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
1977 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
1978
1979 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1980 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1981 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1982 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1983 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
1984
1985 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
1986 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
1987 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
1988 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
1989 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
1990 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
1991 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
1992
1993 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
1994 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
1995 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
1996 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
1997 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
1998 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
1999 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2000 };
2001
2002 if (ST->hasAVX2())
2003 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2004 if (auto KindCost = Entry->Cost[CostKind])
2005 return LT.first * *KindCost;
2006
2007 static const CostKindTblEntry XOPShuffleTbl[] = {
2008 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2009 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2010 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2011 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2012 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2013 // + vinsertf128
2014 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2015 // + vinsertf128
2016
2017 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2018 // + vinsertf128
2019
2020 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2021 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2022 // + vinsertf128
2023 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2024 };
2025
2026 if (ST->hasXOP())
2027 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2028 if (auto KindCost = Entry->Cost[CostKind])
2029 return LT.first * *KindCost;
2030
2031 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2032 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2033 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2034 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2035 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2036
2037 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2038 // + vpor + vinsertf128
2039 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2040 // + vpor + vinsertf128
2041 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2042 // + vpor + vinsertf128
2043
2044 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2045 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2046 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2047 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2048 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2049 // + 2*vpor + vinsertf128
2050 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2051 // + 2*vpor + vinsertf128
2052 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2053 // + 2*vpor + vinsertf128
2054 };
2055
2056 if (IsInLaneShuffle && ST->hasAVX())
2057 if (const auto *Entry =
2058 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2059 if (auto KindCost = Entry->Cost[CostKind])
2060 return LT.first * *KindCost;
2061
2062 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2063 {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
2064 {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
2065 {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
2066 {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
2067 {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
2068 {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
2069 {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128
2070
2071 {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
2072 {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
2073 {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
2074 {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
2075 {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
2076 // + vinsertf128
2077 {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
2078 // + vinsertf128
2079 {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb
2080 // + vinsertf128
2081
2082 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2083 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2084 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2085 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2086 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2087 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2088 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2089
2090 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2091 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2092 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2093 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2094 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2095 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2096 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2097
2098 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2099 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2100 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2101 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2102 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2103 // + 2*por + vinsertf128
2104 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2105 // + 2*por + vinsertf128
2106 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2107 // + 2*por + vinsertf128
2108
2109 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2110 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2111 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2112 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2113 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2114 // + 4*por + vinsertf128
2115 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2116 // + 4*por + vinsertf128
2117 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2118 // + 4*por + vinsertf128
2119 };
2120
2121 if (ST->hasAVX())
2122 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2123 if (auto KindCost = Entry->Cost[CostKind])
2124 return LT.first * *KindCost;
2125
2126 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2127 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2128 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2129 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2130 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2131 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2132 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2133 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2134 };
2135
2136 if (ST->hasSSE41())
2137 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2138 if (auto KindCost = Entry->Cost[CostKind])
2139 return LT.first * *KindCost;
2140
2141 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2142 {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2143 {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2144 {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2145
2146 {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2147 {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2148 {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2149
2150 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2151 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2152 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2153
2154 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2155 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2156 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2157 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2158 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2159
2160 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2161 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2162 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2163
2164 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2165 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2166 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2167 };
2168
2169 if (ST->hasSSSE3())
2170 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2171 if (auto KindCost = Entry->Cost[CostKind])
2172 return LT.first * *KindCost;
2173
2174 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2175 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2176 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2177 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2178 {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd
2179 {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
2180 {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
2181
2182 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2183 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2184 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2185 {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2186 {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2187 {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
2188 // + 2*pshufd + 2*unpck + packus
2189
2190 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2191 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2192 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2193 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2194 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2195 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2196
2197 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2198 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2199 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2200    {TTI::SK_Splice,           MVT::v8i16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2201    {TTI::SK_Splice,           MVT::v8f16, {3, 3, 3, 3}}, // psrldq + pslldq + por
2202    {TTI::SK_Splice,           MVT::v16i8, {3, 3, 3, 3}}, // psrldq + pslldq + por
2203
2204 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2205 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2206 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2207 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {5, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2208 // + pshufd/unpck
2209 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {5, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2210 // + pshufd/unpck
2211 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {10, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2212 // + 2*pshufd + 2*unpck + 2*packus
2213
2214 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2215 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2216 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2217 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {8, 8, 8, 8}}, // blend+permute
2218 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {8, 8, 8, 8}}, // blend+permute
2219 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {13, 13, 13, 13}}, // blend+permute
2220 };
2221
2222 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2223 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2224 };
2225
2226 if (ST->hasSSE2()) {
2227 bool IsLoad =
2228 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2229 if (ST->hasSSE3() && IsLoad)
2230 if (const auto *Entry =
2231 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2232        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2233                  LT.second.getVectorElementCount()) &&
2234 "Table entry missing from isLegalBroadcastLoad()");
2235 return LT.first * Entry->Cost;
2236 }
2237
2238 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2239 if (auto KindCost = Entry->Cost[CostKind])
2240 return LT.first * *KindCost;
2241 }
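  // With SSE3, a broadcast fed by a load is costed as zero above because it
  // lowers to a single load-folded movddup, e.g.
  //   movddup (%rdi), %xmm0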
2242
2243 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2244 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2245 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2246 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2247 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2248 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2249 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2250 };
2251
2252 if (ST->hasSSE1()) {
2253 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2254 // SHUFPS: both pairs must come from the same source register.
2255 auto MatchSHUFPS = [](int X, int Y) {
2256 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2257 };
2258 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2259 return 1;
2260 }
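    // For example, mask <0,1,4,5> takes the low pair from the first source
    // and the high pair from the second, so both pairs satisfy the check and
    // a single shufps suffices.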
2261 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2262 if (auto KindCost = Entry->Cost[CostKind])
2263 return LT.first * *KindCost;
2264 }
2265
2266 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2267}
2268
2269InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2270     Type *Src,
2271     TTI::CastContextHint CCH,
2272     TTI::TargetCostKind CostKind,
2273     const Instruction *I) {
2274 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2275 assert(ISD && "Invalid opcode");
2276
2277 // The cost tables include both specific, custom (non-legal) src/dst type
2278 // conversions and generic, legalized types. We test for customs first, before
2279 // falling back to legalization.
2280 // FIXME: Need a better design of the cost table to handle non-simple types of
2281 // potential massive combinations (elem_num x src_type x dst_type).
2282 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2283 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2284 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2285
2286 // Mask sign extend has an instruction.
2287 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2288 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2289 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2290 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2291 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2292 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2293 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2294 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2295 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2296 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2297 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2298 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2299 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2300 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2301 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2302 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2303 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2304
2305 // Mask zero extend is a sext + shift.
2306 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2307 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2308 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2309 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2310 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2311 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2312 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2313 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2314 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2315 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2316 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2317 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2318 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2319 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2320 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2321 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2322 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2323
2324 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2325 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2326 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2327 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2328 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2329 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2330 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2331 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2332 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2333 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2334 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2335 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2336 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2337 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2338 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2339 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2340 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2341
2342 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2343 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2344 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2345 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2346 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2347 };
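  // The "sext + shift" mask-extension entries above correspond to sequences
  // such as (illustrative, for v32i16):
  //   vpmovm2w %k0, %zmm0         ; mask -> 0 / -1 words (sign extend)
  //   vpsrlw   $15, %zmm0, %zmm0  ; -1 -> 1 (zero extend)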
2348
2349 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2350 // Mask sign extend has an instruction.
2351 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2352 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2353 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2354 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2355 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2356 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2357 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2358 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2359
2360 // Mask zero extend is a sext + shift.
2361 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2362 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2363 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2364 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2365 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2366 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2367 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2368 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2369
2370 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2378
2379 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2380 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2381
2382 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2383 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2384
2385 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2386 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2387
2388 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2389 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2390 };
2391
2392 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2393 // 256-bit wide vectors.
2394
2395 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2396 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2397 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2398 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2399 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2400 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2401 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2402 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2403
2404 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2405 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2406 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2407 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2408 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2409 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2410 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2411 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2412 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2413 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2414 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2415 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2417 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2418 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2419 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2420 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2421 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2422 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2423 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2424 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2425 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2426 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2427 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2428 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2429 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2430 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2431 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2432 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2433 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2434 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2435 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2436 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2437 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2438
2439 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2440 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2441 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2442
2443 // Sign extend is zmm vpternlogd+vptruncdb.
2444 // Zero extend is zmm broadcast load+vptruncdw.
2445 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2446 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2448 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2450 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2452 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2453
2454 // Sign extend is zmm vpternlogd+vptruncdw.
2455 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2456 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2457 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2458 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2459 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2460 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2461 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2462 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2463 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2464
2465 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2466 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2467 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2468 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2469 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2470 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2471 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2472 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2473 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2474 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2475
2476 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2477 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2478 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2479 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2480
2481 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2482 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2483 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2484 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2485 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2486 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2487 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2488 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2489 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2490 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2491
2492 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2493 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2494
2495 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2496 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2497 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2498 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2499 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2500 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2501 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2502 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2503
2504 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2505 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2506 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2507 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2508 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2509 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2510 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2511 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2512 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2513 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2514
2515 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2516 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2517 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2518 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2519 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2520 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2521 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2522 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2523 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2524 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2525 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2526
2527 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2528 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2529 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2530 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2531 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2532 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2533 };
2534
2535 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2536 // Mask sign extend has an instruction.
2537 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2538 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2539 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2540 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2541 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2542 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2543 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2544 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2545 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2546 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2547 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2548 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2549 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2550 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2551 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2552 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2553 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2554
2555 // Mask zero extend is a sext + shift.
2556 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2557 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2558 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2559 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2560 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2561 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2562 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2563 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2564 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2565 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2566 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2568 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2570 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2572 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2573
2574 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2575 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2576 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2577 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2578 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2579 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2580 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2581 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2582 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2583 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2584 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2585 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2586 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2587 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2588 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2589 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2590 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2591
2592 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2593 };
2594
2595 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2596 // Mask sign extend has an instruction.
2597 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2598 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2599 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2600 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2601 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2602 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2603 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2604 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2605
2606 // Mask zero extend is a sext + shift.
2607 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2612 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2613 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2614 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2615
2616 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2619 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2624
2625 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2626 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2627 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2628 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2629
2630 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2631 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2632 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2633 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2634
2635 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2636 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2637 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2638 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2639
2640 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2641 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2642 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2643 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2644 };
2645
2646 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2647 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2648 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2649 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2650 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2651 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2652 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2653 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2654 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2655 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2657 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2658 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2659 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2660 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2661 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2662 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2663 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2664 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2665
2666 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2667 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2668 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2669 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2670 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2671 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2672 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2673 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2674 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2675 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2676
2677 // sign extend is vpcmpeq+maskedmove+vpmovdw
2678 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2679 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2680 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2681 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2682 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2683 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2684 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2685 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2686 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2687
2688 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2689 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2690 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2691 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2692 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2693 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2694 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2695 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2696
2697 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2698 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2699 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2700 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2701
2702 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2703 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2704 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2705 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2706 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2708 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2710 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2712 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2713 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2714
2715 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2716 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2718 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2719
2720 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2721 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2722 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2724 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2725 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2726 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2727 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2728 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2729 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2730 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2731 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2732 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2733
2734 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2735 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2736 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2737
2738 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2739 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2740 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2741 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2742 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2743 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2744 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2745 };
2746
2747 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2748 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2749 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2752 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2753 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2754
2755 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2756 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2757 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2758 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2759 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2760 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2761 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2762 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2763 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2764 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2765 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2766 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2767 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2768 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2769
2770 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2771
2772 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2773 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2774 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2775 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2776 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2777 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2778 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2779 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2780 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2781 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2782 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2783 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2784
2785 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2786 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2787
2788 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2789 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2790 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2791 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2792
2793 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2794 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2795 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2796 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2797 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2798 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2799 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2800 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2801
2802 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2803 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2804 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2805 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2806 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2807 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2808 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2809
2810 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2811 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2812 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2813 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2814 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2815 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2816 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2817 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2818 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2819 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2820 };
2821
2822 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2823 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2825 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2826 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2827 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2828 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2829
2830 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2831 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2832 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2833 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2834 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2835 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2836 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2837 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2838 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2839 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2840 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2841 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2842
2843 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2844 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2845 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2846 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2847 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2848
2849 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2850 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2851 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2852 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2853 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2854 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2855 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2856 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2857
2858 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2859 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2860 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2861 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2862 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2863 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2864 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2865 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2866 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2867 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2868 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2869 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2870
2871 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2872 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2873 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2874 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2875 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2876 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2877 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2878 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2879 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2880 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2881 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2882 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2883 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2884 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2885 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2886 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2887 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2888
2889 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2890 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2891 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2892 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2893 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2894 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2895 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2896 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2897 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2898 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2899 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2900
2901 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2902 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2903 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2904 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2905 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2906 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2907 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2908 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2909 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2910 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2911 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2912 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2913 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2914
2915 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2916 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2917 };
2918
2919 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2920 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2921 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2922 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2923 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2924 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2925 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2926 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2927 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2928 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2929 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2930 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2931 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2932
2933 // These truncates end up widening elements.
2934 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2935 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2936 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2937
2938 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2939 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2940 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2941
2942 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2943 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2944 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2945 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2946 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2947 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2948 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2949 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2950 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2951 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2952 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2953
2954 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2955 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2956 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2957 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2958 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2961 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2962 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2963 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2964 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2965 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2966 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2967 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2968
2969 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2970 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2971 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2972 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2973 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2974 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2975 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2976 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2977 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2978 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2979
2980 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2981 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2982 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2983 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2984 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2985 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2986 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2987 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2988 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2989 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2990 };
2991
2992 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2993 // These are somewhat magic numbers justified by comparing the
2994 // output of llvm-mca for our various supported scheduler models
2995 // and basing them on the worst case scenario.
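// (e.g. running llvm-mca -mcpu=<cpu> over the candidate expansion for each
// supported scheduler model and keeping the worst result).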
2996 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2997 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2998 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2999 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3000 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3001 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3002 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3003 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3004 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3005 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3006 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3007 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3008
3009 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3010 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3011 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3012 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3013 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3014 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3015 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3016 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3017 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3018 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3019 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3020 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3021 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3022
3023 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3024 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3025 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3026 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3028 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3029 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3030 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3031 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3032 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3033
3034 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3035 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3036 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3037 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3038 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3039 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3040 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3041 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3042 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3043 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3044
3045 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3046 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3047 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3048 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3049 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3050 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3051 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3052 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3053 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3054 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3055 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3056 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3057
3058 // These truncates are really widening elements.
3059 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3060 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3061 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3062 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3063 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3064 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3065
3066 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3067 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3068 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3069 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3070 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3071 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3072 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3073 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3074 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3075 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3076 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3077 };
3078
3079 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3080 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3081 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3082 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3083 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3084 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3085 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3086 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3087 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3088 };
3089
3090 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3091 EVT SrcTy = TLI->getValueType(DL, Src);
3092 EVT DstTy = TLI->getValueType(DL, Dst);
3093
3094 // The function getSimpleVT only handles simple value types.
3095 if (SrcTy.isSimple() && DstTy.isSimple()) {
3096 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3097 MVT SimpleDstTy = DstTy.getSimpleVT();
3098
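// The tables are searched from the most specific feature level to the
// least specific (AVX512BW/DQ/F when 512-bit registers are usable, then
// the 128/256-bit AVX512VL variants, then AVX2/AVX/F16C/SSE4.1/SSE2);
// the first entry that defines a cost for this CostKind wins.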
3099 if (ST->useAVX512Regs()) {
3100 if (ST->hasBWI())
3101 if (const auto *Entry = ConvertCostTableLookup(
3102 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3103 if (auto KindCost = Entry->Cost[CostKind])
3104 return *KindCost;
3105
3106 if (ST->hasDQI())
3107 if (const auto *Entry = ConvertCostTableLookup(
3108 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3109 if (auto KindCost = Entry->Cost[CostKind])
3110 return *KindCost;
3111
3112 if (ST->hasAVX512())
3113 if (const auto *Entry = ConvertCostTableLookup(
3114 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3115 if (auto KindCost = Entry->Cost[CostKind])
3116 return *KindCost;
3117 }
3118
3119 if (ST->hasBWI())
3120 if (const auto *Entry = ConvertCostTableLookup(
3121 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3122 if (auto KindCost = Entry->Cost[CostKind])
3123 return *KindCost;
3124
3125 if (ST->hasDQI())
3126 if (const auto *Entry = ConvertCostTableLookup(
3127 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3128 if (auto KindCost = Entry->Cost[CostKind])
3129 return *KindCost;
3130
3131 if (ST->hasAVX512())
3132 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3133 SimpleDstTy, SimpleSrcTy))
3134 if (auto KindCost = Entry->Cost[CostKind])
3135 return *KindCost;
3136
3137 if (ST->hasAVX2()) {
3138 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3139 SimpleDstTy, SimpleSrcTy))
3140 if (auto KindCost = Entry->Cost[CostKind])
3141 return *KindCost;
3142 }
3143
3144 if (ST->hasAVX()) {
3145 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3146 SimpleDstTy, SimpleSrcTy))
3147 if (auto KindCost = Entry->Cost[CostKind])
3148 return *KindCost;
3149 }
3150
3151 if (ST->hasF16C()) {
3152 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3153 SimpleDstTy, SimpleSrcTy))
3154 if (auto KindCost = Entry->Cost[CostKind])
3155 return *KindCost;
3156 }
3157
3158 if (ST->hasSSE41()) {
3159 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3160 SimpleDstTy, SimpleSrcTy))
3161 if (auto KindCost = Entry->Cost[CostKind])
3162 return *KindCost;
3163 }
3164
3165 if (ST->hasSSE2()) {
3166 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3167 SimpleDstTy, SimpleSrcTy))
3168 if (auto KindCost = Entry->Cost[CostKind])
3169 return *KindCost;
3170 }
3171
3172 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3173 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3174 // fp16 conversions not covered by any table entries require a libcall.
3175 // Return a large (arbitrary) number to model this.
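// (Such conversions typically lower to per-element __extendhfsf2 /
// __truncsfhf2 libcalls; the exact calls are target/ABI dependent.)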
3176 return InstructionCost(64);
3177 }
3178 }
3179
3180 // Fall back to legalized types.
3181 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3182 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3183
3184 // If we're truncating to the same legalized type, just assume it's free.
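// (e.g. on pre-AVX512 targets a <4 x i1> value is promoted back to
// v4i32, so trunc <4 x i32> -> <4 x i1> becomes a no-op after
// legalization).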
3185 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3186 return TTI::TCC_Free;
3187
3188 if (ST->useAVX512Regs()) {
3189 if (ST->hasBWI())
3190 if (const auto *Entry = ConvertCostTableLookup(
3191 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3194
3195 if (ST->hasDQI())
3196 if (const auto *Entry = ConvertCostTableLookup(
3197 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3198 if (auto KindCost = Entry->Cost[CostKind])
3199 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3200
3201 if (ST->hasAVX512())
3202 if (const auto *Entry = ConvertCostTableLookup(
3203 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3204 if (auto KindCost = Entry->Cost[CostKind])
3205 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3206 }
3207
3208 if (ST->hasBWI())
3209 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3210 LTDest.second, LTSrc.second))
3211 if (auto KindCost = Entry->Cost[CostKind])
3212 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3213
3214 if (ST->hasDQI())
3215 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3216 LTDest.second, LTSrc.second))
3217 if (auto KindCost = Entry->Cost[CostKind])
3218 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3219
3220 if (ST->hasAVX512())
3221 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3222 LTDest.second, LTSrc.second))
3223 if (auto KindCost = Entry->Cost[CostKind])
3224 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3225
3226 if (ST->hasAVX2())
3227 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3228 LTDest.second, LTSrc.second))
3229 if (auto KindCost = Entry->Cost[CostKind])
3230 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3231
3232 if (ST->hasAVX())
3233 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3234 LTDest.second, LTSrc.second))
3235 if (auto KindCost = Entry->Cost[CostKind])
3236 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3237
3238 if (ST->hasF16C()) {
3239 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3240 LTDest.second, LTSrc.second))
3241 if (auto KindCost = Entry->Cost[CostKind])
3242 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3243 }
3244
3245 if (ST->hasSSE41())
3246 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3247 LTDest.second, LTSrc.second))
3248 if (auto KindCost = Entry->Cost[CostKind])
3249 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3250
3251 if (ST->hasSSE2())
3252 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3253 LTDest.second, LTSrc.second))
3254 if (auto KindCost = Entry->Cost[CostKind])
3255 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3256
3257 // Fallback for i8/i16 sitofp/uitofp cases: we need to extend to i32 and
3258 // then use sitofp.
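// E.g. sitofp i16 -> f32 is costed as (sext i16 -> i32) +
// (sitofp i32 -> f32); for uitofp the zero-extended value is known
// non-negative, so the signed i32 conversion remains correct.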
3259 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3260 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3261 Type *ExtSrc = Src->getWithNewBitWidth(32);
3262 unsigned ExtOpc =
3263 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3264
3265 // For scalar loads the extend would be free.
3266 InstructionCost ExtCost = 0;
3267 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3268 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3269
3270 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3271 TTI::CastContextHint::None, CostKind);
3272 }
3273
3274 // Fallback for fptosi/fptoui i8/i16 cases: convert to i32 via fptosi and
3275 // then truncate the result.
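// E.g. fptoui f32 -> i8 is costed as (fptosi f32 -> i32) +
// (trunc i32 -> i8); the signed conversion suffices since only the low
// bits of the i32 result are kept.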
3276 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3277 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3278 Type *TruncDst = Dst->getWithNewBitWidth(32);
3279 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3280 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3281 TTI::CastContextHint::None, CostKind);
3282 }
3283
3284 // TODO: Allow non-throughput costs that aren't binary.
3285 auto AdjustCost = [&CostKind](InstructionCost Cost,
3286 InstructionCost N = 1) -> InstructionCost {
3287 if (CostKind != TTI::TCK_RecipThroughput)
3288 return Cost == 0 ? 0 : N;
3289 return Cost * N;
3290 };
3291 return AdjustCost(
3292 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3293}
3294
3295InstructionCost X86TTIImpl::getCmpSelInstrCost(
3296 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3297 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3298 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3299 // Early out if this type isn't scalar/vector integer/float.
3300 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3301 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3302 Op1Info, Op2Info, I);
3303
3304 // Legalize the type.
3305 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3306
3307 MVT MTy = LT.second;
3308
3309 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3310 assert(ISD && "Invalid opcode");
3311
3312 InstructionCost ExtraCost = 0;
3313 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3314 // Some vector comparison predicates cost extra instructions.
3315 // TODO: Adjust ExtraCost based on CostKind?
3316 // TODO: Should we invert this and assume worst case cmp costs
3317 // and reduce for particular predicates?
3318 if (MTy.isVector() &&
3319 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3320 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3321 ST->hasBWI())) {
3322 // Fall back to I if a specific predicate wasn't specified.
3323 CmpInst::Predicate Pred = VecPred;
3324 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3325 Pred == CmpInst::BAD_FCMP_PREDICATE))
3326 Pred = cast<CmpInst>(I)->getPredicate();
3327
3328 bool CmpWithConstant = false;
3329 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3330 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3331
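// Comparisons against a constant RHS are assumed cheaper below, as the
// extra negation/adjustment can typically be folded away (e.g. by
// adjusting the constant or swapping select operands).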
3332 switch (Pred) {
3333 case CmpInst::Predicate::ICMP_NE:
3334 // xor(cmpeq(x,y),-1)
3335 ExtraCost = CmpWithConstant ? 0 : 1;
3336 break;
3337 case CmpInst::Predicate::ICMP_SGE:
3338 case CmpInst::Predicate::ICMP_SLE:
3339 // xor(cmpgt(x,y),-1)
3340 ExtraCost = CmpWithConstant ? 0 : 1;
3341 break;
3342 case CmpInst::Predicate::ICMP_ULT:
3343 case CmpInst::Predicate::ICMP_UGT:
3344 // cmpgt(xor(x,signbit),xor(y,signbit))
3345 // xor(cmpeq(pmaxu(x,y),x),-1)
3346 ExtraCost = CmpWithConstant ? 1 : 2;
3347 break;
3348 case CmpInst::Predicate::ICMP_ULE:
3349 case CmpInst::Predicate::ICMP_UGE:
3350 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3351 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3352 // cmpeq(psubus(x,y),0)
3353 // cmpeq(pminu(x,y),x)
3354 ExtraCost = 1;
3355 } else {
3356 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3357 ExtraCost = CmpWithConstant ? 2 : 3;
3358 }
3359 break;
3360 case CmpInst::Predicate::FCMP_ONE:
3361 case CmpInst::Predicate::FCMP_UEQ:
3362 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3363 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3364 if (CondTy && !ST->hasAVX())
3365 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3366 CmpInst::Predicate::FCMP_UNO, CostKind,
3367 Op1Info, Op2Info) +
3368 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3369 CmpInst::Predicate::FCMP_OEQ, CostKind,
3370 Op1Info, Op2Info) +
3371 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3372
3373 break;
3374 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3375 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3376 // Assume worst case scenario and add the maximum extra cost.
3377 ExtraCost = 3;
3378 break;
3379 default:
3380 break;
3381 }
3382 }
3383 }
3384
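// Find the SETCC/SELECT cost for the legalized type MTy in the most
// specific applicable table below; the matched cost (plus any predicate
// ExtraCost) is scaled by LT.first to account for type splitting.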
3385 static const CostKindTblEntry SLMCostTbl[] = {
3386 // slm pcmpeq/pcmpgt throughput is 2
3387 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3388 // slm pblendvb/blendvpd/blendvps throughput is 4
3389 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3390 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3391 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3392 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3393 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3394 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3395 };
3396
3397 static const CostKindTblEntry AVX512BWCostTbl[] = {
3398 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3399 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3400 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3401 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3402
3403 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3404 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3405 };
3406
3407 static const CostKindTblEntry AVX512CostTbl[] = {
3408 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3409 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3410 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3411 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3412
3413 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3414 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3415 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3416 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3417 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3418 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3419 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3420
3421 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3422 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3423 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3424 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3425 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3426 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3427 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3428 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3429 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3430 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3431 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3432 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3433 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3434 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3435
3436 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3437 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3438 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3439 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3440 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3441 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3442 };
3443
3444 static const CostKindTblEntry AVX2CostTbl[] = {
3445 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3446 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3447 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3448 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3449 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3450 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3451
3452 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3453 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3454 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3455 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3456
3457 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3458 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3459 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3460 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3461 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3462 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3463 };
3464
3465 static const CostKindTblEntry XOPCostTbl[] = {
3466 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3467 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3468 };
3469
3470 static const CostKindTblEntry AVX1CostTbl[] = {
3471 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3472 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3473 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3474 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3475 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3476 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3477
3478 // AVX1 does not support 8-wide integer compare.
3479 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3480 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3481 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3482 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
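    // Editor illustration (assumed split): e.g. a v8i32 icmp here becomes two
    // 128-bit pcmpeq/pcmpgt ops plus vextractf128/vinsertf128 glue, which is
    // what the elevated costs on the four rows above reflect.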
3483
3484 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3485 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3486 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3487 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3488 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3489 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3490 };
3491
3492 static const CostKindTblEntry SSE42CostTbl[] = {
3493 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3494 };
3495
3496 static const CostKindTblEntry SSE41CostTbl[] = {
3497 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3498 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3499
3500 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3501 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3502 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3503 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3504 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3505 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3506 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3507 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3508 };
3509
3510 static const CostKindTblEntry SSE2CostTbl[] = {
3511 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3512 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3513
3514 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3515 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3516 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3517 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3518
3519 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3520 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3521 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3522 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3523 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3524 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3525 };
3526
3527 static const CostKindTblEntry SSE1CostTbl[] = {
3528 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3529 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3530
3531 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3532 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3533 };
3534
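  // Lookup-order sketch (editor illustration, not in the original source):
  // the guards below run from the most specific feature set down to SSE1, so
  // the first table containing (ISD, MTy) wins. For example, on an AVX2-only
  // target a v8i32 SETCC resolves to the AVX2CostTbl row { 1, 1, 1, 2 } and
  // the AVX1 fallback { 4, 2, 5, 6 } is never consulted; the selected value
  // is then scaled as LT.first * (ExtraCost + KindCost) to account for types
  // split during legalization.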
3535 if (ST->useSLMArithCosts())
3536 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3537 if (auto KindCost = Entry->Cost[CostKind])
3538 return LT.first * (ExtraCost + *KindCost);
3539
3540 if (ST->hasBWI())
3541 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3542 if (auto KindCost = Entry->Cost[CostKind])
3543 return LT.first * (ExtraCost + *KindCost);
3544
3545 if (ST->hasAVX512())
3546 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3547 if (auto KindCost = Entry->Cost[CostKind])
3548 return LT.first * (ExtraCost + *KindCost);
3549
3550 if (ST->hasAVX2())
3551 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3552 if (auto KindCost = Entry->Cost[CostKind])
3553 return LT.first * (ExtraCost + *KindCost);
3554
3555 if (ST->hasXOP())
3556 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3557 if (auto KindCost = Entry->Cost[CostKind])
3558 return LT.first * (ExtraCost + *KindCost);
3559
3560 if (ST->hasAVX())
3561 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3562 if (auto KindCost = Entry->Cost[CostKind])
3563 return LT.first * (ExtraCost + *KindCost);
3564
3565 if (ST->hasSSE42())
3566 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3567 if (auto KindCost = Entry->Cost[CostKind])
3568 return LT.first * (ExtraCost + *KindCost);
3569
3570 if (ST->hasSSE41())
3571 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3572 if (auto KindCost = Entry->Cost[CostKind])
3573 return LT.first * (ExtraCost + *KindCost);
3574
3575 if (ST->hasSSE2())
3576 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3577 if (auto KindCost = Entry->Cost[CostKind])
3578 return LT.first * (ExtraCost + *KindCost);
3579
3580 if (ST->hasSSE1())
3581 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3582 if (auto KindCost = Entry->Cost[CostKind])
3583 return LT.first * (ExtraCost + *KindCost);
3584
3585 // Assume a 3cy latency for fp select ops.
3586 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3587 if (ValTy->getScalarType()->isFloatingPointTy())
3588 return 3;
3589
3590 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3591 Op1Info, Op2Info, I);
3592}
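// Usage sketch (editor illustration, not part of the original file): clients
// normally reach this hook through the TargetTransformInfo wrapper rather
// than calling X86TTIImpl directly. Assuming a TTI reference and an
// LLVMContext Ctx are in scope, a v4f32 select cost query might look like:
//
//   Type *ValTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
//   Type *CondTy = FixedVectorType::get(Type::getInt1Ty(Ctx), 4);
//   InstructionCost C = TTI.getCmpSelInstrCost(
//       Instruction::Select, ValTy, CondTy, CmpInst::BAD_ICMP_PREDICATE,
//       TTI::TCK_RecipThroughput);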
3593
3594InstructionCost
3595X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3596                                  TTI::TargetCostKind CostKind) {
3599 // Costs should match the codegen from:
3600 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3601 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3602 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3603 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3604 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3605
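  // Cross-check sketch (editor illustration, assumed workflow): a row such as
  // the SSE2 CTPOP v16i8 entry can be compared against the scheduler models
  // by compiling the matching test and feeding the assembly to llvm-mca, e.g.
  //
  //   llc -mtriple=x86_64-- -mattr=+sse2 vector-popcnt-128.ll -o popcnt.s
  //   llvm-mca -mtriple=x86_64-- -mattr=+sse2 popcnt.s
  //
  // where the reported reciprocal throughput and latency should roughly track
  // the first two values of the CostKindTblEntry tuple.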
3606 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3607 // specialized in these tables yet.
3608 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3609 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3610 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3611 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3612 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3613 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3614 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3615 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3616 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3617 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3618 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3619 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3620 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3621 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3622 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3623 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3624 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3625 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3626 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3627 };
3628 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3629 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3630 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3631 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3632 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3633 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3634 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3635 };
3636 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3637 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3638 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3639 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3640 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3641 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3642 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3643 };
3644 static const CostKindTblEntry AVX512CDCostTbl[] = {
3645 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3646 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3647 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3648 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3649 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3650 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3651 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3652 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3653 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3654 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3655 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3656 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3657
3658 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3659 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3660 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3661 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3662 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3663 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3664 };
3665 static const CostKindTblEntry AVX512BWCostTbl[] = {
3666 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3667 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3668 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3669 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3670 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3671 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3672 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3673 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3674 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3675 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3676 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3677 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3678 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3679 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3680 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3681 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3682 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3683 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3684 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3685 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3686 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3687 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3688 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3689 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3690 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3691 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3692 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3693 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3694 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3695 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3696 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3697 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3698 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3699 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3700 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3701 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3702 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3703 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3704 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3705 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3706 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3707 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3708 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3709 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3710 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3711 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3712 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3713 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3714 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3715 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3716 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3717 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3718 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3719 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3720 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3721 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3722 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3723 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3724 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3725 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3726 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3727 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3728 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3729 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3730 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3731 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3732 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3733 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3734 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3735 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3736 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3737 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3738 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3739 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3740 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3741 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3742 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3743 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3744 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3745 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3746 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3747 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3748 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3749 };
3750 static const CostKindTblEntry AVX512CostTbl[] = {
3751 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3752 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3753 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3754 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3755 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3756 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3757 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3758 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3759 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3760 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3761 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3762 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3763 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3764 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3765 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3766 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3767 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3768 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3769 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3770 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3771 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3772 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3773 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3774 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3775 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3776 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3777 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3778 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3779 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3780 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3781 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3782 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3783 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3784 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3785 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3786 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3787 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3788 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3789 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3790 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3791 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3792 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3793 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3794 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3795 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3796 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3797 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3798 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3799 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3800 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3801 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3802 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3803 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3804 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3805 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3806 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3807 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3808 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3809 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3810 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3811 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3812 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3813 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3814 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3815 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3816 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3817 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3818 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3819 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3820 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3821 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3822 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3823 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3824 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3825 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3826 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3827 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3828 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3829 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3830 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3831 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3832 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3833 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3834 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3835 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3836 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3837 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3838 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3839 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3840 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3841 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3842 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3843 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3844 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3845 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3846 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3847 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3848 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3849 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3850 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3851 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3852 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3853 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3854 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3855 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3856 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3857 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3858 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3859 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3860 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3861 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3862 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3863 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3864 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3865 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3866 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3867 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3868 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3869 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3870 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3871 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3872 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3873 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3874 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3875 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3876 };
3877 static const CostKindTblEntry XOPCostTbl[] = {
3878 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3879 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3880 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3881 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3882 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3883 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3884 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3885 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3886 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3887 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3888 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3889 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3890 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
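    // Editor illustration (assumed lowering): a variable rotate-right is
    // modelled as a negate feeding the left-rotate instruction, e.g.
    //   vpxor  %xmm2, %xmm2, %xmm2
    //   vpsubd %xmm1, %xmm2, %xmm2   ; 0 - amt
    //   vprotd %xmm2, %xmm0, %xmm0   ; VPROT(x, -amt)
    // hence the ROTR rows below cost slightly more than their ROTL twins.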
3891 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3892 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3893 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3894 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3895 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3896 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3897 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3898 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3899 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3900 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3901 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3902 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3903 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3904 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3905 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3906 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3907 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3908 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3909 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3910 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3911 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3912 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3913 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3914 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3915 };
3916 static const CostKindTblEntry AVX2CostTbl[] = {
3917 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3918 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3919 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3920 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3921 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3922 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3923 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3924 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3925 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3926 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3927 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3928 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3929 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3930 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3931 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3932 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3933 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3934 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3935 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3936 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3937 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3938 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3939 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3940 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3941 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3942 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3943 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3944 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3945 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3946 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3947 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3948 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3949 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3950 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3951 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3952 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3953 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3954 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3955 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3956 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3957 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3958 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3959 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3960 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3961 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3962 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3963 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3964 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3965 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3966 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3967 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3968 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3969 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3970 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3971 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3972 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3973 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3974 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3975 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3976 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3977 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3978 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3979 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3980 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3981 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3982 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3983 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3984 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3985 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3986 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3987 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3988 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3989 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3990 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3991 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3992 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3993 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3994 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3995 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3996 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3997 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3998 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3999 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4000 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4001 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4002 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4003 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4004 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4005 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4006 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4007 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4008 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4009 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4010 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4011 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4012 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4013 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4014 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4015 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4016 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4017 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4018 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4019 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4020 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4021 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4022 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4023 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4024 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4025 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4026 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4027 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4028 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4029 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4030 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4031 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4032 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4033 };
4034 static const CostKindTblEntry AVX1CostTbl[] = {
4035 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4036 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4037 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4038 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4039 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4040 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4041 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4042 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4043 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4044 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4045 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4046 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4047 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4048 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4049 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4050 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4051 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4052 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4053 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4054 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4055 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4056 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4057 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4058 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4059 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4060 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4061 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4062 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4063 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4064 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4065 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4066 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4067 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4068 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4069 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4070 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4071 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4072 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4073 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4074 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4075 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4076 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4077 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4078 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4079 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4081 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4083 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4084 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4088 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4089 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4090 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4091 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4093 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4094 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4095 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4096 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4097 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4098 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4099 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4100 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4101 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4103 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4105 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4106 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4107 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4109 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4111 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4112 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4113 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4116 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4117 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4120 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4121 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4122 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4123 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4124 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4125 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4126 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4127 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4128 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4129 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4132 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4133 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4134 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4135 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4136 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4137 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4138 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4139 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4140 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4141 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4142 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4143 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4144 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4145 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4146 };
4147 static const CostKindTblEntry GFNICostTbl[] = {
4148 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4149 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4150 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4151 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4152 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4153 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4154 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4155 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4156 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4157 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4158 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4159 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4160 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4161 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4162 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4163 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4164 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4165 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4166 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4167 };
4168 static const CostKindTblEntry GLMCostTbl[] = {
4169 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4170 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4171 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4172 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4173 };
4174 static const CostKindTblEntry SLMCostTbl[] = {
4175 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4176 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4177 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4178 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4179 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4180 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4181 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4182 };
4183 static const CostKindTblEntry SSE42CostTbl[] = {
4184 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4185 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4186 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4187 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4188 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4189 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4190 };
4191 static const CostKindTblEntry SSE41CostTbl[] = {
4192 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4193 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4194 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4195 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4196 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4197 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4198 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4199 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4200 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4201 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4202 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4203 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4204 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4205 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4206 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4207 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4208 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4209 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4210 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4211 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4212 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4213 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4214 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4215 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4216 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4217 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4218 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4219 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4220 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4221 };
4222 static const CostKindTblEntry SSSE3CostTbl[] = {
4223 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4224 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4225 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4226 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4227 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4228 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4229 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4230 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4231 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4232 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4233 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4234 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4235 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4236 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4237 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4238 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4239 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4240 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4241 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4242 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4243 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4244 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4245 };
4246 static const CostKindTblEntry SSE2CostTbl[] = {
4247 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4248 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4249 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4250 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4251 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4252 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4253 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4254 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4255 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4256 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4257 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4258 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4259 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4260 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4261 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4262 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4263 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4264 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4265 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4266 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4267 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4268 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4269 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4270 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4271 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4272 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4273 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4274 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4275 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4276 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4277 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4278 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4279 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4280 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4281 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4282 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4283 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4284 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4285 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4286 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4287 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4288 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4289 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4290 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4291 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4292 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4293 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4294 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4295 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4296 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4297 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4298 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4299 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4300 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4301 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4302 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4303 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4304 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4305 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4306 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4307 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4308 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4309 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4310 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4311 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4312 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4313 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4314 };
4315 static const CostKindTblEntry SSE1CostTbl[] = {
4316 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4317 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4318 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4319 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4320 };
4321 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4322 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4323 };
4324 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4325 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4326 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4327 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4328 };
4329 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4330 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4331 };
4332 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4333 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4334 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4335 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4336 };
4337 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4338 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4339 };
4340 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4341 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4342 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4343 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4344 };
4345 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4346 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4347 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4348 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4349 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4350 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4351 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4352 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4353 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4354 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4355 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4356 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4357 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4358 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4359 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4360 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4361 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4362 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4363 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4364 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4365 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4366 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4367 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4368 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4369 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4370 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4371 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4372 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4373 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4374 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4375 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4376 };
4377 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4378 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4379 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4380 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4381 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4382 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4383 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4384 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4385 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4386 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4387 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4388 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4389 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4390 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4391 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4392 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4393 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4394 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4395 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4396 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4397 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4398 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4399 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4400 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4401 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4402 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4403 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4404 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4405 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4406 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4407 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4408 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4409 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4410 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4411 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4412 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4413 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4414 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4415 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4416 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4417 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4418 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4419 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4420 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4421 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4422 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4423 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4424 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4425 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4426 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4427 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4428 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4429 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4430 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4431 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4432 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4433 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4434 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4435 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4436 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4437 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4438 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4439 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4440 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4441 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4442 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4443 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4444 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4445 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4446 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4447 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4448 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4449 };
4450
4451 Type *RetTy = ICA.getReturnType();
4452 Type *OpTy = RetTy;
4453 Intrinsic::ID IID = ICA.getID();
4454 unsigned ISD = ISD::DELETED_NODE;
4455 switch (IID) {
4456 default:
4457 break;
4458 case Intrinsic::abs:
4459 ISD = ISD::ABS;
4460 break;
4461 case Intrinsic::bitreverse:
4462 ISD = ISD::BITREVERSE;
4463 break;
4464 case Intrinsic::bswap:
4465 ISD = ISD::BSWAP;
4466 break;
4467 case Intrinsic::ctlz:
4468 ISD = ISD::CTLZ;
4469 break;
4470 case Intrinsic::ctpop:
4471 ISD = ISD::CTPOP;
4472 break;
4473 case Intrinsic::cttz:
4474 ISD = ISD::CTTZ;
4475 break;
4476 case Intrinsic::fshl:
4477 ISD = ISD::FSHL;
4478 if (!ICA.isTypeBasedOnly()) {
4479 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4480 if (Args[0] == Args[1]) {
4481 ISD = ISD::ROTL;
4482 // Handle uniform constant rotation amounts.
4483 // TODO: Handle funnel-shift cases.
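        // Editor illustration: fshl with matching operands is a rotate, e.g.
        //   call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 5)   ; == rotl(%x, 5)
        // and a uniform constant amount maps to the immediate-form VROTLI rows.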
4484 const APInt *Amt;
4485        if (Args[2] &&
4486            PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4487 ISD = X86ISD::VROTLI;
4488 }
4489 }
4490 break;
4491 case Intrinsic::fshr:
4492 // FSHR has same costs so don't duplicate.
4493 ISD = ISD::FSHL;
4494 if (!ICA.isTypeBasedOnly()) {
4495 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4496 if (Args[0] == Args[1]) {
4497 ISD = ISD::ROTR;
4498 // Handle uniform constant rotation amount.
4499 // TODO: Handle funnel-shift cases.
4500 const APInt *Amt;
4501        if (Args[2] &&
4502            PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4503 ISD = X86ISD::VROTLI;
4504 }
4505 }
4506 break;
4507 case Intrinsic::lrint:
4508 case Intrinsic::llrint: {
4509 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4510 // have the same costs as the CVTTP2SI (fptosi) instructions
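    // Editor illustration (assumed mapping): e.g. llvm.lrint on <4 x float>
    // with a <4 x i32> result is priced like the fptosi below, i.e. as a
    // single cvtps2dq-class conversion.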
4511 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4512    return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4513                            TTI::CastContextHint::None, CostKind);
4514 }
4515 case Intrinsic::maxnum:
4516 case Intrinsic::minnum:
4517 // FMINNUM has same costs so don't duplicate.
4518 ISD = ISD::FMAXNUM;
4519 break;
4520 case Intrinsic::sadd_sat:
4521 ISD = ISD::SADDSAT;
4522 break;
4523 case Intrinsic::smax:
4524 ISD = ISD::SMAX;
4525 break;
4526 case Intrinsic::smin:
4527 ISD = ISD::SMIN;
4528 break;
4529 case Intrinsic::ssub_sat:
4530 ISD = ISD::SSUBSAT;
4531 break;
4532 case Intrinsic::uadd_sat:
4533 ISD = ISD::UADDSAT;
4534 break;
4535 case Intrinsic::umax:
4536 ISD = ISD::UMAX;
4537 break;
4538 case Intrinsic::umin:
4539 ISD = ISD::UMIN;
4540 break;
4541 case Intrinsic::usub_sat:
4542 ISD = ISD::USUBSAT;
4543 break;
4544 case Intrinsic::sqrt:
4545 ISD = ISD::FSQRT;
4546 break;
4547 case Intrinsic::sadd_with_overflow:
4548 case Intrinsic::ssub_with_overflow:
4549 // SSUBO has same costs so don't duplicate.
4550 ISD = ISD::SADDO;
4551 OpTy = RetTy->getContainedType(0);
4552 break;
4553 case Intrinsic::uadd_with_overflow:
4554 case Intrinsic::usub_with_overflow:
4555 // USUBO has same costs so don't duplicate.
4556 ISD = ISD::UADDO;
4557 OpTy = RetTy->getContainedType(0);
4558 break;
4559 case Intrinsic::smul_with_overflow:
4560 ISD = ISD::SMULO;
4561 OpTy = RetTy->getContainedType(0);
4562 break;
4563 case Intrinsic::umul_with_overflow:
4564 ISD = ISD::UMULO;
4565 OpTy = RetTy->getContainedType(0);
4566 break;
4567 }
4568
4569 if (ISD != ISD::DELETED_NODE) {
4570 auto adjustTableCost = [&](int ISD, unsigned Cost,
4571                               std::pair<InstructionCost, MVT> LT,
4572                               FastMathFlags FMF) -> InstructionCost {
4573 InstructionCost LegalizationCost = LT.first;
4574 MVT MTy = LT.second;
4575
4576 // If there are no NANs to deal with, then these are reduced to a
4577 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4578 // assume is used in the non-fast case.
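      // Editor illustration: with nnan, maxnum(<4 x float>) can stay a lone
      // (v)maxps, whereas the table rows price the full
      // maxps + cmpunordps + blendvps select sequence.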
4579 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4580 if (FMF.noNaNs())
4581 return LegalizationCost * 1;
4582 }
4583
4584 // For cases where some ops can be folded into a load/store, assume free.
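      // Editor illustration (assumed fold): an i32 bswap whose only use is a
      // store can become a single "movbe %eax, (%rdi)" on fast-MOVBE targets,
      // and a one-use loaded operand folds the same way, hence TCC_Free below.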
4585 if (MTy.isScalarInteger()) {
4586 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4587 if (const Instruction *II = ICA.getInst()) {
4588 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4589 return TTI::TCC_Free;
4590 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4591 if (LI->hasOneUse())
4592 return TTI::TCC_Free;
4593 }
4594 }
4595 }
4596 }
4597
4598 return LegalizationCost * (int)Cost;
4599 };
4600
4601 // Legalize the type.
4602 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4603 MVT MTy = LT.second;
4604
4605 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
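    // Editor illustration: @llvm.cttz.i32(i32 %x, i1 true) asserts %x != 0,
    // so a bare BSF suffices and the cheaper CTTZ_ZERO_UNDEF rows above apply.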
4606 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4607 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4608 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4609 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4610 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4611        if (Cst->isAllOnesValue())
4612          ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4613 }
4614
4615 // FSQRT is a single instruction.
4616 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4617 return LT.first;
4618
4619 if (ST->useGLMDivSqrtCosts())
4620 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4621 if (auto KindCost = Entry->Cost[CostKind])
4622 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4623
4624 if (ST->useSLMArithCosts())
4625 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4626 if (auto KindCost = Entry->Cost[CostKind])
4627 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4628
4629 if (ST->hasVBMI2())
4630 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4631 if (auto KindCost = Entry->Cost[CostKind])
4632 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4633
4634 if (ST->hasBITALG())
4635 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4636 if (auto KindCost = Entry->Cost[CostKind])
4637 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4638
4639 if (ST->hasVPOPCNTDQ())
4640 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4641 if (auto KindCost = Entry->Cost[CostKind])
4642 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4643
4644 if (ST->hasGFNI())
4645 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4646 if (auto KindCost = Entry->Cost[CostKind])
4647 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4648
4649 if (ST->hasCDI())
4650 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4651 if (auto KindCost = Entry->Cost[CostKind])
4652 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4653
4654 if (ST->hasBWI())
4655 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4656 if (auto KindCost = Entry->Cost[CostKind])
4657 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4658
4659 if (ST->hasAVX512())
4660 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4661 if (auto KindCost = Entry->Cost[CostKind])
4662 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4663
4664 if (ST->hasXOP())
4665 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4666 if (auto KindCost = Entry->Cost[CostKind])
4667 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4668
4669 if (ST->hasAVX2())
4670 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4671 if (auto KindCost = Entry->Cost[CostKind])
4672 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4673
4674 if (ST->hasAVX())
4675 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4676 if (auto KindCost = Entry->Cost[CostKind])
4677 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4678
4679 if (ST->hasSSE42())
4680 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4681 if (auto KindCost = Entry->Cost[CostKind])
4682 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4683
4684 if (ST->hasSSE41())
4685 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4686 if (auto KindCost = Entry->Cost[CostKind])
4687 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688
4689 if (ST->hasSSSE3())
4690 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4691 if (auto KindCost = Entry->Cost[CostKind])
4692 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4693
4694 if (ST->hasSSE2())
4695 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4696 if (auto KindCost = Entry->Cost[CostKind])
4697 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4698
4699 if (ST->hasSSE1())
4700 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4701 if (auto KindCost = Entry->Cost[CostKind])
4702 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4703
4704 if (ST->hasBMI()) {
4705 if (ST->is64Bit())
4706 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4707 if (auto KindCost = Entry->Cost[CostKind])
4708 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4709
4710 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4711 if (auto KindCost = Entry->Cost[CostKind])
4712 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4713 }
4714
4715 if (ST->hasLZCNT()) {
4716 if (ST->is64Bit())
4717 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4718 if (auto KindCost = Entry->Cost[CostKind])
4719 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4720
4721 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4722 if (auto KindCost = Entry->Cost[CostKind])
4723 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4724 }
4725
4726 if (ST->hasPOPCNT()) {
4727 if (ST->is64Bit())
4728 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4733 if (auto KindCost = Entry->Cost[CostKind])
4734 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4735 }
4736
4737 if (ST->is64Bit())
4738 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4743 if (auto KindCost = Entry->Cost[CostKind])
4744 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4745
4746 // Without arg data, we need to compute the expanded costs of custom lowered
4747 // intrinsics to prevent use of the (very low) default costs.
4748 if (ICA.isTypeBasedOnly() &&
4749 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4750 Type *CondTy = RetTy->getWithNewBitWidth(1);
4751 InstructionCost Cost = 0;
4752 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4753 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4754 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4755 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4756 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4757 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4758 CmpInst::ICMP_EQ, CostKind);
4759 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4760 CmpInst::ICMP_EQ, CostKind);
4761 return Cost;
4762 }
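// Illustrative note: the seven costs summed above mirror the generic
// funnel-shift expansion fshl(x, y, z) ->
// or(shl(x, z & (bw-1)), lshr(y, (bw - z) & (bw-1))),
// with a cmp+select guarding the zero-shift-amount case.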
4763 }
4764
4765 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4766 }
4767
4768 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4769 TTI::TargetCostKind CostKind,
4770 unsigned Index, Value *Op0,
4771 Value *Op1) {
4772 static const CostTblEntry SLMCostTbl[] = {
4773 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4774 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4775 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4776 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4777 };
4778
4779 assert(Val->isVectorTy() && "This must be a vector type");
4780 Type *ScalarType = Val->getScalarType();
4781 InstructionCost RegisterFileMoveCost = 0;
4782
4783 // Non-immediate extraction/insertion can be handled as a sequence of
4784 // aliased loads+stores via the stack.
4785 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4786 Opcode == Instruction::InsertElement)) {
4787 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4788 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4789
4790 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4791 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4792 Align VecAlign = DL.getPrefTypeAlign(Val);
4793 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4794
4795 // Extract - store vector to stack, load scalar.
4796 if (Opcode == Instruction::ExtractElement) {
4797 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4798 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4799 CostKind);
4800 }
4801 // Insert - store vector to stack, store scalar, load vector.
4802 if (Opcode == Instruction::InsertElement) {
4803 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4804 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4805 CostKind) +
4806 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4807 }
4808 }
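// Illustrative note: a variable-index extract from a v4f32 is costed as
// one 16-byte vector store plus one 4-byte scalar load from the stack
// slot; the matching insert additionally reloads the whole vector.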
4809
4810 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4811 Opcode == Instruction::InsertElement)) {
4812 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4813 if (Opcode == Instruction::ExtractElement &&
4814 ScalarType->getScalarSizeInBits() == 1 &&
4815 cast<FixedVectorType>(Val)->getNumElements() > 1)
4816 return 1;
4817
4818 // Legalize the type.
4819 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4820
4821 // This type is legalized to a scalar type.
4822 if (!LT.second.isVector())
4823 return TTI::TCC_Free;
4824
4825 // The type may be split. Normalize the index to the new type.
4826 unsigned SizeInBits = LT.second.getSizeInBits();
4827 unsigned NumElts = LT.second.getVectorNumElements();
4828 unsigned SubNumElts = NumElts;
4829 Index = Index % NumElts;
4830
4831 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4832 // For inserts, we also need to insert the subvector back.
4833 if (SizeInBits > 128) {
4834 assert((SizeInBits % 128) == 0 && "Illegal vector");
4835 unsigned NumSubVecs = SizeInBits / 128;
4836 SubNumElts = NumElts / NumSubVecs;
4837 if (SubNumElts <= Index) {
4838 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4839 Index %= SubNumElts;
4840 }
4841 }
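// Illustrative note: extracting element 5 of a v8i32 under AVX2 pays one
// RegisterFileMoveCost unit for the vextracti128 of the upper half, after
// which Index is renormalized to 1 within that 128-bit subvector.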
4842
4843 MVT MScalarTy = LT.second.getScalarType();
4844 auto IsCheapPInsrPExtrInsertPS = [&]() {
4845 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4846 // Inserting f32 into index0 is just movss.
4847 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4848 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4849 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4850 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4851 Opcode == Instruction::InsertElement) ||
4852 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4853 Opcode == Instruction::InsertElement);
4854 };
4855
4856 if (Index == 0) {
4857 // Floating point scalars are already located in index #0.
4858 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4859 // true for all.
4860 if (ScalarType->isFloatingPointTy() &&
4861 (Opcode != Instruction::InsertElement || !Op0 ||
4862 isa<UndefValue>(Op0)))
4863 return RegisterFileMoveCost;
4864
4865 if (Opcode == Instruction::InsertElement &&
4866 isa_and_nonnull<UndefValue>(Op0)) {
4867 // Consider the gather cost to be cheap.
4868 if (isa_and_nonnull<LoadInst>(Op1))
4869 return RegisterFileMoveCost;
4870 if (!IsCheapPInsrPExtrInsertPS()) {
4871 // mov constant-to-GPR + movd/movq GPR -> XMM.
4872 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4873 return 2 + RegisterFileMoveCost;
4874 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4875 return 1 + RegisterFileMoveCost;
4876 }
4877 }
4878
4879 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4880 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4881 return 1 + RegisterFileMoveCost;
4882 }
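// Illustrative note: extractelement <4 x i32> %v, i32 0 is a single MOVD
// (cost 1 plus any subvector move), while an f32 insert into lane 0 of an
// undef vector is just a MOVSS and costs only the subvector moves above.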
4883
4884 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4885 assert(ISD && "Unexpected vector opcode");
4886 if (ST->useSLMArithCosts())
4887 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4888 return Entry->Cost + RegisterFileMoveCost;
4889
4890 // Consider cheap cases.
4891 if (IsCheapPInsrPExtrInsertPS())
4892 return 1 + RegisterFileMoveCost;
4893
4894 // For extractions we just need to shuffle the element to index 0, which
4895 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4896 // the elements to its destination. In both cases we must handle the
4897 // subvector move(s).
4898 // If the vector type is already less than 128-bits then don't reduce it.
4899 // TODO: Under what circumstances should we shuffle using the full width?
4900 InstructionCost ShuffleCost = 1;
4901 if (Opcode == Instruction::InsertElement) {
4902 auto *SubTy = cast<VectorType>(Val);
4903 EVT VT = TLI->getValueType(DL, Val);
4904 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4905 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4906 ShuffleCost =
4907 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4908 }
4909 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4910 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4911 }
4912
4913 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4914 RegisterFileMoveCost;
4915}
4916
4917 InstructionCost X86TTIImpl::getScalarizationOverhead(
4918 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4919 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
4920 assert(DemandedElts.getBitWidth() ==
4921 cast<FixedVectorType>(Ty)->getNumElements() &&
4922 "Vector size mismatch");
4923
4924 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4925 MVT MScalarTy = LT.second.getScalarType();
4926 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4927 InstructionCost Cost = 0;
4928 
4929 constexpr unsigned LaneBitWidth = 128;
4930 assert((LegalVectorBitWidth < LaneBitWidth ||
4931 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4932 "Illegal vector");
4933
4934 const int NumLegalVectors = *LT.first.getValue();
4935 assert(NumLegalVectors >= 0 && "Negative cost!");
4936
4937 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4938 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4939 if (Insert) {
4940 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4941 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4942 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4943 // For types we can insert directly, insertion into 128-bit sub vectors is
4944 // cheap, followed by a cheap chain of concatenations.
4945 if (LegalVectorBitWidth <= LaneBitWidth) {
4946 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4947 /*Extract*/ false, CostKind);
4948 } else {
4949 // In each 128-bit lane, if at least one index is demanded but not all
4950 // indices are demanded, and this 128-bit lane is not the first lane of
4951 // the legalized vector, then this lane needs an extracti128. If each
4952 // 128-bit lane has at least one demanded index, that lane also needs
4953 // an inserti128.
4954
4955 // The following cases help build a better understanding. Assume we
4956 // insert several elements into a v8i32 vector with AVX2:
4957 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
4958 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
4959 // inserti128.
4960 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4961 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4962 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4963 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4964 unsigned NumLegalElts =
4965 LT.second.getVectorNumElements() * NumLegalVectors;
4966 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4967 "Vector has been legalized to smaller element count");
4968 assert((NumLegalElts % NumLanesTotal) == 0 &&
4969 "Unexpected elts per lane");
4970 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4971
4972 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4973 auto *LaneTy =
4974 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4975
4976 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4977 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4978 NumEltsPerLane, NumEltsPerLane * I);
4979 if (LaneEltMask.isZero())
4980 continue;
4981 // FIXME: we don't need to extract if all non-demanded elements
4982 // are legalization-inserted padding.
4983 if (!LaneEltMask.isAllOnes())
4984 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4985 I * NumEltsPerLane, LaneTy);
4986 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4987 /*Extract*/ false, CostKind);
4988 }
4989
4990 APInt AffectedLanes =
4991 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4992 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4993 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4994 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4995 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4996 unsigned I = NumLegalLanes * LegalVec + Lane;
4997 // No need to insert unaffected lane; or lane 0 of each legal vector
4998 // iff ALL lanes of that vector were affected and will be inserted.
4999 if (!AffectedLanes[I] ||
5000 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5001 continue;
5002 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
5003 I * NumEltsPerLane, LaneTy);
5004 }
5005 }
5006 }
5007 } else if (LT.second.isVector()) {
5008 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5009 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5010 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5011 // considered cheap.
5012 if (Ty->isIntOrIntVectorTy())
5013 Cost += DemandedElts.popcount();
5014
5015 // Get the smaller of the legalized or original pow2-extended number of
5016 // vector elements, which represents the number of unpacks we'll end up
5017 // performing.
5018 unsigned NumElts = LT.second.getVectorNumElements();
5019 unsigned Pow2Elts =
5020 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5021 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5022 }
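// Illustrative note: by this formula, building a fully demanded v16i8
// without SSE4.1 costs 16 (one MOVD-class op per demanded element) plus
// (16 - 1) = 15 unpack/concat steps when LT.first == 1.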
5023 }
5024
5025 if (Extract) {
5026 // vXi1 can be efficiently extracted with MOVMSK.
5027 // TODO: AVX512 predicate mask handling.
5028 // NOTE: This doesn't work well for roundtrip scalarization.
5029 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5030 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5031 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5032 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5033 return MOVMSKCost;
5034 }
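// Illustrative note: extracting all lanes of a v32i1 needs one VPMOVMSKB
// under AVX2 (MaxElts == 32) but ceil(32 / 16) == 2 PMOVMSKB ops under
// SSE2, matching the MOVMSKCost computation above.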
5035
5036 if (LT.second.isVector()) {
5037 unsigned NumLegalElts =
5038 LT.second.getVectorNumElements() * NumLegalVectors;
5039 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5040 "Vector has been legalized to smaller element count");
5041
5042 // If we're extracting elements from a 128-bit subvector lane,
5043 // we only need to extract each lane once, not for every element.
5044 if (LegalVectorBitWidth > LaneBitWidth) {
5045 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5046 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5047 assert((NumLegalElts % NumLanesTotal) == 0 &&
5048 "Unexpected elts per lane");
5049 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5050
5051 // Add cost for each demanded 128-bit subvector extraction.
5052 // Luckily this is a lot easier than for insertion.
5053 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5054 auto *LaneTy =
5055 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5056
5057 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5058 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5059 NumEltsPerLane, I * NumEltsPerLane);
5060 if (LaneEltMask.isZero())
5061 continue;
5062 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5063 I * NumEltsPerLane, LaneTy);
5064 Cost += BaseT::getScalarizationOverhead(
5065 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5066 }
5067
5068 return Cost;
5069 }
5070 }
5071
5072 // Fallback to default extraction.
5073 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5074 Extract, CostKind);
5075 }
5076
5077 return Cost;
5078}
5079
5080 InstructionCost
5081 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5082 int VF, const APInt &DemandedDstElts,
5083 TTI::TargetCostKind CostKind) {
5084 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5085 // We don't differentiate element types here, only element bit width.
5086 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5087
5088 auto bailout = [&]() {
5089 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5090 DemandedDstElts, CostKind);
5091 };
5092
5093 // For now, only deal with AVX512 cases.
5094 if (!ST->hasAVX512())
5095 return bailout();
5096
5097 // Do we have a native shuffle for this element type, or should we promote?
5098 unsigned PromEltTyBits = EltTyBits;
5099 switch (EltTyBits) {
5100 case 32:
5101 case 64:
5102 break; // AVX512F.
5103 case 16:
5104 if (!ST->hasBWI())
5105 PromEltTyBits = 32; // promote to i32, AVX512F.
5106 break; // AVX512BW
5107 case 8:
5108 if (!ST->hasVBMI())
5109 PromEltTyBits = 32; // promote to i32, AVX512F.
5110 break; // AVX512VBMI
5111 case 1:
5112 // There is no support for shuffling i1 elements. We *must* promote.
5113 if (ST->hasBWI()) {
5114 if (ST->hasVBMI())
5115 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5116 else
5117 PromEltTyBits = 16; // promote to i16, AVX512BW.
5118 break;
5119 }
5120 PromEltTyBits = 32; // promote to i32, AVX512F.
5121 break;
5122 default:
5123 return bailout();
5124 }
5125 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5126
5127 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5128 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5129
5130 int NumDstElements = VF * ReplicationFactor;
5131 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5132 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5133
5134 // Legalize the types.
5135 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5136 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5137 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5138 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5139 // They should have legalized into vector types.
5140 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5141 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5142 return bailout();
5143
5144 if (PromEltTyBits != EltTyBits) {
5145 // If we have to perform the shuffle with wider elt type than our data type,
5146 // then we will first need to anyext (we don't care about the new bits)
5147 // the source elements, and then truncate Dst elements.
5148 InstructionCost PromotionCost;
5149 PromotionCost += getCastInstrCost(
5150 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5151 TargetTransformInfo::CastContextHint::None, CostKind);
5152 PromotionCost +=
5153 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5154 /*Src=*/PromDstVecTy,
5155 TargetTransformInfo::CastContextHint::None, CostKind);
5156 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5157 ReplicationFactor, VF,
5158 DemandedDstElts, CostKind);
5159 }
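// Illustrative note: on an AVX512F-only target, replicating i8 elements
// promotes to i32: the source is extended to an i32 vector, the
// replication shuffle runs on i32 lanes, and the widened result is
// truncated back to i8.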
5160
5161 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5162 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5163 "We expect that the legalization doesn't affect the element width, "
5164 "doesn't coalesce/split elements.");
5165
5166 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5167 unsigned NumDstVectors =
5168 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5169
5170 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5171
5172 // Not all the produced Dst elements may be demanded. In our case,
5173 // given that a single Dst vector is formed by a single shuffle,
5174 // if all elements that will form a single Dst vector aren't demanded,
5175 // then we won't need to do that shuffle, so adjust the cost accordingly.
5176 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5177 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5178 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5179
5180 InstructionCost SingleShuffleCost = getShuffleCost(
5181 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5182 /*Index=*/0, /*SubTp=*/nullptr);
5183 return NumDstVectorsDemanded * SingleShuffleCost;
5184}
5185
5186 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5187 MaybeAlign Alignment,
5188 unsigned AddressSpace,
5189 TTI::TargetCostKind CostKind,
5190 TTI::OperandValueInfo OpInfo,
5191 const Instruction *I) {
5192 // TODO: Handle other cost kinds.
5193 if (CostKind != TTI::TCK_RecipThroughput) {
5194 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5195 // Store instruction with index and scale costs 2 Uops.
5196 // Check the preceding GEP to identify non-const indices.
5197 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5198 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5199 return TTI::TCC_Basic * 2;
5200 }
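// Illustrative note: a store addressed through a GEP with a non-constant
// index typically needs a scaled-index addressing mode (e.g.
// mov [rax + rcx*4], ebx), which several Intel cores count as 2 uops,
// hence the doubled cost above.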
5201 }
5202 return TTI::TCC_Basic;
5203 }
5204
5205 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5206 "Invalid Opcode");
5207 // Type legalization can't handle structs
5208 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5209 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5210 CostKind, OpInfo, I);
5211
5212 // Legalize the type.
5213 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5214
5215 auto *VTy = dyn_cast<FixedVectorType>(Src);
5216
5216 
5217 InstructionCost Cost = 0;
5218 
5219 // Add a cost for constant load to vector.
5220 if (Opcode == Instruction::Store && OpInfo.isConstant())
5221 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5222 /*AddressSpace=*/0, CostKind, OpInfo);
5223
5224 // Handle the simple case of non-vectors.
5225 // NOTE: this assumes that legalization never creates a vector from scalars!
5226 if (!VTy || !LT.second.isVector()) {
5227 // Each load/store unit costs 1.
5228 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5229 }
5230
5231 bool IsLoad = Opcode == Instruction::Load;
5232
5233 Type *EltTy = VTy->getElementType();
5234
5235 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5236
5237 // Source of truth: how many elements were there in the original IR vector?
5238 const unsigned SrcNumElt = VTy->getNumElements();
5239
5240 // How far have we gotten?
5241 int NumEltRemaining = SrcNumElt;
5242 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5243 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5244
5245 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5246
5247 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5248 const unsigned XMMBits = 128;
5249 if (XMMBits % EltTyBits != 0)
5250 // Vector size must be a multiple of the element size. I.e. no padding.
5251 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5252 CostKind, OpInfo, I);
5253 const int NumEltPerXMM = XMMBits / EltTyBits;
5254
5255 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5256
5257 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5258 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5259 // How many elements would a single op deal with at once?
5260 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5261 // Vector size must be a multiple of the element size. I.e. no padding.
5262 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5263 CostKind, OpInfo, I);
5264 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5265
5266 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5267 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5268 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5269 "Unless we haven't halved the op size yet, "
5270 "we have less than two op's sized units of work left.");
5271
5272 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5273 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5274 : XMMVecTy;
5275
5276 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5277 "After halving sizes, the vector elt count is no longer a multiple "
5278 "of number of elements per operation?");
5279 auto *CoalescedVecTy =
5280 CurrNumEltPerOp == 1
5281 ? CurrVecTy
5282 : FixedVectorType::get(
5283 IntegerType::get(Src->getContext(),
5284 EltTyBits * CurrNumEltPerOp),
5285 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5286 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5287 DL.getTypeSizeInBits(CurrVecTy) &&
5288 "coalesciing elements doesn't change vector width.");
5289
5290 while (NumEltRemaining > 0) {
5291 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5292
5293 // Can we use this vector size, as per the remaining element count?
5294 // Iff the vector is naturally aligned, we can do a wide load regardless.
5295 if (NumEltRemaining < CurrNumEltPerOp &&
5296 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5297 CurrOpSizeBytes != 1)
5298 break; // Try a smaller vector size.
5299
5300 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5301 // as a proxy for a double-pumped AVX memory interface such as on
5302 // Sandybridge.
5303 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5304 // will be scalarized.
5305 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5306 Cost += 2;
5307 else if (CurrOpSizeBytes < 4)
5308 Cost += 2;
5309 else
5310 Cost += 1;
5311
5312 // If we're loading a uniform value, then we don't need to split the load;
5313 // a single (widest) vector load can be reused by all the splits.
5314 if (IsLoad && OpInfo.isUniform())
5315 return Cost;
5316
5317 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5318
5319 // If we have fully processed the previous reg, we need to replenish it.
5320 if (SubVecEltsLeft == 0) {
5321 SubVecEltsLeft += CurrVecTy->getNumElements();
5322 // And that's free only for the 0'th subvector of a legalized vector.
5323 if (!Is0thSubVec)
5324 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5325 : TTI::ShuffleKind::SK_ExtractSubvector,
5326 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5327 }
5328
5329 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5330 // for smaller widths (32/16/8) we have to insert/extract them separately.
5331 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5332 // but let's pretend that it is also true for 16/8 bit wide ops...)
5333 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5334 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5335 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5336 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5337 APInt DemandedElts =
5338 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5339 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5340 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5341 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5342 !IsLoad, CostKind);
5343 }
5344
5345 SubVecEltsLeft -= CurrNumEltPerOp;
5346 NumEltRemaining -= CurrNumEltPerOp;
5347 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5348 }
5349 }
5350
5351 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5352
5353 return Cost;
5354}
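// Illustrative note: a v3i32 store walks the loop above as one 8-byte op
// plus one 4-byte op; the 4-byte tail is not the 0th subregister of its
// XMM, so it also pays the scalarization (extract) overhead added above.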
5355
5356 InstructionCost
5357 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5358 unsigned AddressSpace,
5359 TTI::TargetCostKind CostKind) {
5360 bool IsLoad = (Instruction::Load == Opcode);
5361 bool IsStore = (Instruction::Store == Opcode);
5362
5363 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5364 if (!SrcVTy)
5365 // To calculate scalar take the regular cost, without mask
5366 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5367
5368 unsigned NumElem = SrcVTy->getNumElements();
5369 auto *MaskTy =
5370 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5371 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5372 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5373 // Scalarization
5374 APInt DemandedElts = APInt::getAllOnes(NumElem);
5375 InstructionCost MaskSplitCost = getScalarizationOverhead(
5376 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5377 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5378 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5379 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5380 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5381 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5382 InstructionCost ValueSplitCost = getScalarizationOverhead(
5383 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5384 InstructionCost MemopCost =
5385 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5386 Alignment, AddressSpace, CostKind);
5387 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5388 }
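// Illustrative note: an illegal masked load such as v8f32 on an SSE-only
// target is costed as 8 mask extracts + 8 scalar compares + 8 branches +
// 8 scalar loads, plus the insert overhead of rebuilding the vector.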
5389
5390 // Legalize the type.
5391 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5392 auto VT = TLI->getValueType(DL, SrcVTy);
5393 InstructionCost Cost = 0;
5394 MVT Ty = LT.second;
5395 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5396 // APX masked load/store for scalar is cheap.
5397 return Cost + LT.first;
5398
5399 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5400 LT.second.getVectorNumElements() == NumElem)
5401 // Promotion requires extend/truncate for data and a shuffle for mask.
5402 Cost +=
5403 getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5404 nullptr) +
5405 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5406
5407 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5408 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5409 (unsigned)LT.first.getValue() * Ty.getVectorNumElements());
5410 // Expanding requires filling the mask with zeroes.
5411 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5412 MaskTy);
5413 }
5414
5415 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5416 if (!ST->hasAVX512())
5417 return Cost + LT.first * (IsLoad ? 2 : 8);
5418
5419 // AVX-512 masked load/store is cheaper
5420 return Cost + LT.first;
5421}
5422
5423 InstructionCost
5424 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5425 const Value *Base,
5426 const TTI::PointersChainInfo &Info,
5427 Type *AccessTy, TTI::TargetCostKind CostKind) {
5428 if (Info.isSameBase() && Info.isKnownStride()) {
5429 // If all the pointers have known stride all the differences are translated
5430 // into constants. X86 memory addressing allows encoding it into
5431 // displacement. So we just need to take the base GEP cost.
5432 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5433 SmallVector<const Value *> Indices(BaseGEP->indices());
5434 return getGEPCost(BaseGEP->getSourceElementType(),
5435 BaseGEP->getPointerOperand(), Indices, nullptr,
5436 CostKind);
5437 }
5438 return TTI::TCC_Free;
5439 }
5440 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5441}
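// Illustrative note: for GEPs sharing one base with known constant
// strides, each access folds its offset into a displacement (e.g.
// mov eax, [rbx + 16]), so only the base GEP contributes any cost.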
5442
5443 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5444 ScalarEvolution *SE,
5445 const SCEV *Ptr) {
5446 // Address computations in vectorized code with non-consecutive addresses will
5447 // likely result in more instructions compared to scalar code where the
5448 // computation can more often be merged into the index mode. The resulting
5449 // extra micro-ops can significantly decrease throughput.
5450 const unsigned NumVectorInstToHideOverhead = 10;
5451
5452 // Cost modeling of Strided Access Computation is hidden by the indexing
5453 // modes of X86 regardless of the stride value. We don't believe that there
5454 // is a difference between constant strided access in general and a constant
5455 // stride value which is less than or equal to 64.
5456 // Even in the case of (loop invariant) stride whose value is not known at
5457 // compile time, the address computation will not incur more than one extra
5458 // ADD instruction.
5459 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5460 // TODO: AVX2 is the current cut-off because we don't have correct
5461 // interleaving costs for prior ISA's.
5462 if (!BaseT::isStridedAccess(Ptr))
5463 return NumVectorInstToHideOverhead;
5464 if (!BaseT::getConstantStrideStep(SE, Ptr))
5465 return 1;
5466 }
5467
5468 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5469}
5470
5471 InstructionCost
5472 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5473 std::optional<FastMathFlags> FMF,
5474 TTI::TargetCostKind CostKind) {
5475 if (TTI::requiresOrderedReduction(FMF))
5476 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5477
5478 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5479 // throughput and use that as the cost.
5480
5481 static const CostTblEntry SLMCostTbl[] = {
5482 { ISD::FADD, MVT::v2f64, 3 },
5483 { ISD::ADD, MVT::v2i64, 5 },
5484 };
5485
5486 static const CostTblEntry SSE2CostTbl[] = {
5487 { ISD::FADD, MVT::v2f64, 2 },
5488 { ISD::FADD, MVT::v2f32, 2 },
5489 { ISD::FADD, MVT::v4f32, 4 },
5490 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5491 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5492 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5493 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5494 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5495 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5496 { ISD::ADD, MVT::v2i8, 2 },
5497 { ISD::ADD, MVT::v4i8, 2 },
5498 { ISD::ADD, MVT::v8i8, 2 },
5499 { ISD::ADD, MVT::v16i8, 3 },
5500 };
5501
5502 static const CostTblEntry AVX1CostTbl[] = {
5503 { ISD::FADD, MVT::v4f64, 3 },
5504 { ISD::FADD, MVT::v4f32, 3 },
5505 { ISD::FADD, MVT::v8f32, 4 },
5506 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5507 { ISD::ADD, MVT::v4i64, 3 },
5508 { ISD::ADD, MVT::v8i32, 5 },
5509 { ISD::ADD, MVT::v16i16, 5 },
5510 { ISD::ADD, MVT::v32i8, 4 },
5511 };
5512
5513 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5514 assert(ISD && "Invalid opcode");
5515
5516 // Before legalizing the type, give a chance to look up illegal narrow types
5517 // in the table.
5518 // FIXME: Is there a better way to do this?
5519 EVT VT = TLI->getValueType(DL, ValTy);
5520 if (VT.isSimple()) {
5521 MVT MTy = VT.getSimpleVT();
5522 if (ST->useSLMArithCosts())
5523 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5524 return Entry->Cost;
5525
5526 if (ST->hasAVX())
5527 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5528 return Entry->Cost;
5529
5530 if (ST->hasSSE2())
5531 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5532 return Entry->Cost;
5533 }
5534
5535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5536
5537 MVT MTy = LT.second;
5538
5539 auto *ValVTy = cast<FixedVectorType>(ValTy);
5540
5541 // Special case: vXi8 mul reductions are performed as vXi16.
5542 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5543 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5544 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5545 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5546 TargetTransformInfo::CastContextHint::None,
5547 CostKind) +
5548 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5549 }
5550
5551 InstructionCost ArithmeticCost = 0;
5552 if (LT.first != 1 && MTy.isVector() &&
5553 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5554 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5555 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5556 MTy.getVectorNumElements());
5557 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5558 ArithmeticCost *= LT.first - 1;
5559 }
5560
5561 if (ST->useSLMArithCosts())
5562 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5563 return ArithmeticCost + Entry->Cost;
5564
5565 if (ST->hasAVX())
5566 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5567 return ArithmeticCost + Entry->Cost;
5568
5569 if (ST->hasSSE2())
5570 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5571 return ArithmeticCost + Entry->Cost;
5572
5573 // FIXME: These assume a naive kshift+binop lowering, which is probably
5574 // conservative in most cases.
5575 static const CostTblEntry AVX512BoolReduction[] = {
5576 { ISD::AND, MVT::v2i1, 3 },
5577 { ISD::AND, MVT::v4i1, 5 },
5578 { ISD::AND, MVT::v8i1, 7 },
5579 { ISD::AND, MVT::v16i1, 9 },
5580 { ISD::AND, MVT::v32i1, 11 },
5581 { ISD::AND, MVT::v64i1, 13 },
5582 { ISD::OR, MVT::v2i1, 3 },
5583 { ISD::OR, MVT::v4i1, 5 },
5584 { ISD::OR, MVT::v8i1, 7 },
5585 { ISD::OR, MVT::v16i1, 9 },
5586 { ISD::OR, MVT::v32i1, 11 },
5587 { ISD::OR, MVT::v64i1, 13 },
5588 };
5589
5590 static const CostTblEntry AVX2BoolReduction[] = {
5591 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5592 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5593 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5594 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5595 };
5596
5597 static const CostTblEntry AVX1BoolReduction[] = {
5598 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5599 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5600 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5601 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5602 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5603 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5604 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5605 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5606 };
5607
5608 static const CostTblEntry SSE2BoolReduction[] = {
5609 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5610 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5611 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5612 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5613 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5614 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5615 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5616 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5617 };
5618
5619 // Handle bool allof/anyof patterns.
5620 if (ValVTy->getElementType()->isIntegerTy(1)) {
5621 InstructionCost ArithmeticCost = 0;
5622 if (LT.first != 1 && MTy.isVector() &&
5623 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5624 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5625 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5626 MTy.getVectorNumElements());
5627 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5628 ArithmeticCost *= LT.first - 1;
5629 }
5630
5631 if (ST->hasAVX512())
5632 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5633 return ArithmeticCost + Entry->Cost;
5634 if (ST->hasAVX2())
5635 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5636 return ArithmeticCost + Entry->Cost;
5637 if (ST->hasAVX())
5638 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5639 return ArithmeticCost + Entry->Cost;
5640 if (ST->hasSSE2())
5641 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5642 return ArithmeticCost + Entry->Cost;
5643
5644 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5645 }
5646
5647 unsigned NumVecElts = ValVTy->getNumElements();
5648 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5649
5650 // Special case power of 2 reductions where the scalar type isn't changed
5651 // by type legalization.
5652 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5653 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5654
5655 InstructionCost ReductionCost = 0;
5656
5657 auto *Ty = ValVTy;
5658 if (LT.first != 1 && MTy.isVector() &&
5659 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5660 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5661 Ty = FixedVectorType::get(ValVTy->getElementType(),
5662 MTy.getVectorNumElements());
5663 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5664 ReductionCost *= LT.first - 1;
5665 NumVecElts = MTy.getVectorNumElements();
5666 }
5667
5668 // Now handle reduction with the legal type, taking into account size changes
5669 // at each level.
5670 while (NumVecElts > 1) {
5671 // Determine the size of the remaining vector we need to reduce.
5672 unsigned Size = NumVecElts * ScalarSize;
5673 NumVecElts /= 2;
5674 // If we're reducing from 256/512 bits, use an extract_subvector.
5675 if (Size > 128) {
5676 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5677 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5678 CostKind, NumVecElts, SubTy);
5679 Ty = SubTy;
5680 } else if (Size == 128) {
5681 // Reducing from 128 bits is a permute of v2f64/v2i64.
5682 FixedVectorType *ShufTy;
5683 if (ValVTy->isFloatingPointTy())
5684 ShufTy =
5685 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5686 else
5687 ShufTy =
5688 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5689 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5690 CostKind, 0, nullptr);
5691 } else if (Size == 64) {
5692 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5693 FixedVectorType *ShufTy;
5694 if (ValVTy->isFloatingPointTy())
5695 ShufTy =
5696 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5697 else
5698 ShufTy =
5699 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5700 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5701 CostKind, 0, nullptr);
5702 } else {
5703 // Reducing from smaller size is a shift by immediate.
5704 auto *ShiftTy = FixedVectorType::get(
5705 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5706 ReductionCost += getArithmeticInstrCost(
5707 Instruction::LShr, ShiftTy, CostKind,
5708 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5709 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5710 }
5711
5712 // Add the arithmetic op for this level.
5713 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5714 }
5715
5716 // Add the final extract element to the cost.
5717 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5718 CostKind, 0, nullptr, nullptr);
5719}
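// Illustrative note: a v8f32 fadd reduction on AVX walks the ladder above:
// vextractf128 + add (256 -> 128 bits), then two in-register shuffles each
// followed by an add, and a final extractelement of lane 0.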
5720
5721 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5722 TTI::TargetCostKind CostKind,
5723 FastMathFlags FMF) {
5724 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5725 return getIntrinsicInstrCost(ICA, CostKind);
5726}
5727
5728 InstructionCost
5729 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5730 FastMathFlags FMF,
5731 TTI::TargetCostKind CostKind) {
5732 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5733
5734 MVT MTy = LT.second;
5735
5736 int ISD;
5737 if (ValTy->isIntOrIntVectorTy()) {
5738 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5739 : ISD::SMIN;
5740 } else {
5741 assert(ValTy->isFPOrFPVectorTy() &&
5742 "Expected float point or integer vector type.");
5743 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5744 ? ISD::FMINNUM
5745 : ISD::FMINIMUM;
5746 }
5747
5748 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5749 // throughput and use that as the cost.
5750
5751 static const CostTblEntry SSE2CostTbl[] = {
5752 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5753 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5754 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5755 };
5756
5757 static const CostTblEntry SSE41CostTbl[] = {
5758 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5759 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5760 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5761 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5762 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5763 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5764 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5765 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5766 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5767 {ISD::SMIN, MVT::v16i8, 6},
5768 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5769 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5770 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5771 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5772 };
5773
5774 static const CostTblEntry AVX1CostTbl[] = {
5775 {ISD::SMIN, MVT::v16i16, 6},
5776 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5777 {ISD::SMIN, MVT::v32i8, 8},
5778 {ISD::UMIN, MVT::v32i8, 8},
5779 };
5780
5781 static const CostTblEntry AVX512BWCostTbl[] = {
5782 {ISD::SMIN, MVT::v32i16, 8},
5783 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5784 {ISD::SMIN, MVT::v64i8, 10},
5785 {ISD::UMIN, MVT::v64i8, 10},
5786 };
5787
5788 // Before legalizing the type, give a chance to look up illegal narrow types
5789 // in the table.
5790 // FIXME: Is there a better way to do this?
5791 EVT VT = TLI->getValueType(DL, ValTy);
5792 if (VT.isSimple()) {
5793 MVT MTy = VT.getSimpleVT();
5794 if (ST->hasBWI())
5795 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5796 return Entry->Cost;
5797
5798 if (ST->hasAVX())
5799 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5800 return Entry->Cost;
5801
5802 if (ST->hasSSE41())
5803 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5804 return Entry->Cost;
5805
5806 if (ST->hasSSE2())
5807 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5808 return Entry->Cost;
5809 }
5810
5811 auto *ValVTy = cast<FixedVectorType>(ValTy);
5812 unsigned NumVecElts = ValVTy->getNumElements();
5813
5814 auto *Ty = ValVTy;
5815 InstructionCost MinMaxCost = 0;
5816 if (LT.first != 1 && MTy.isVector() &&
5817 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5818 // Type needs to be split. We need LT.first - 1 operations.
5819 Ty = FixedVectorType::get(ValVTy->getElementType(),
5820 MTy.getVectorNumElements());
5821 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5822 MinMaxCost *= LT.first - 1;
5823 NumVecElts = MTy.getVectorNumElements();
5824 }
5825
5826 if (ST->hasBWI())
5827 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5828 return MinMaxCost + Entry->Cost;
5829
5830 if (ST->hasAVX())
5831 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5832 return MinMaxCost + Entry->Cost;
5833
5834 if (ST->hasSSE41())
5835 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5836 return MinMaxCost + Entry->Cost;
5837
5838 if (ST->hasSSE2())
5839 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5840 return MinMaxCost + Entry->Cost;
5841
5842 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5843
5844 // Special case power of 2 reductions where the scalar type isn't changed
5845 // by type legalization.
5846 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5847 ScalarSize != MTy.getScalarSizeInBits())
5848 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5849
5850 // Now handle reduction with the legal type, taking into account size changes
5851 // at each level.
5852 while (NumVecElts > 1) {
5853 // Determine the size of the remaining vector we need to reduce.
5854 unsigned Size = NumVecElts * ScalarSize;
5855 NumVecElts /= 2;
5856 // If we're reducing from 256/512 bits, use an extract_subvector.
5857 if (Size > 128) {
5858 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5859 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5860 NumVecElts, SubTy);
5861 Ty = SubTy;
5862 } else if (Size == 128) {
5863 // Reducing from 128 bits is a permute of v2f64/v2i64.
5864 VectorType *ShufTy;
5865 if (ValTy->isFloatingPointTy())
5866 ShufTy =
5867 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5868 else
5869 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5870 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5871 CostKind, 0, nullptr);
5872 } else if (Size == 64) {
5873 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5874 FixedVectorType *ShufTy;
5875 if (ValTy->isFloatingPointTy())
5876 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5877 else
5878 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5879 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5880 CostKind, 0, nullptr);
5881 } else {
5882 // Reducing from smaller size is a shift by immediate.
5883 auto *ShiftTy = FixedVectorType::get(
5884 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5885 MinMaxCost += getArithmeticInstrCost(
5886 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5887 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5888 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5889 }
5890
5891 // Add the arithmetic op for this level.
5892 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5893 }
5894
5895 // Add the final extract element to the cost.
5896 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5897 CostKind, 0, nullptr, nullptr);
5898}
5899
5900/// Calculate the cost of materializing a 64-bit value. This helper
5901/// method might only calculate a fraction of a larger immediate. Therefore it
5902/// is valid to return a cost of ZERO.
5903 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5904 if (Val == 0)
5905 return TTI::TCC_Free;
5906
5907 if (isInt<32>(Val))
5908 return TTI::TCC_Basic;
5909
5910 return 2 * TTI::TCC_Basic;
5911}
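// Illustrative note: materializing 0 is free (xor-zeroing), any value
// fitting a signed 32-bit immediate costs one TCC_Basic, and a constant
// like 0x100000000 requires a 10-byte MOVABS, charged 2 * TCC_Basic.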
5912
5913 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5914 TTI::TargetCostKind CostKind) {
5915 assert(Ty->isIntegerTy());
5916
5917 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5918 if (BitSize == 0)
5919 return ~0U;
5920
5921 // Never hoist constants larger than 128bit, because this might lead to
5922 // incorrect code generation or assertions in codegen.
5923 // FIXME: Create a cost model for types larger than i128 once the codegen
5924 // issues have been fixed.
5925 if (BitSize > 128)
5926 return TTI::TCC_Free;
5927
5928 if (Imm == 0)
5929 return TTI::TCC_Free;
5930
5931 // Sign-extend all constants to a multiple of 64-bit.
5932 APInt ImmVal = Imm;
5933 if (BitSize % 64 != 0)
5934 ImmVal = Imm.sext(alignTo(BitSize, 64));
5935
5936 // Split the constant into 64-bit chunks and calculate the cost for each
5937 // chunk.
5938 InstructionCost Cost = 0;
5939 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5940 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5941 int64_t Val = Tmp.getSExtValue();
5942 Cost += getIntImmCost(Val);
5943 }
5944 // We need at least one instruction to materialize the constant.
5945 return std::max<InstructionCost>(1, Cost);
5946}
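// Illustrative note: an i128 immediate is costed as two independent
// 64-bit chunks; chunks that are zero after the sign-extension are free,
// and the sum is clamped so at least one instruction is always counted.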
5947
5948 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5949 const APInt &Imm, Type *Ty,
5950 TTI::TargetCostKind CostKind,
5951 Instruction *Inst) {
5952 assert(Ty->isIntegerTy());
5953
5954 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5955 unsigned ImmBitWidth = Imm.getBitWidth();
5956
5957 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5958 // here, so that constant hoisting will ignore this constant.
5959 if (BitSize == 0)
5960 return TTI::TCC_Free;
5961
5962 unsigned ImmIdx = ~0U;
5963 switch (Opcode) {
5964 default:
5965 return TTI::TCC_Free;
5966 case Instruction::GetElementPtr:
5967 // Always hoist the base address of a GetElementPtr. This prevents the
5968 // creation of new constants for every base constant that gets constant
5969 // folded with the offset.
5970 if (Idx == 0)
5971 return 2 * TTI::TCC_Basic;
5972 return TTI::TCC_Free;
5973 case Instruction::Store:
5974 ImmIdx = 0;
5975 break;
5976 case Instruction::ICmp:
5977 // This is an imperfect hack to prevent constant hoisting of
5978 // compares that might be trying to check if a 64-bit value fits in
5979 // 32-bits. The backend can optimize these cases using a right shift by 32.
5980 // Ideally we would check the compare predicate here. There also other
5981 // similar immediates the backend can use shifts for.
5982 if (Idx == 1 && ImmBitWidth == 64) {
5983 uint64_t ImmVal = Imm.getZExtValue();
5984 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5985 return TTI::TCC_Free;
5986 }
5987 ImmIdx = 1;
5988 break;
5989 case Instruction::And:
5990 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5991 // by using a 32-bit operation with implicit zero extension. Detect such
5992 // immediates here as the normal path expects bit 31 to be sign extended.
5993 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5994 return TTI::TCC_Free;
5995 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5996 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5997 Imm.isMask())
5998 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5999 ImmIdx = 1;
6000 break;
6001 case Instruction::Add:
6002 case Instruction::Sub:
6003 // For add/sub, we can use the opposite instruction for INT32_MIN.
6004 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6005 return TTI::TCC_Free;
6006 ImmIdx = 1;
6007 break;
6008 case Instruction::UDiv:
6009 case Instruction::SDiv:
6010 case Instruction::URem:
6011 case Instruction::SRem:
6012 // Division by constant is typically expanded later into a different
6013 // instruction sequence. This completely changes the constants.
6014 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6015 return TTI::TCC_Free;
6016 case Instruction::Mul:
6017 case Instruction::Or:
6018 case Instruction::Xor:
6019 ImmIdx = 1;
6020 break;
6021 // Always return TCC_Free for the shift value of a shift instruction.
6022 case Instruction::Shl:
6023 case Instruction::LShr:
6024 case Instruction::AShr:
6025 if (Idx == 1)
6026 return TTI::TCC_Free;
6027 break;
6028 case Instruction::Trunc:
6029 case Instruction::ZExt:
6030 case Instruction::SExt:
6031 case Instruction::IntToPtr:
6032 case Instruction::PtrToInt:
6033 case Instruction::BitCast:
6034 case Instruction::PHI:
6035 case Instruction::Call:
6036 case Instruction::Select:
6037 case Instruction::Ret:
6038 case Instruction::Load:
6039 break;
6040 }
6041
6042 if (Idx == ImmIdx) {
6043 uint64_t NumConstants = divideCeil(BitSize, 64);
6044 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6045 return (Cost <= NumConstants * TTI::TCC_Basic)
6046 ? static_cast<int>(TTI::TCC_Free)
6047 : Cost;
6048 }
6049
6050 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6051}
6052
6053 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6054 const APInt &Imm, Type *Ty,
6055 TTI::TargetCostKind CostKind) {
6056 assert(Ty->isIntegerTy());
6057
6058 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6059 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6060 // here, so that constant hoisting will ignore this constant.
6061 if (BitSize == 0)
6062 return TTI::TCC_Free;
6063
6064 switch (IID) {
6065 default:
6066 return TTI::TCC_Free;
6067 case Intrinsic::sadd_with_overflow:
6068 case Intrinsic::uadd_with_overflow:
6069 case Intrinsic::ssub_with_overflow:
6070 case Intrinsic::usub_with_overflow:
6071 case Intrinsic::smul_with_overflow:
6072 case Intrinsic::umul_with_overflow:
6073 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6074 return TTI::TCC_Free;
6075 break;
6076 case Intrinsic::experimental_stackmap:
6077 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6078 return TTI::TCC_Free;
6079 break;
6080 case Intrinsic::experimental_patchpoint_void:
6081 case Intrinsic::experimental_patchpoint:
6082 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6083 return TTI::TCC_Free;
6084 break;
6085 }
6086 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6087}
6088
6089 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6090 TTI::TargetCostKind CostKind,
6091 const Instruction *I) {
6092 if (CostKind != TTI::TCK_RecipThroughput)
6093 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6094 // Branches are assumed to be predicted.
6095 return TTI::TCC_Free;
6096}
6097
6098int X86TTIImpl::getGatherOverhead() const {
6099 // Some CPUs have more overhead for gather. The specified overhead is relative
6100 // to the Load operation. "2" is the number provided by Intel architects. This
6101 // parameter is used for cost estimation of Gather Op and comparison with
6102 // other alternatives.
6103 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6104 // enable gather with a -march.
6105 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6106 return 2;
6107
6108 return 1024;
6109}
6110
6111int X86TTIImpl::getScatterOverhead() const {
6112 if (ST->hasAVX512())
6113 return 2;
6114
6115 return 1024;
6116}
6117
6118 // Return an average cost of a Gather / Scatter instruction; may be improved later.
6119InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6120 TTI::TargetCostKind CostKind,
6121 Type *SrcVTy, const Value *Ptr,
6122 Align Alignment,
6123 unsigned AddressSpace) {
6124
6125 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6126 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6127
6128 // Try to reduce the index size from 64 bit (the default for GEP) to 32 bit.
6129 // This is essential for VF 16: if the index can't be reduced to 32 bits, the
6130 // operation will use 16 x 64-bit indices, which do not fit in a zmm and need
6131 // to be split. Also check that the base pointer is the same for all lanes,
6132 // and that there's at most one variable index.
6133 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6134 unsigned IndexSize = DL.getPointerSizeInBits();
6135 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6136 if (IndexSize < 64 || !GEP)
6137 return IndexSize;
6138
6139 unsigned NumOfVarIndices = 0;
6140 const Value *Ptrs = GEP->getPointerOperand();
6141 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6142 return IndexSize;
6143 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6144 if (isa<Constant>(GEP->getOperand(I)))
6145 continue;
6146 Type *IndxTy = GEP->getOperand(I)->getType();
6147 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6148 IndxTy = IndexVTy->getElementType();
6149 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6150 !isa<SExtInst>(GEP->getOperand(I))) ||
6151 ++NumOfVarIndices > 1)
6152 return IndexSize; // 64
6153 }
6154 return (unsigned)32;
6155 };
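// Illustrative IR (assumed example, not from this file): a GEP whose only
// variable index is sign-extended from i32 allows 32-bit indices:
//   %idx = sext <16 x i32> %i to <16 x i64>
//   %gep = getelementptr float, ptr %base, <16 x i64> %idx
// getIndexSizeInBits() returns 32 here, so a VF=16 gather of floats can
// keep its indices in a single 512-bit register.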
6156
6157 // Try to reduce IndexSize to 32 bits for VF-16 vectors.
6158 // By default the IndexSize is equal to the pointer size.
6159 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6160 ? getIndexSizeInBits(Ptr, DL)
6161 : DL.getPointerSizeInBits();
6162
6163 auto *IndexVTy = FixedVectorType::get(
6164 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6165 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6166 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6167 InstructionCost::CostType SplitFactor =
6168 *std::max(IdxsLT.first, SrcLT.first).getValue();
6169 if (SplitFactor > 1) {
6170 // Handle splitting of vector of pointers
6171 auto *SplitSrcTy =
6172 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6173 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6174 Alignment, AddressSpace);
6175 }
6176
6177 // If we didn't split, this will be a single gather/scatter instruction.
6178 if (CostKind != TTI::TCK_RecipThroughput)
6179 return 1;
6180
6181 // The gather / scatter cost is given by Intel architects. It is a rough
6182 // number since we are looking at one instruction at a time.
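// Worked example (illustrative): for an 8-lane gather on a target where
// GSOverhead = 2 and a scalar load costs 1, the estimate below is
// 2 + 8 * 1 = 10.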
6183 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6184 : getScatterOverhead();
6185 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6186 MaybeAlign(Alignment), AddressSpace,
6187 CostKind);
6188}
6189
6190/// Calculate the cost of Gather / Scatter operation
6191 InstructionCost X86TTIImpl::getGatherScatterOpCost(
6192 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6193 Align Alignment, TTI::TargetCostKind CostKind,
6194 const Instruction *I = nullptr) {
6195 if ((Opcode == Instruction::Load &&
6196 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6197 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6198 Align(Alignment)))) ||
6199 (Opcode == Instruction::Store &&
6200 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6201 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6202 Align(Alignment)))))
6203 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6204 Alignment, CostKind, I);
6205
6206 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6207 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6208 if (!PtrTy && Ptr->getType()->isVectorTy())
6209 PtrTy = dyn_cast<PointerType>(
6210 cast<VectorType>(Ptr->getType())->getElementType());
6211 assert(PtrTy && "Unexpected type for Ptr argument");
6212 unsigned AddressSpace = PtrTy->getAddressSpace();
6213 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6214 AddressSpace);
6215}
6216
6217 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6218 const TargetTransformInfo::LSRCost &C2) {
6219 // X86 specific: the instruction count gets first priority.
6220 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6221 C1.NumIVMuls, C1.NumBaseAdds,
6222 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6223 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6224 C2.NumIVMuls, C2.NumBaseAdds,
6225 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6226}
6227
6228 bool X86TTIImpl::canMacroFuseCmp() {
6229 return ST->hasMacroFusion() || ST->hasBranchFusion();
6230}
6231
6232bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6233 Type *ScalarTy = DataTy->getScalarType();
6234
6235 // The backend can't handle a single element vector w/o CFCMOV.
6236 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6237 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6238
6239 if (!ST->hasAVX())
6240 return false;
6241
6242 if (ScalarTy->isPointerTy())
6243 return true;
6244
6245 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6246 return true;
6247
6248 if (ScalarTy->isHalfTy() && ST->hasBWI())
6249 return true;
6250
6251 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6252 return true;
6253
6254 if (!ScalarTy->isIntegerTy())
6255 return false;
6256
6257 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6258 return IntWidth == 32 || IntWidth == 64 ||
6259 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6260}
6261
6262bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6263 return isLegalMaskedLoad(DataType, Alignment);
6264}
6265
6266bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6267 unsigned DataSize = DL.getTypeStoreSize(DataType);
6268 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6269 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6270 // (the equivalent stores only require AVX).
6271 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6272 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6273
6274 return false;
6275}
6276
6277bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6278 unsigned DataSize = DL.getTypeStoreSize(DataType);
6279
6280 // SSE4A supports nontemporal stores of float and double at arbitrary
6281 // alignment.
6282 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6283 return true;
6284
6285 // Besides the SSE4A subtarget exception above, only aligned stores are
6286 // available nontemporally on any other subtarget. And only stores with a size
6287 // of 4..32 bytes (powers of 2 only) are permitted.
6288 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6289 !isPowerOf2_32(DataSize))
6290 return false;
6291
6292 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6293 // loads require AVX2).
6294 if (DataSize == 32)
6295 return ST->hasAVX();
6296 if (DataSize == 16)
6297 return ST->hasSSE1();
6298 return true;
6299}
6300
6301 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6302 ElementCount NumElements) const {
6303 // movddup
6304 return ST->hasSSE3() && !NumElements.isScalable() &&
6305 NumElements.getFixedValue() == 2 &&
6306 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6307}
6308
6309 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6310 if (!isa<VectorType>(DataTy))
6311 return false;
6312
6313 if (!ST->hasAVX512())
6314 return false;
6315
6316 // The backend can't handle a single element vector.
6317 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6318 return false;
6319
6320 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6321
6322 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6323 return true;
6324
6325 if (!ScalarTy->isIntegerTy())
6326 return false;
6327
6328 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6329 return IntWidth == 32 || IntWidth == 64 ||
6330 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6331}
6332
6333 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6334 return isLegalMaskedExpandLoad(DataTy, Alignment);
6335}
6336
6337bool X86TTIImpl::supportsGather() const {
6338 // Some CPUs have better gather performance than others.
6339 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6340 // enable gather with a -march.
6341 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6342}
6343
6344 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6345 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6346 // A vector-4 gather/scatter instruction does not exist on KNL. We could
6347 // extend it to 8 elements, but zeroing the upper bits of the mask vector
6348 // adds more instructions. Right now we give vector-4 the scalar cost on KNL.
6349 // TODO: Check whether the gather/scatter instruction is better in the
6350 // VariableMask case.
6351 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6352 return NumElts == 1 ||
6353 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6354}
6355
6356 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6357 Type *ScalarTy = DataTy->getScalarType();
6358 if (ScalarTy->isPointerTy())
6359 return true;
6360
6361 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6362 return true;
6363
6364 if (!ScalarTy->isIntegerTy())
6365 return false;
6366
6367 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6368 return IntWidth == 32 || IntWidth == 64;
6369}
6370
6371 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6372 if (!supportsGather() || !ST->preferGather())
6373 return false;
6374 return isLegalMaskedGatherScatter(DataTy, Alignment);
6375}
6376
6377bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6378 unsigned Opcode1,
6379 const SmallBitVector &OpcodeMask) const {
6380 // ADDSUBPS 4xf32 SSE3
6381 // VADDSUBPS 4xf32 AVX
6382 // VADDSUBPS 8xf32 AVX2
6383 // ADDSUBPD 2xf64 SSE3
6384 // VADDSUBPD 2xf64 AVX
6385 // VADDSUBPD 4xf64 AVX2
6386
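// Illustrative IR (assumed example, not from this file): the alternating
// pattern below is legal for Opcode0 = FSub, Opcode1 = FAdd and
// OpcodeMask = 0b1010 (FAdd in the odd lanes), and maps to (V)ADDSUBPS:
//   %sub = fsub <4 x float> %a, %b
//   %add = fadd <4 x float> %a, %b
//   %r = shufflevector <4 x float> %sub, <4 x float> %add,
//        <4 x i32> <i32 0, i32 5, i32 2, i32 7>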
6387 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6388 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6389 if (!isPowerOf2_32(NumElements))
6390 return false;
6391 // Check the opcode pattern. We apply the mask on the opcode arguments and
6392 // then check if it is what we expect.
6393 for (int Lane : seq<int>(0, NumElements)) {
6394 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6395 // We expect FSub for even lanes and FAdd for odd lanes.
6396 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6397 return false;
6398 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6399 return false;
6400 }
6401 // Now check that the pattern is supported by the target ISA.
6402 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6403 if (ElemTy->isFloatTy())
6404 return ST->hasSSE3() && NumElements % 4 == 0;
6405 if (ElemTy->isDoubleTy())
6406 return ST->hasSSE3() && NumElements % 2 == 0;
6407 return false;
6408}
6409
6410bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6411 // AVX2 doesn't support scatter
6412 if (!ST->hasAVX512() || !ST->preferScatter())
6413 return false;
6414 return isLegalMaskedGatherScatter(DataType, Alignment);
6415}
6416
6417bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6418 EVT VT = TLI->getValueType(DL, DataType);
6419 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6420}
6421
6422 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6423 // FDIV is always expensive, even if it has a very low uop count.
6424 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6425 if (I->getOpcode() == Instruction::FDiv)
6426 return true;
6427
6429}
6430
6431 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6432 return false;
6433}
6434
6435 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6436 const Function *Callee) const {
6437 const TargetMachine &TM = getTLI()->getTargetMachine();
6438
6439 // Work this as a subsetting of subtarget features.
6440 const FeatureBitset &CallerBits =
6441 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6442 const FeatureBitset &CalleeBits =
6443 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6444
6445 // Check whether features are the same (apart from the ignore list).
6446 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6447 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6448 if (RealCallerBits == RealCalleeBits)
6449 return true;
6450
6451 // If the features are a subset, we need to additionally check for calls
6452 // that may become ABI-incompatible as a result of inlining.
6453 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6454 return false;
6455
6456 for (const Instruction &I : instructions(Callee)) {
6457 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6458 // Having more target features is fine for inline ASM.
6459 if (CB->isInlineAsm())
6460 continue;
6461
6462 SmallVector<Type *, 8> Types;
6463 for (Value *Arg : CB->args())
6464 Types.push_back(Arg->getType());
6465 if (!CB->getType()->isVoidTy())
6466 Types.push_back(CB->getType());
6467
6468 // Simple types are always ABI compatible.
6469 auto IsSimpleTy = [](Type *Ty) {
6470 return !Ty->isVectorTy() && !Ty->isAggregateType();
6471 };
6472 if (all_of(Types, IsSimpleTy))
6473 continue;
6474
6475 if (Function *NestedCallee = CB->getCalledFunction()) {
6476 // Assume that intrinsics are always ABI compatible.
6477 if (NestedCallee->isIntrinsic())
6478 continue;
6479
6480 // Do a precise compatibility check.
6481 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6482 return false;
6483 } else {
6484 // We don't know the target features of the callee,
6485 // assume it is incompatible.
6486 return false;
6487 }
6488 }
6489 }
6490 return true;
6491}
6492
6493 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6494 const Function *Callee,
6495 const ArrayRef<Type *> &Types) const {
6496 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6497 return false;
6498
6499 // If we get here, we know the target features match. If one function
6500 // considers 512-bit vectors legal and the other does not, consider them
6501 // incompatible.
6502 const TargetMachine &TM = getTLI()->getTargetMachine();
6503
6504 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6505 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6506 return true;
6507
6508 // Consider the arguments compatible if they aren't vectors or aggregates.
6509 // FIXME: Look at the size of vectors.
6510 // FIXME: Look at the element types of aggregates to see if there are vectors.
6511 return llvm::none_of(Types,
6512 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6513}
6514
6515 X86TTIImpl::TTI::MemCmpExpansionOptions
6516 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6517 TTI::MemCmpExpansionOptions Options;
6518 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6519 Options.NumLoadsPerBlock = 2;
6520 // All GPR and vector loads can be unaligned.
6521 Options.AllowOverlappingLoads = true;
6522 if (IsZeroCmp) {
6523 // Only enable vector loads for equality comparison. Right now the vector
6524 // version is not as fast for three way compare (see #33329).
6525 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6526 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6527 Options.LoadSizes.push_back(64);
6528 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6529 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6530 }
6531 if (ST->is64Bit()) {
6532 Options.LoadSizes.push_back(8);
6533 }
6534 Options.LoadSizes.push_back(4);
6535 Options.LoadSizes.push_back(2);
6536 Options.LoadSizes.push_back(1);
6537 return Options;
6538}
6539
6540 bool X86TTIImpl::prefersVectorizedAddressing() const {
6541 return supportsGather();
6542}
6543
6544 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6545 return false;
6546}
6547
6548 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6549 // TODO: We expect this to be beneficial regardless of arch,
6550 // but there are currently some unexplained performance artifacts on Atom.
6551 // As a temporary solution, disable on Atom.
6552 return !(ST->isAtom());
6553}
6554
6555// Get estimation for interleaved load/store operations and strided load.
6556// \p Indices contains indices for strided load.
6557// \p Factor - the factor of interleaving.
6558 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6559 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6560 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6561 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6562 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6563 // VecTy for interleave memop is <VF*Factor x Elt>.
6564 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6565 // VecTy = <12 x i32>.
6566
6567 // Calculate the number of memory operations (NumOfMemOps), required
6568 // for load/store the VecTy.
6569 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6570 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6571 unsigned LegalVTSize = LegalVT.getStoreSize();
6572 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
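// Worked example (illustrative): a 128-byte interleaved vector legalized
// into 64-byte (512-bit) registers gives NumOfMemOps = (128 + 64 - 1) / 64 = 2.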
6573
6574 // Get the cost of one memory operation.
6575 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6576 LegalVT.getVectorNumElements());
6577 InstructionCost MemOpCost;
6578 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6579 if (UseMaskedMemOp)
6580 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6581 AddressSpace, CostKind);
6582 else
6583 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6584 AddressSpace, CostKind);
6585
6586 unsigned VF = VecTy->getNumElements() / Factor;
6587 MVT VT =
6588 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6589
6590 InstructionCost MaskCost;
6591 if (UseMaskedMemOp) {
6592 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6593 for (unsigned Index : Indices) {
6594 assert(Index < Factor && "Invalid index for interleaved memory op");
6595 for (unsigned Elm = 0; Elm < VF; Elm++)
6596 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6597 }
6598
6599 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6600
6601 MaskCost = getReplicationShuffleCost(
6602 I1Type, Factor, VF,
6603 UseMaskForGaps ? DemandedLoadStoreElts
6604 : APInt::getAllOnes(VecTy->getNumElements()),
6605 CostKind);
6606
6607 // The Gaps mask is invariant and created outside the loop, therefore the
6608 // cost of creating it is not accounted for here. However if we have both
6609 // a MaskForGaps and some other mask that guards the execution of the
6610 // memory access, we need to account for the cost of And-ing the two masks
6611 // inside the loop.
6612 if (UseMaskForGaps) {
6613 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6614 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6615 }
6616 }
6617
6618 if (Opcode == Instruction::Load) {
6619 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6620 // contain the cost of the optimized shuffle sequence that the
6621 // X86InterleavedAccess pass will generate.
6622 // The cost of loads and stores are computed separately from the table.
6623
6624 // X86InterleavedAccess support only the following interleaved-access group.
6625 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6626 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6627 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6628 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6629 };
6630
6631 if (const auto *Entry =
6632 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6633 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6634 // If an entry does not exist, fall back to the default implementation.
6635
6636 // Kind of shuffle depends on number of loaded values.
6637 // If we load the entire data in one register, we can use a 1-src shuffle.
6638 // Otherwise, we'll merge 2 sources in each operation.
6639 TTI::ShuffleKind ShuffleKind =
6640 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6641
6642 InstructionCost ShuffleCost =
6643 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6644
6645 unsigned NumOfLoadsInInterleaveGrp =
6646 Indices.size() ? Indices.size() : Factor;
6647 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6648 VecTy->getNumElements() / Factor);
6649 InstructionCost NumOfResults =
6650 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6651
6652 // About half of the loads may be folded into shuffles when we have only
6653 // one result. If we have more than one result, or the loads are masked,
6654 // we do not fold loads at all.
6655 unsigned NumOfUnfoldedLoads =
6656 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6657
6658 // Get a number of shuffle operations per result.
6659 unsigned NumOfShufflesPerResult =
6660 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6661
6662 // The SK_MergeTwoSrc shuffle clobbers one of the source operands.
6663 // When we have more than one destination, we need additional instructions
6664 // to keep sources.
6665 InstructionCost NumOfMoves = 0;
6666 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6667 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6668
6669 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6670 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6671 NumOfMoves;
6672
6673 return Cost;
6674 }
6675
6676 // Store.
6677 assert(Opcode == Instruction::Store &&
6678 "Expected Store Instruction at this point");
6679 // X86InterleavedAccess support only the following interleaved-access group.
6680 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6681 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6682 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6683 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6684
6685 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6686 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6687 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6688 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6689 };
6690
6691 if (const auto *Entry =
6692 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6693 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6694 // If an entry does not exist, fall back to the default implementation.
6695
6696 // There are no strided stores at the moment, and a store can't be folded
6697 // into a shuffle.
6698 unsigned NumOfSources = Factor; // The number of values to be merged.
6699 InstructionCost ShuffleCost = getShuffleCost(
6700 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6701 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6702
6703 // The SK_MergeTwoSrc shuffle clobbers one of the source operands.
6704 // We need additional instructions to keep sources.
6705 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6706 InstructionCost Cost =
6707 MaskCost +
6708 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6709 NumOfMoves;
6710 return Cost;
6711}
6712
6713 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6714 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6715 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6716 bool UseMaskForCond, bool UseMaskForGaps) {
6717 auto *VecTy = cast<FixedVectorType>(BaseTy);
6718
6719 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6720 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6721 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6722 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6723 return true;
6724 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6725 return ST->hasBWI();
6726 if (EltTy->isBFloatTy())
6727 return ST->hasBF16();
6728 return false;
6729 };
6730 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6731 return getInterleavedMemoryOpCostAVX512(
6732 Opcode, VecTy, Factor, Indices, Alignment,
6733 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6734
6735 if (UseMaskForCond || UseMaskForGaps)
6736 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6737 Alignment, AddressSpace, CostKind,
6738 UseMaskForCond, UseMaskForGaps);
6739
6740 // Get estimation for interleaved load/store operations for SSE-AVX2.
6741 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6742 // computing the cost using a generic formula as a function of generic
6743 // shuffles. We therefore use a lookup table instead, filled according to
6744 // the instruction sequences that codegen currently generates.
6745
6746 // VecTy for interleave memop is <VF*Factor x Elt>.
6747 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6748 // VecTy = <12 x i32>.
6749 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6750
6751 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6752 // the VF=2, while v2i128 is an unsupported MVT vector type
6753 // (see MachineValueType.h::getVectorVT()).
6754 if (!LegalVT.isVector())
6755 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6756 Alignment, AddressSpace, CostKind);
6757
6758 unsigned VF = VecTy->getNumElements() / Factor;
6759 Type *ScalarTy = VecTy->getElementType();
6760 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6761 if (!ScalarTy->isIntegerTy())
6762 ScalarTy =
6763 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6764
6765 // Get the cost of all the memory operations.
6766 // FIXME: discount dead loads.
6767 InstructionCost MemOpCosts = getMemoryOpCost(
6768 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6769
6770 auto *VT = FixedVectorType::get(ScalarTy, VF);
6771 EVT ETy = TLI->getValueType(DL, VT);
6772 if (!ETy.isSimple())
6773 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6774 Alignment, AddressSpace, CostKind);
6775
6776 // TODO: Complete for other data-types and strides.
6777 // Each combination of Stride, element bit width and VF results in a different
6778 // sequence; the cost tables are therefore accessed with:
6779 // Factor (stride) and VectorType=VFxiN.
6780 // The Cost accounts only for the shuffle sequence;
6781 // The cost of the loads/stores is accounted for separately.
6782 //
6783 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6784 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6785 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6786 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6787 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6788 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6789
6790 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6791 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6792 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6793
6794 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6795 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6796 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6797
6798 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6799 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6800 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6801 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6802
6803 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6804 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6805 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6806 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6807 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6808
6809 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6810 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6811 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6812 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6813 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6814
6815 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6816 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6817 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6818 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6819 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6820
6821 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6822 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6823 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6824 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6825
6826 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6827 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6828 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6829 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6830 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6831
6832 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6833 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6834 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6835 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6836 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6837
6838 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6839 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6840 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6841 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6842 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6843
6844 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6845 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6846 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6847 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6848
6849 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6850 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6851 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6852 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6853 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6854
6855 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6856 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6857 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6858 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6859 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6860
6861 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6862 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6863 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6864 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6865
6866 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6867 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6868 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6869
6870 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6871 };
6872
6873 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6874 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6875 };
6876
6877 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6878 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6879 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6880
6881 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6882 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6883
6884 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6885 };
6886
6887 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6888 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6889 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6890
6891 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6892 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6893 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6894
6895 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6896 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6897 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6898 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6899
6900 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6901 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6902 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6903 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6904 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6905
6906 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6907 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6908 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6909 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6910 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6911
6912 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6913 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6914 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6915 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6916 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6917
6918 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6919 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6920 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6921 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6922 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6923
6924 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6925 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6926 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6927 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6928
6929 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6930 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6931 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6932 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6933 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6934
6935 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6936 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6937 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6938 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6939 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6940
6941 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6942 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6943 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6944 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6945 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6946
6947 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6948 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6949 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6950 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6951
6952 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6953 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6954 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6955 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6956 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6957
6958 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6959 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6960 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6961 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6962 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6963
6964 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6965 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6966 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6967 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6968
6969 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6970 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6971 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6972 };
6973
6974 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6975 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6976 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6977 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6978
6979 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6980 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6981
6982 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6983 };
6984
6985 if (Opcode == Instruction::Load) {
6986 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6987 MemOpCosts](const CostTblEntry *Entry) {
6988 // NOTE: this is just an approximation!
6989 // It can over- or under-estimate the cost!
6990 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6991 };
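// Worked example (illustrative): loading 2 of 3 interleaved members with
// the AVX2 table entry {3, v16i8, 11} below gives
// MemOpCosts + divideCeil(2 * 11, 3) = MemOpCosts + 8.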
6992
6993 if (ST->hasAVX2())
6994 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6995 ETy.getSimpleVT()))
6996 return GetDiscountedCost(Entry);
6997
6998 if (ST->hasSSSE3())
6999 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7000 ETy.getSimpleVT()))
7001 return GetDiscountedCost(Entry);
7002
7003 if (ST->hasSSE2())
7004 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7005 ETy.getSimpleVT()))
7006 return GetDiscountedCost(Entry);
7007 } else {
7008 assert(Opcode == Instruction::Store &&
7009 "Expected Store Instruction at this point");
7010 assert((!Indices.size() || Indices.size() == Factor) &&
7011 "Interleaved store only supports fully-interleaved groups.");
7012 if (ST->hasAVX2())
7013 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7014 ETy.getSimpleVT()))
7015 return MemOpCosts + Entry->Cost;
7016
7017 if (ST->hasSSE2())
7018 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7019 ETy.getSimpleVT()))
7020 return MemOpCosts + Entry->Cost;
7021 }
7022
7023 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7024 Alignment, AddressSpace, CostKind,
7025 UseMaskForCond, UseMaskForGaps);
7026}
7027
7028 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7029 StackOffset BaseOffset,
7030 bool HasBaseReg, int64_t Scale,
7031 unsigned AddrSpace) const {
7032 // Scaling factors are not free at all.
7033 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7034 // will take 2 allocations in the out of order engine instead of 1
7035 // for plain addressing mode, i.e. inst (reg1).
7036 // E.g.,
7037 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7038 // Requires two allocations (one for the load, one for the computation)
7039 // whereas:
7040 // vaddps (%rsi), %ymm0, %ymm1
7041 // Requires just 1 allocation, i.e., freeing allocations for other operations
7042 // and having less micro operations to execute.
7043 //
7044 // For some X86 architectures, this is even worse because for instance for
7045 // stores, the complex addressing mode forces the instruction to use the
7046 // "load" ports instead of the dedicated "store" port.
7047 // E.g., on Haswell:
7048 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7049 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7050 TargetLoweringBase::AddrMode AM;
7051 AM.BaseGV = BaseGV;
7052 AM.BaseOffs = BaseOffset.getFixed();
7053 AM.HasBaseReg = HasBaseReg;
7054 AM.Scale = Scale;
7055 AM.ScalableOffset = BaseOffset.getScalable();
7056 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7057 // Scale represents reg2 * scale, thus account for 1
7058 // as soon as we use a second register.
7059 return AM.Scale != 0;
7060 return -1;
7061}
7062
7063 InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7064 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7065 return 14;
7066}
7067
7068 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7069 unsigned Bits = Ty->getScalarSizeInBits();
7070
7071 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7072 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7073 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7074 return false;
7075
7076 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7077 // shifts just as cheap as scalar ones.
7078 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7079 return false;
7080
7081 // AVX512BW has shifts such as vpsllvw.
7082 if (ST->hasBWI() && Bits == 16)
7083 return false;
7084
7085 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7086 // fully general vector.
7087 return true;
7088}
7089
7090unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7091 Type *ScalarValTy) const {
7092 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7093 return 4;
7094 }
7095 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7096}
7097
7098 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7099 SmallVectorImpl<Use *> &Ops) const {
7100 using namespace llvm::PatternMatch;
7101
7102 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7103 if (!VTy)
7104 return false;
7105
7106 if (I->getOpcode() == Instruction::Mul &&
7107 VTy->getElementType()->isIntegerTy(64)) {
7108 for (auto &Op : I->operands()) {
7109 // Make sure we are not already sinking this operand
7110 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7111 continue;
7112
7113 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7114 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
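// Illustrative IR (assumed example, not from this file):
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>   ; sext_inreg -> PMULDQ
//   %b = and <2 x i64> %y, <i64 4294967295, i64 4294967295> ; zext_inreg -> PMULUDQ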
7115 if (ST->hasSSE41() &&
7116 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7117 m_SpecificInt(32)))) {
7118 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7119 Ops.push_back(&Op);
7120 } else if (ST->hasSSE2() &&
7121 match(Op.get(),
7122 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7123 Ops.push_back(&Op);
7124 }
7125 }
7126
7127 return !Ops.empty();
7128 }
7129
7130 // A uniform shift amount in a vector shift or funnel shift may be much
7131 // cheaper than a generic variable vector shift, so make that pattern visible
7132 // to SDAG by sinking the shuffle instruction next to the shift.
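// Illustrative IR (assumed example, not from this file): sinking the splat
// next to the shift lets SDAG select a shift-by-scalar (e.g. PSLLD):
//   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r = shl <4 x i32> %x, %amt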
7133 int ShiftAmountOpNum = -1;
7134 if (I->isShift())
7135 ShiftAmountOpNum = 1;
7136 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7137 if (II->getIntrinsicID() == Intrinsic::fshl ||
7138 II->getIntrinsicID() == Intrinsic::fshr)
7139 ShiftAmountOpNum = 2;
7140 }
7141
7142 if (ShiftAmountOpNum == -1)
7143 return false;
7144
7145 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7146 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7147 isVectorShiftByScalarCheap(I->getType())) {
7148 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7149 return true;
7150 }
7151
7152 return false;
7153}
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:478
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:396
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:670
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:703
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
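An illustrative query using the cost kinds above; TTI and VecTy are assumed names for a TargetTransformInfo result and a vector type:
InstructionCost Thru = TTI.getArithmeticInstrCost(
    Instruction::FDiv, VecTy, TargetTransformInfo::TCK_RecipThroughput);
InstructionCost Size = TTI.getArithmeticInstrCost(
    Instruction::FDiv, VecTy, TargetTransformInfo::TCK_CodeSize);
// The same instruction may be cheap by one kind and expensive by another.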
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
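A hedged example of costing one of these shuffle kinds; TTI, VecTy, and CostKind are assumptions:
// Cost of reversing the lanes of VecTy (e.g. <8 x i32>):
InstructionCost C = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy,
                                       /*Mask=*/{}, CostKind, /*Index=*/0,
                                       /*SubTp=*/nullptr);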
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
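A minimal sketch contrasting the two TypeSize factories:
TypeSize FixedTS = TypeSize::getFixed(128);   // exactly 128 bits
TypeSize ScalTS = TypeSize::getScalable(128); // 128 * vscale bits
assert(!FixedTS.isScalable() && ScalTS.isScalable());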
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
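A small, self-contained sketch of the Type predicates and factories above:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;
void typeQueries() {
  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  IntegerType *I8 = Type::getInt8Ty(Ctx);
  assert(F32->isFloatTy() && F32->isFloatingPointTy());
  assert(I8->isIntegerTy() && I8->getScalarSizeInBits() == 8);
  assert(F32->getScalarType() == F32); // scalars return themselves
}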
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
Definition: DerivedTypes.h:487
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:665
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type, and the same element type.
Definition: DerivedTypes.h:541
Type * getElementType() const
Definition: DerivedTypes.h:460
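A sketch of the two widening helpers just described; Ctx is an assumed LLVMContext:
auto *V4I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 4);  // <4 x i16>
auto *Wider = VectorType::getExtendedElementVectorType(V4I16); // <4 x i32>
auto *Longer = VectorType::getDoubleElementsVectorType(V4I16); // <8 x i16>
assert(Longer->getElementCount() == ElementCount::getFixed(8));
(void)Wider;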
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
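These predicates gate the per-feature cost tables; a sketch of that lookup pattern, where ST, AVX2CostTbl, SSE2CostTbl, ISD, and LT stand in for the usual locals:
if (ST->hasAVX2())
  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;
if (ST->hasSSE2())
  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;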
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of a Gather / Scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isVectorShiftByScalarCheap(Type *Ty) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
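A hedged example of these accessors; VTy is an assumed FixedVectorType such as <4 x i32>:
TypeSize TS = VTy->getPrimitiveSizeInBits();
assert(!TS.isScalable()); // fixed vectors are not scaled by vscale
assert(TS.getFixedValue() == 128 && TS.getKnownMinValue() == 128);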
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:347
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
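A minimal PatternMatch sketch using the matchers above; V is an assumed Value*:
using namespace llvm::PatternMatch;
Value *X;
const APInt *ShAmt;
// Match a one-use (X << C) feeding a bitwise 'and':
if (match(V, m_And(m_OneUse(m_Shl(m_Value(X), m_APIntAllowPoison(ShAmt))),
                   m_Value()))) {
  // X and ShAmt are bound here.
}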
The top-level llvm namespace, which contains the entire LLVM API.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
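A sketch of defining and consulting a cost table; ExampleTbl is hypothetical:
static const CostTblEntry ExampleTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::MUL, MVT::v4i32, 2},
};
if (const auto *Entry = CostTableLookup(ExampleTbl, ISD::MUL, MVT::v4i32))
  return Entry->Cost; // 2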
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition: STLExtras.h:2448
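A small sketch of enumerate over a shuffle mask (PoisonMaskElem, listed below, is -1):
SmallVector<int> Mask = {0, 1, -1, 3};
for (auto [Idx, Elt] : enumerate(Mask))
  if (Elt == PoisonMaskElem)
    ; // lane Idx has no defined source element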
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:556
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
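Worked values for the math helpers above (each follows directly from its documentation):
assert(isPowerOf2_32(64) && !isPowerOf2_32(48));
assert(PowerOf2Ceil(48) == 64);
assert(divideCeil(10, 4) == 3);      // ceil(10 / 4)
assert(alignDown(37, 16) == 32);
assert(alignTo(10, Align(8)) == 16); // next multiple of 8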
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
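The conversion-table analogue of CostTableLookup; ExampleConvTbl is hypothetical:
static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
};
if (const auto *Entry = ConvertCostTableLookup(
        ExampleConvTbl, ISD::SINT_TO_FP, /*Dst=*/MVT::v4f32, /*Src=*/MVT::v4i32))
  return Entry->Cost;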
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
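A sketch of indexing a per-kind cost entry through this operator; Entry is an assumed instance of the per-cost-kind helper struct defined in this file:
if (std::optional<unsigned> C = Entry[TargetTransformInfo::TCK_Latency])
  return *C; // std::nullopt means no cost was recorded for this kind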
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
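A minimal sketch of Align, MaybeAlign, and commonAlignment together:
MaybeAlign MA;                    // undefined alignment
Align A = MA.valueOrOne();        // Align(1)
Align C = commonAlignment(Align(16), /*Offset=*/8); // Align(8)
(void)A; (void)C;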
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if the expansion is for memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55