//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// Regarding the cost-model numbers used below, note the following: the
/// numbers correspond to some "generic" X86 CPU rather than to a specific
/// CPU model. Usually the numbers correspond to the CPU where the feature
/// first appeared. For example, if Subtarget.hasSSE42() guards a lookup
/// below, the cost is based on Nehalem, as that was the first CPU to support
/// that feature level and thus most likely has the worst case cost, although
/// we may discard an outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///
///                     divss   sqrtss   rsqrtss
///   AMD K7            11-16   19       3
///   Piledriver        9-24    13-15    5
///   Jaguar            14      16       2
///   Pentium II,III    18      30       2
///   Nehalem           7-14    7-18     3
///   Haswell           10-13   11       5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
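///
/// Thus an entry such as { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } } in the
/// AVX512 table below encodes, in order: TCK_RecipThroughput = 3,
/// TCK_Latency = 11, TCK_CodeSize = 1 and TCK_SizeAndLatency = 1.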
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using CostKindTypeTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
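
// A sketch of how the per-kind costs above are consumed: every cost table
// below pairs an ISD opcode and an MVT with a CostKindCosts value, and each
// lookup follows the same pattern (Table, ISD, LT and CostKind stand in for
// the locals of the surrounding query):
//
//   if (const auto *Entry = CostTableLookup(Table, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost;
//
// Entry->Cost[CostKind] yields std::nullopt for any kind left at the ~0U
// sentinel, letting partially-filled rows fall through to more generic
// tables or the base TTI implementation.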

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
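  // E.g. a v8i8 multiply is costed above as the sum of a zext to v8i16, a
  // v8i16 multiply and a trunc back to v8i8, mirroring how such sub-128-bit
  // multiplies are actually lowered.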

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
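    // E.g. a v4i32 multiply whose operands are both sign-extended from v4i16
    // satisfies the i15 bound above, so it is re-typed as v8i16 and picks up
    // the (PMADDWD-equivalent) vXi16 multiply cost below.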

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }
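  // E.g. a v2i64 multiply of two operands zero-extended from i32 now hits the
  // X86ISD::PMULUDQ rows in the tables below instead of the much more
  // expensive generic v2i64 ISD::MUL expansion.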

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
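  // E.g. 'mul <4 x i32> %x, splat (i32 8)' is costed as one vector shift-left
  // by 3, while a multiply by -8 additionally pays for the negating subtract.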

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }
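  // E.g. 'sdiv <4 x i32> %x, splat (i32 16)' is costed as the expanded
  // ashr+lshr+add+ashr sequence, and the matching srem adds the mul+sub
  // needed to reconstruct the remainder.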

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
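  // E.g. 'udiv <4 x i32> %x, splat (i32 16)' is costed as a single lshr by 4,
  // and the equivalent urem as a single 'and' with 15.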

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
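  // A single GF2P8AFFINEQB can apply any constant 8x8 bit matrix to every
  // byte, and constant SHL/SRL/SRA amounts are all expressible as such a
  // matrix, which is why one instruction covers every row above.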

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 2, 12,  5,  6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 1,  8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 1,  8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 2,  8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 2,  8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 2, 7,  2,  3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 3, 9,  5,  6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8,  { 4, 7,  7,  8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8,  { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32,  { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8,  { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32,  { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32,  { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8,  { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8,  { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64,  { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64,  { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64,  { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8,  { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8,  { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8,  { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32,  { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32,  { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32,  { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32,  { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64,  { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64,  { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64,  { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64,  { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  4,  6,  8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  5,  8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8,  {  6,  6,  9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8,  {  7,  8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8,  {  7,  9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8,  { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16,  { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16,  { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32,  { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32,  { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32,  { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64,  { 1, 3,  1,  2 } }, // psllq.
    { ISD::SRL, MVT::v2i64,  { 1, 3,  1,  2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64,  { 3, 4,  5,  7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64,  { 3, 7,  4,  6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64,  { 3, 7,  4,  6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64,  { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, {  9, 10, 6,  9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, {  9, 13, 5,  9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,  {  4,  8,  4,  5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,  {  4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,  {  4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,  {  6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,  {  6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,  {  7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,  { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8,  { 4, 12, 4,  5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8,  { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8,  { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1,  5, 1,  1 } }, // pmullw

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8,  { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8,  { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8,  { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsllvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64,  { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64,  { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64,  { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8,  { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::OR,  MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::OR,  MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32,  { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64,  { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32,  { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64,  { 6,  9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64,    { 1 } },           // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64,  {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64,    {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  {  4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  {  8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, {  1,  1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    {  1,  4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  {  3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  {  5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi32/vXi64 on AVX2 are legal even though we declare them
    // custom, so that we can detect the cases where the shift amount is a
    // scalar splat.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32,  { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64,  { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64,  { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64,  { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32,  { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64,  { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64,  { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64,  { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
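  // E.g. on SSE2, 'shl <8 x i16> %x, <i16 1, i16 2, ...>' is now costed as
  // 'mul <8 x i16> %x, <i16 2, i16 4, ...>', matching the pmullw lowering
  // used for non-uniform constant shift amounts.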

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64,   { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL,  MVT::v8i16, {  2,  5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64,   {  2,  5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32,   {  1,  4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, {  4,  7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, {  2,  5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32,   { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64,   { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, {  2,  4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, {  2,  4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, {  4,  2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, {  4,  2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8,  { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8,  { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16,  { 5, 18,  5, 10 } }, // extend/vpsllvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsllvd/pack sequence.

    { ISD::SRL, MVT::v16i8,  {  6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8,  {  8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8,  { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8,  { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16,  {  5, 11,  5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, {  8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64,  {  4,  5,  5,  5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64,  {  8,  8,  5,  9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8,  { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8,  { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32,  { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32,  { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64,  { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64,  { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8,  { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8,  { 4,  8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2,  5, 1,  2 } }, // pmullw
    { ISD::MUL, MVT::v8i32,  { 4, 10, 1,  2 } }, // pmulld
    { ISD::MUL, MVT::v4i32,  { 2, 10, 1,  2 } }, // pmulld
    { ISD::MUL, MVT::v4i64,  { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64,  { 6, 10, 8,  8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64,   { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32,   { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64,   { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32,   { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64,   { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32,   { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32,   {  7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, {  7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64,   { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };
1184
1185 // Look for AVX2 lowering tricks for custom cases.
1186 if (ST->hasAVX2())
1187 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1188 if (auto KindCost = Entry->Cost[CostKind])
1189 return LT.first * *KindCost;
1190
1191 static const CostKindTblEntry AVX1CostTable[] = {
1192 // We don't have to scalarize unsupported ops. We can issue two half-sized
1193 // operations and we only need to extract the upper YMM half.
1194 // Two ops + 1 extract + 1 insert = 4.
1195 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1196 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1197 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1198 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1199 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1200 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1201
1202 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1203 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1204 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1205 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1206
1207 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1208 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1209 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1210 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1211
1212 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1213 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1214 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1215 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1216
1217 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1218 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1219 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1220 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1221 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1222 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1223 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1224 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1225 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1226 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1227
1228 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1229 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1230 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1231 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1232 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1233 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1234 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1235 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1236
1237 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1238 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1239 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1240 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1241 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1242 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1243 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1244 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1245
1246 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1247 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1248 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1249 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1250 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1251 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1253 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1254
1255 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1256 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1257
1258 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1259 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1262 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1263 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1264
1265 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271
1272 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1273 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1276 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1277 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1278
1279 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1280 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1281 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1282 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1283 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1284 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1285 };
1286
1287 if (ST->hasAVX())
1288 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1289 if (auto KindCost = Entry->Cost[CostKind])
1290 return LT.first * *KindCost;
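// Editor's note: a standalone sketch (not part of this file) of how these
// kind-indexed lookups resolve. The struct mirrors the CostKindCosts
// operator[] defined at the top of this file, with ~0U marking "no entry" so
// a query can fall through to the next table; all names below are
// hypothetical stand-ins for the LLVM helpers.
#include <optional>

struct KindCosts {
  unsigned Costs[4] = {~0U, ~0U, ~0U, ~0U}; // RecipThroughput, Latency,
                                            // CodeSize, SizeAndLatency
  std::optional<unsigned> operator[](int Kind) const {
    unsigned C = Costs[Kind];
    return C == ~0U ? std::nullopt : std::optional<unsigned>(C);
  }
};

// E.g. the AVX1 FDIV v4f32 row { 14, 14, 1, 1 } above: throughput and
// latency queries see 14; code-size and size+latency queries see 1.
inline unsigned queryFDivV4f32(int Kind) {
  KindCosts Row = {{14, 14, 1, 1}};
  return Row[Kind].value_or(~0U);
}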
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480 // It is not a good idea to vectorize division. We have to scalarize it and
1481 // in the process we will often end up having to spill regular
1482 // registers. The overhead of division is going to dominate most kernels
1483 // anyway, so try hard to prevent vectorization of division - it is
1484 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485 // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489 InstructionCost ScalarCost =
1490 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491 Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
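// Editor's note: a worked instance of the "20 cycles per lane" heuristic
// above, under the hypothetical assumption of a scalar divide cost of 1. An
// <8 x i32> sdiv on SSE2 legalizes to two v4i32 registers (LT.first == 2,
// four lanes each), so the returned throughput cost is 20 * 2 * 4 * 1 = 160,
// large enough to steer the vectorizers away from vector division.
constexpr int scalarizedDivCost(int LegalizedParts, int LanesPerPart,
                                int ScalarCost) {
  return 20 * LegalizedParts * LanesPerPart * ScalarCost;
}
static_assert(scalarizedDivCost(2, 4, 1) == 160, "<8 x i32> sdiv example");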
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1508 }
1509 }
1510
1511 // Fallback to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
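// Editor's note: a sketch of how a client typically reaches this hook - the
// vectorizers and unroller query TargetTransformInfo, which dispatches to
// this X86 implementation. Assumes a TargetTransformInfo reference and an
// LLVMContext are available; on an SSE4.2 target the query below resolves
// via the SSE42CostTable FDIV v4f32 row (throughput 14).
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

llvm::InstructionCost queryVectorFDivCost(const llvm::TargetTransformInfo &TTI,
                                          llvm::LLVMContext &Ctx) {
  auto *VecTy = llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 4);
  return TTI.getArithmeticInstrCost(
      llvm::Instruction::FDiv, VecTy,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
}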
1515
1516 InstructionCost
1517 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519 TTI::TargetCostKind CostKind) const {
1520 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521 return TTI::TCC_Basic;
1522 return InstructionCost::getInvalid();
1523}
1524
1525 InstructionCost X86TTIImpl::getShuffleCost(
1526 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535 // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544 return getShuffleCost(TTI::SK_InsertSubvector,
1545 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546 CostKind, Mask.size() / 2, BaseTp);
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553 // For Broadcasts we are splatting the first element from the first input
1554 // register, so we only need to reference that input, and all the output
1555 // registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
1565
1566 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1567 // permutation.
1568 // Attempt to detect a shuffle mask with a single defined element.
1569 bool IsInLaneShuffle = false;
1570 bool IsSingleElementMask = false;
1571 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1572 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1573 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1574 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1575 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1576 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1577 if ((Mask.size() % NumLanes) == 0) {
1578 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1579 return P.value() == PoisonMaskElem ||
1580 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1581 (P.index() / NumEltsPerLane);
1582 });
1583 IsSingleElementMask =
1584 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1585 return M == PoisonMaskElem;
1586 }));
1587 }
1588 }
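// Editor's note: a standalone sketch of the in-lane test above, writing
// PoisonMaskElem as -1 (its LLVM value). Mask elements may index either
// source vector (values >= Mask.size() pick from the second source), so the
// source lane is taken modulo the mask size before comparing with the
// destination lane. All names are hypothetical.
#include <cstddef>
#include <vector>

inline bool isInLaneMask(const std::vector<int> &Mask,
                         std::size_t EltsPerLane) {
  for (std::size_t I = 0; I != Mask.size(); ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // poison element, compatible with any lane
    if ((static_cast<std::size_t>(M) % Mask.size()) / EltsPerLane !=
        I / EltsPerLane)
      return false;
  }
  return true;
}
// E.g. for v8f32 (two 128-bit lanes, four elements each):
// {1,0,3,2, 5,4,7,6} stays in-lane; {4,5,6,7, 0,1,2,3} crosses lanes.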
1589
1590 // Treat <X x bfloat> shuffles as <X x half>.
1591 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1592 LT.second = LT.second.changeVectorElementType(MVT::f16);
1593
1594 // Subvector extractions are free if they start at the beginning of a
1595 // vector and cheap if the subvectors are aligned.
1596 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1597 int NumElts = LT.second.getVectorNumElements();
1598 if ((Index % NumElts) == 0)
1599 return TTI::TCC_Free;
1600 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1601 if (SubLT.second.isVector()) {
1602 int NumSubElts = SubLT.second.getVectorNumElements();
1603 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1604 return SubLT.first;
1605 // Handle some cases for widening legalization. For now we only handle
1606 // cases where the original subvector is naturally aligned and fits evenly
1607 // in its legalized subvector type.
1608 // FIXME: Remove some of the alignment restrictions.
1609 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1610 // vectors.
1611 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1612 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1613 (NumSubElts % OrigSubElts) == 0 &&
1614 LT.second.getVectorElementType() ==
1615 SubLT.second.getVectorElementType() &&
1616 LT.second.getVectorElementType().getSizeInBits() ==
1617 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1618 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1619 "Unexpected number of elements!");
1620 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1621 LT.second.getVectorNumElements());
1622 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1623 SubLT.second.getVectorNumElements());
1624 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1625 InstructionCost ExtractCost = getShuffleCost(
1626 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1627
1628 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1629 // if we have SSSE3 we can use pshufb.
1630 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1631 return ExtractCost + 1; // pshufd or pshufb
1632
1633 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1634 "Unexpected vector size");
1635
1636 return ExtractCost + 2; // worst case pshufhw + pshufd
1637 }
1638 }
1639 // If the extract subvector is not optimal, treat it as single op shuffle.
1640 Kind = TTI::SK_PermuteSingleSrc;
1641 }
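// Editor's note: a simplified classification of the extraction rules above;
// it omits the widening-legalization special case. For a legal v8f32 source
// and a v4f32 subvector: Index 0 is free, Index 4 costs SubLT.first, and
// Index 2 falls through to be priced as a single-source shuffle.
enum class ExtractClass { Free, Aligned, Shuffle };
constexpr ExtractClass classifyExtract(int Index, int NumElts,
                                       int NumSubElts) {
  if ((Index % NumElts) == 0)
    return ExtractClass::Free;    // starts at the base of a legal vector
  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
    return ExtractClass::Aligned; // aligned to a subvector boundary
  return ExtractClass::Shuffle;   // costed as SK_PermuteSingleSrc
}
static_assert(classifyExtract(0, 8, 4) == ExtractClass::Free, "");
static_assert(classifyExtract(4, 8, 4) == ExtractClass::Aligned, "");
static_assert(classifyExtract(2, 8, 4) == ExtractClass::Shuffle, "");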
1642
1643 // Subvector insertions are cheap if the subvectors are aligned.
1644 // Note that in general, the insertion starting at the beginning of a vector
1645 // isn't free, because we need to preserve the rest of the wide vector,
1646 // but if the destination vector legalizes to the same width as the subvector
1647 // then the insertion will simplify to a (free) register copy.
1648 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1649 int NumElts = LT.second.getVectorNumElements();
1650 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1651 if (SubLT.second.isVector()) {
1652 int NumSubElts = SubLT.second.getVectorNumElements();
1653 bool MatchingTypes =
1654 NumElts == NumSubElts &&
1655 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1656 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1657 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1658 }
1659
1660 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1661 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1662 // v1f32 (legalized to f32) into a v4f32.
1663 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1664 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1665 return 1;
1666
1667 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1668 Kind = TTI::SK_PermuteTwoSrc;
1669 }
1670
1671 // Handle some common (illegal) sub-vector types as they are often very cheap
1672 // to shuffle even on targets without PSHUFB.
1673 EVT VT = TLI->getValueType(DL, BaseTp);
1674 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1675 !ST->hasSSSE3()) {
1676 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1677 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1678 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1679 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1680 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1681 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1682
1683 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1684 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1685 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1686 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1687
1688 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1689 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1690 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1691 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1692
1693 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1694 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1695 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1696 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1697 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1698
1699 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1700 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1701 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1702 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1703 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1704 };
1705
1706 if (ST->hasSSE2())
1707 if (const auto *Entry =
1708 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1709 return Entry->Cost;
1710 }
1711
1712 // We are going to permute multiple sources and the result will be in multiple
1713 // destinations. We provide an accurate cost only for splits where the
1714 // element type remains the same.
1715 if (LT.first != 1) {
1716 MVT LegalVT = LT.second;
1717 if (LegalVT.isVector() &&
1718 LegalVT.getVectorElementType().getSizeInBits() ==
1719 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1720 LegalVT.getVectorNumElements() <
1721 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1722 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1723 unsigned LegalVTSize = LegalVT.getStoreSize();
1724 // Number of source vectors after legalization:
1725 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1726 // Number of destination vectors after legalization:
1727 InstructionCost NumOfDests = LT.first;
1728
1729 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1730 LegalVT.getVectorNumElements());
1731
1732 if (!Mask.empty() && NumOfDests.isValid()) {
1733 // Try to perform better estimation of the permutation.
1734 // 1. Split the source/destination vectors into real registers.
1735 // 2. Do the mask analysis to identify which real registers are
1736 // permuted. If more than one source register is used to build a
1737 // destination register, the cost for this destination register is
1738 // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1739 // source register is used, build the mask and calculate the cost as a
1740 // cost of PermuteSingleSrc (a simplified sketch of this follows below).
1741 // Also, for the single register permute we try to identify if the
1742 // destination register is just a copy of the source register or the
1743 // copy of the previous destination register (the cost is
1744 // TTI::TCC_Basic). If the source register is just reused, the cost for
1745 // this operation is TTI::TCC_Free.
1746 NumOfDests =
1747 getTypeLegalizationCost(
1748 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1749 .first;
1750 unsigned E = *NumOfDests.getValue();
1751 unsigned NormalizedVF =
1752 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1753 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1754 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1755 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1756 copy(Mask, NormalizedMask.begin());
1757 unsigned PrevSrcReg = 0;
1758 ArrayRef<int> PrevRegMask;
1759 InstructionCost Cost = 0;
1760 processShuffleMasks(
1761 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1762 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1763 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1764 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1765 // Check if the previous register can be just copied to the next
1766 // one.
1767 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1768 PrevRegMask != RegMask)
1769 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1770 RegMask, CostKind, 0, nullptr);
1771 else
1772 // Just a copy of previous destination register.
1773 Cost += TTI::TCC_Basic;
1774 return;
1775 }
1776 if (SrcReg != DestReg &&
1777 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1778 // Just a copy of the source register.
1779 Cost += TTI::TCC_Basic;
1780 }
1781 PrevSrcReg = SrcReg;
1782 PrevRegMask = RegMask;
1783 },
1784 [this, SingleOpTy, CostKind,
1785 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1786 unsigned /*Unused*/, bool /*Unused*/) {
1787 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1788 CostKind, 0, nullptr);
1789 });
1790 return Cost;
1791 }
1792
1793 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1794 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1795 {}, CostKind, 0, nullptr);
1796 }
1797
1798 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1799 }
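// Editor's note: a simplified, standalone model of the per-register estimate
// above, assuming the mask splits exactly into register-sized chunks and at
// most two source registers feed each destination (processShuffleMasks
// handles the general case, including the copy-of-previous-destination
// check). Identity chunks taken from a single register count as free copies;
// all names are hypothetical.
#include <cstddef>
#include <vector>

inline int estimateSplitShuffleCost(const std::vector<int> &Mask,
                                    int EltsPerReg, int SingleSrcCost,
                                    int TwoSrcCost) {
  int Cost = 0;
  for (std::size_t Base = 0; Base + EltsPerReg <= Mask.size();
       Base += EltsPerReg) {
    bool Identity = true;
    int FirstReg = -1;
    bool MultiReg = false;
    for (int I = 0; I != EltsPerReg; ++I) {
      int M = Mask[Base + I];
      if (M < 0)
        continue; // poison element
      Identity &= (M % EltsPerReg) == I;
      int Reg = M / EltsPerReg;
      if (FirstReg < 0)
        FirstReg = Reg;
      else if (Reg != FirstReg)
        MultiReg = true;
    }
    if (MultiReg)
      Cost += TwoSrcCost;      // destination built from two registers
    else if (FirstReg >= 0 && !Identity)
      Cost += SingleSrcCost;   // permuted within a single register
    // An identity chunk from one register is a plain (free) register copy.
  }
  return Cost;
}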
1800
1801 // If we're just moving a single element around (probably as an alternative to
1802 // extracting it), we can assume this is cheap.
1803 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1804 return TTI::TCC_Basic;
1805
1806 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1807 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1808 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1809
1810 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1811 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1812
1813 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1814 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1815 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1816 };
1817
1818 if (ST->hasVBMI())
1819 if (const auto *Entry =
1820 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1821 return LT.first * Entry->Cost;
1822
1823 static const CostTblEntry AVX512BWShuffleTbl[] = {
1824 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1825 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1826 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1827
1828 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1829 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1830 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1831 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1832
1833 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1834 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1835 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1836 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1837 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1838
1839 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1840 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1841 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1842 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1843 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1844
1845 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1846 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1847
1848 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1849 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1850 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1851 };
1852
1853 if (ST->hasBWI())
1854 if (const auto *Entry =
1855 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1856 return LT.first * Entry->Cost;
1857
1858 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1859 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1860 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1861 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1862 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1863 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1864 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1865 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1866
1867 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1868 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1869 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1870 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1871 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1872 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1873 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1874
1875 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1876 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1877 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1878 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1879 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1880 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1881 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1882 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1883 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1884 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1885 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1886
1887 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1888 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1889 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1890 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1891 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1892 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1893 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1894 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1895 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1896 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1897 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1898 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1899 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1900
1901 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1902 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1903 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1904 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1905 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1906 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1907 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1908 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1909 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1910 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1911 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1912 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1913
1914 // FIXME: This just applies the type legalization cost rules above
1915 // assuming these completely split.
1916 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1917 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1918 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1919 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1920 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1921 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1922
1923 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1924 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1925 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1926 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1927 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1928 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1929 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1930 };
1931
1932 if (ST->hasAVX512())
1933 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1934 if (auto KindCost = Entry->Cost[CostKind])
1935 return LT.first * *KindCost;
1936
1937 static const CostTblEntry AVX2InLaneShuffleTbl[] = {
1938 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpshufb
1939 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 1}, // vpshufb
1940 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpshufb
1941
1942 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
1943 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
1944 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpshufd + vpblendd
1945 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpshufd + vpblendd
1946 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // 2*vpshufb + vpor
1947 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // 2*vpshufb + vpor
1948 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // 2*vpshufb + vpor
1949 };
1950
1951 if (IsInLaneShuffle && ST->hasAVX2())
1952 if (const auto *Entry =
1953 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1954 return LT.first * Entry->Cost;
1955
1956 static const CostTblEntry AVX2ShuffleTbl[] = {
1957 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1958 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1959 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1960 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1961 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1962 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1963 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1964
1965 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1966 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1967 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1968 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1969 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1970 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1971 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1972
1973 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1974 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1975 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1976
1977 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1978 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1979 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1980 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1981 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1982
1983 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1984 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1985 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1986 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1987 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1988 // + vpblendvb
1989 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1990 // + vpblendvb
1991 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1992 // + vpblendvb
1993
1994 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1995 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1996 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1997 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1998 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1999 // + vpblendvb
2000 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
2001 // + vpblendvb
2002 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
2003 // + vpblendvb
2004 };
2005
2006 if (ST->hasAVX2())
2007 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2008 return LT.first * Entry->Cost;
2009
2010 static const CostTblEntry XOPShuffleTbl[] = {
2011 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
2012 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
2013 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
2014 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
2015 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
2016 // + vinsertf128
2017 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
2018 // + vinsertf128
2019
2020 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
2021 // + vinsertf128
2022 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
2023 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
2024 // + vinsertf128
2025 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
2026 };
2027
2028 if (ST->hasXOP())
2029 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2030 return LT.first * Entry->Cost;
2031
2032 static const CostTblEntry AVX1InLaneShuffleTbl[] = {
2033 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermilpd
2034 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermilpd
2035 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermilps
2036 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermilps
2037
2038 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2039 // + vpor + vinsertf128
2040 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2041 // + vpor + vinsertf128
2042 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2043 // + vpor + vinsertf128
2044
2045 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
2046 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
2047 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpermilpd + vblendpd
2048 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpermilps + vblendps
2049 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 4*pshufb
2050 // + 2*vpor + vinsertf128
2051 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 9}, // 2*vextractf128 + 4*pshufb
2052 // + 2*vpor + vinsertf128
2053 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 4*pshufb
2054 // + 2*vpor + vinsertf128
2055 };
2056
2057 if (IsInLaneShuffle && ST->hasAVX())
2058 if (const auto *Entry =
2059 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2060 return LT.first * Entry->Cost;
2061
2062 static const CostTblEntry AVX1ShuffleTbl[] = {
2063 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2064 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2065 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2066 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2067 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
2068 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
2069 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
2070
2071 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2072 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2073 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2074 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2075 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2076 // + vinsertf128
2077 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2078 // + vinsertf128
2079 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2080 // + vinsertf128
2081
2082 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
2083 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
2084 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2085 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2086 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2087 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2088 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2089
2090 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2091 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2092 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2093 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2094 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2095 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2096 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2097
2098 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2099 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2100 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2101 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2102 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2103 // + 2*por + vinsertf128
2104 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2105 // + 2*por + vinsertf128
2106 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2107 // + 2*por + vinsertf128
2108
2109 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2110 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2111 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2112 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2113 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2114 // + 4*por + vinsertf128
2115 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2116 // + 4*por + vinsertf128
2117 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2118 // + 4*por + vinsertf128
2119 };
2120
2121 if (ST->hasAVX())
2122 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2123 return LT.first * Entry->Cost;
2124
2125 static const CostTblEntry SSE41ShuffleTbl[] = {
2126 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2127 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2128 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2129 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2130 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2131 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2132 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2133 };
2134
2135 if (ST->hasSSE41())
2136 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2137 return LT.first * Entry->Cost;
2138
2139 static const CostTblEntry SSSE3ShuffleTbl[] = {
2140 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2141 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2142 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2143
2144 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2145 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2146 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2147
2148 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2149 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2150 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2151
2152 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2153 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2154 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2155 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2156 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2157
2158 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2159 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2160 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2161
2162 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2163 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2164 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2165 };
2166
2167 if (ST->hasSSSE3())
2168 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2169 return LT.first * Entry->Cost;
2170
2171 static const CostTblEntry SSE2ShuffleTbl[] = {
2172 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2173 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2174 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2175 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2176 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2177 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2178
2179 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2180 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2181 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2182 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2183 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2184 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2185 // + 2*pshufd + 2*unpck + packus
2186
2187 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2188 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2189 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2190 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2191 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2192 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2193
2194 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2195 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2196 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2197 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2198 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2199 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2200
2201 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2202 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2203 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2204 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2205 // + pshufd/unpck
2206 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2207 // + pshufd/unpck
2208 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
2209 // + 2*pshufd + 2*unpck + 2*packus
2210
2211 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd
2212 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd
2213 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2214 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute
2215 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 8}, // blend+permute
2216 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
2217 };
2218
2219 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2220 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2221 };
2222
2223 if (ST->hasSSE2()) {
2224 bool IsLoad =
2225 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2226 if (ST->hasSSE3() && IsLoad)
2227 if (const auto *Entry =
2228 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2229 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2230 LT.second.getVectorElementCount()) &&
2231 "Table entry missing from isLegalBroadcastLoad()");
2232 return LT.first * Entry->Cost;
2233 }
2234
2235 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2236 return LT.first * Entry->Cost;
2237 }
2238
2239 static const CostTblEntry SSE1ShuffleTbl[] = {
2240 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2241 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2242 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2243 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2244 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2245 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2246 };
2247
2248 if (ST->hasSSE1()) {
2249 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2250 // SHUFPS: each pair of output elements must come from a single source register.
2251 auto MatchSHUFPS = [](int X, int Y) {
2252 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2253 };
2254 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2255 return 1;
2256 }
2257 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2258 return LT.first * Entry->Cost;
2259 }
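// Editor's note: examples for the SHUFPS matcher above. shufps builds each
// half of its result from a single input, so each pair of mask elements must
// agree on which input it reads (bit 2 of the element index for a two-input
// v4f32 mask); negative (undef/poison) elements match either input.
constexpr bool matchSHUFPS(int X, int Y) {
  return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
}
static_assert(matchSHUFPS(0, 3) && matchSHUFPS(4, 6), "pairs from one input");
static_assert(matchSHUFPS(-1, 5), "poison matches either input");
static_assert(!matchSHUFPS(1, 5), "pair straddles both inputs");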
2260
2261 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2262}
2263
2264 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2265 Type *Src,
2266 TTI::CastContextHint CCH,
2267 TTI::TargetCostKind CostKind,
2268 const Instruction *I) {
2269 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2270 assert(ISD && "Invalid opcode");
2271
2272 // The cost tables include both specific, custom (non-legal) src/dst type
2273 // conversions and generic, legalized types. We test for customs first, before
2274 // falling back to legalization.
2275 // FIXME: Need a better design for the cost table to handle non-simple
2276 // types and the potentially massive combinations (elem_num x src_type x dst_type).
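// Editor's note: a minimal sketch (hypothetical names) of the two-phase
// strategy described above: probe the tables first with the original src/dst
// type pair so the custom, pre-legalization entries can match, and only if
// nothing hits, legalize both types and probe again with the legalized pair.
#include <cstddef>
#include <optional>

struct ConvRow { int ISD; int DstVT; int SrcVT; int Cost; };

template <std::size_t N>
std::optional<int> lookupConv(const ConvRow (&Tbl)[N], int ISD, int Dst,
                              int Src) {
  for (const ConvRow &R : Tbl)
    if (R.ISD == ISD && R.DstVT == Dst && R.SrcVT == Src)
      return R.Cost;
  return std::nullopt; // caller falls back to the legalized-type probe
}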
2277 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2278 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2279 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2280
2281 // Mask sign extend has an instruction.
2282 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2283 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2284 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2285 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2286 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2287 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2288 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2289 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2290 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2291 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2292 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2293 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2294 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2295 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2296 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2297 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2298 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2299
2300 // Mask zero extend is a sext + shift.
2301 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2302 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2303 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2304 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2305 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2306 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2307 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2308 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2309 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2310 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2311 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2312 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2313 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2314 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2315 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2316 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2317 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2318
2319 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2320 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2321 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2322 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2323 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2324 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2325 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2326 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2327 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2328 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2329 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2330 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2331 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2332 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2333 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2334 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2335 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2336
2337 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2338 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2339 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2340 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2341 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2342 };
2343
2344 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2345 // Mask sign extend has an instruction.
2346 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2348 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2349 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2350 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2351 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2352 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2353 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2354
2355 // Mask zero extend is a sext + shift.
2356 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2357 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2358 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2359 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2360 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2361 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2362 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2363 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2364
2365 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2366 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2367 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2368 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2369 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2373
2374 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2375 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2376
2377 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2378 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2379
2380 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2381 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2382
2383 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2384 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2385 };
2386
2387 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2388 // 256-bit wide vectors.
2389
2390 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2391 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2392 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2393 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2394 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2395 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2396 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2397 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2398
2399 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2400 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2401 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2402 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2403 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2404 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2405 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2406 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2407 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2408 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2409 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2410 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2411 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2412 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2413 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2414 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2415 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2416 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2417 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2418 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2419 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2420 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2421 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2422 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2423 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2424 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2425 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2426 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2427 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2428 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2429 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2430 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2431 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2432 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2433
2434 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2435 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2436 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2437
2438 // Sign extend is zmm vpternlogd+vptruncdb.
2439 // Zero extend is zmm broadcast load+vptruncdw.
2440 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2441 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2442 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2443 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2444 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2445 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2446 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2447 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2448
2449 // Sign extend is zmm vpternlogd+vptruncdw.
2450 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2451 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2452 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2453 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2454 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2455 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2456 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2457 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2459
2460 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2461 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2462 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2463 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2464 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2465 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2466 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2467 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2468 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2469 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2470
2471 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2472 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2473 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2474 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2475
2476 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2477 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2478 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2479 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2480 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2481 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2482 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2483 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2484 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2485 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2486
2487 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2488 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2489
2490 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2491 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2492 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2493 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2494 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2495 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2496 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2497 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2498
2499 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2500 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2501 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2502 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2503 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2504 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2505 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2506 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2507 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2508 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2509
2510 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2511 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2512 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2513 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2514 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2515 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2516 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2517 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2518 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2519 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2520 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2521
2522 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2523 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2524 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2525 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2526 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2527 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2528 };
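 // Illustrative reading of the entries above (a sketch, not upstream code):
 // each 4-tuple holds the cost for the four TargetCostKinds (throughput,
 // latency, code size, size-and-latency). For example, with plain AVX512F
 //   %t = trunc <8 x i64> %v to <8 x i32>
 // hits the v8i32 <- v8i64 TRUNCATE entry and costs 1 for every kind,
 // matching the single vpmovqd it lowers to.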
2529
2530 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2531 // Mask sign extend has an instruction.
2532 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2533 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2535 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2537 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2538 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2539 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2540 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2541 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2542 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2543 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2544 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2545 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2546 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2547 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2548 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2549
2550 // Mask zero extend is a sext + shift.
2551 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2552 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2553 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2555 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2556 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2557 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2558 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2559 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2560 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2561 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2562 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2563 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2564 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2565 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2566 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2568
2569 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2570 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2571 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2572 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2573 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2574 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2575 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2576 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2577 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2578 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2579 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2580 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2581 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2582 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2583 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2584 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2585 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2586
2587 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2588 };
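 // Sketch of the mask-extend pattern costed above (assumed lowering,
 // mirroring the table comments): with AVX512BW+VL,
 //   %s = sext <16 x i1> %k to <16 x i8>
 // is a single vpmovm2b (cost 1), while the matching zext is modeled as that
 // sext plus a shift to clear all but the low bit (cost 2).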
2589
2590 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2591 // Mask sign extend has an instruction.
2592 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2593 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2594 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2595 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2596 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2597 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2598 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2599 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2600
2601 // Mask zero extend is a sext + shift.
2602 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2610
2611 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2612 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2613 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2614 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2615 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2619
2620 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2621 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2622 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2623 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2624
2625 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2626 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2627 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2628 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2629
2630 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2631 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2632 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2633 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2634
2635 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2636 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2637 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2638 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2639 };
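 // Sketch of why these entries are all cost 1 (hedged, based on the ISA):
 // AVX512DQ+VL adds direct quadword converts such as vcvtqq2pd, so e.g.
 //   %f = sitofp <2 x i64> %v to <2 x double>
 // needs no multi-instruction expansion at 128/256-bit widths.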
2640
2641 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2642 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2643 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2644 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2645 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2646 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2647 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2648 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2649 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2650 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2651 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2652 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2653 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2654 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2655 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2656 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2657 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2658 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2659 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2660
2661 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2662 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2663 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2664 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2665 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2666 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2667 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2668 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2669 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2670 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2671
2672 // sign extend is vpcmpeq+maskedmove+vpmovdw
2673 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2674 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2675 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2676 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2677 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2678 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2679 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2680 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2681 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2682
2683 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2684 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2685 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2686 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2687 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2688 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2689 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2690 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2691
2692 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2693 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2694 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2695 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2696
2697 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2698 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2699 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2700 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2701 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2702 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2703 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2704 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2705 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2706 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2707 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2708 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2709
2710 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2711 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2712 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2713 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2714
2715 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2716 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2717 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2718 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2719 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2720 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2721 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2722 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2723 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2724 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2725 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2726 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2727 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2728
2729 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2730 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2731 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2732
2733 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2734 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2735 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2736 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2737 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2738 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2739 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2740 };
2741
2742 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2747 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2748 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2749
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2752 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2753 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2754 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2755 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2756 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2757 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2758 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2759 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2760 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2761 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2762 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2763 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2764
2765 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2766
2767 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2768 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2769 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2770 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2771 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2772 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2773 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2774 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2775 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2776 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2777 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2778 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2779
2780 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2781 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2782
2783 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2784 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2785 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2786 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2787
2788 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2789 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2790 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2791 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2792 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2793 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2794 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2795 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2796
2797 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2798 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2799 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2800 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2801 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2802 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2803 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2804
2805 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2806 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2807 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2808 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2809 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2810 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2811 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2812 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2813 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2814 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2815 };
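 // Worked example (a sketch): the FP_EXTEND v8f64 <- v8f32 entry above costs
 // 3 because the result needs two 256-bit registers; roughly,
 //   %e = fpext <8 x float> %v to <8 x double>
 // lowers to two vcvtps2pd plus an extract of the upper 128-bit half.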
2816
2817 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2818 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2819 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2820 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2821 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2822 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2823 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2824
2825 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2826 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2827 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2828 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2829 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2830 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2831 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2832 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2833 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2834 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2835 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2836 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2837
2838 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2839 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2840 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2841 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2842 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2843
2844 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2845 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2846 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2847 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2848 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2849 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2850 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2851 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2852
2853 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2854 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2855 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2856 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2857 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2858 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2859 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2860 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2861 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2862 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2863 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2864 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2865
2866 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2867 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2868 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2869 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2870 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2871 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2872 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2873 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2874 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2875 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2876 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2877 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2878 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2879 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2880 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2881 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2882 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2883
2884 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2885 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2886 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2887 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2888 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2889 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2890 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2891 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2892 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2893 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2894 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2895
2896 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2897 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2898 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2899 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2900 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2901 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2902 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2903 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2904 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2905 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2906 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2907 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2908 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2909
2910 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2911 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2912 };
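 // Sketch: AVX1 has no 256-bit integer units, so an extend such as
 //   %e = sext <8 x i16> %v to <8 x i32>
 // is modeled as two 128-bit pmovsxwd ops plus a vinsertf128 to recombine,
 // hence the cost of 3 in the table above.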
2913
2914 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2915 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2916 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2917 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2918 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2919 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2920 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2921 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2922 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2923 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2924 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2925 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2926 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2927
2928 // These truncates end up widening elements.
2929 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2930 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2931 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2932
2933 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2934 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2935 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2936
2937 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2938 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2939 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2940 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2941 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2942 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2943 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2944 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2945 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2946 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2947 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2948
2949 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2950 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2951 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2952 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2953 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2954 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2955 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2956 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2957 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2958 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2959 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2960 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2961 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2962 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2963
2964 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2965 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2966 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2967 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2968 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2969 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2970 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2971 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2972 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2973 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2974
2975 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2976 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2977 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2978 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2979 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2980 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2981 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2982 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2983 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2984 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2985 };
2986
2987 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2988 // These are somewhat magic numbers justified by comparing the
2989 // output of llvm-mca for our various supported scheduler models
2990 // and basing them on the worst case scenario.
2991 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2992 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2993 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2994 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2995 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2996 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2997 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2998 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2999 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3000 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3001 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3002 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3003
3004 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3005 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3006 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3007 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3008 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3009 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3010 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3011 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3012 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3013 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3014 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3015 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3016 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3017
3018 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3019 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3021 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3022 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3023 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3024 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3025 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3026 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3028
3029 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3030 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3031 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3032 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3033 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3034 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3035 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3036 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3037 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3038 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3039
3040 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3041 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3042 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3043 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3044 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3045 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3046 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3047 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3048 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3049 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3050 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3051 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3052
3053 // These truncates are really widening elements.
3054 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3055 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3056 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3057 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3058 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3059 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3060
3061 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3062 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3063 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3064 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3065 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3066 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3067 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3068 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3069 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3070 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3071 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3072 };
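 // Worked example (sketch): the TRUNCATE v16i8 <- v8i16 entry costs 2 per
 // its PAND+PACKUSWB comment, so
 //   %t = trunc <8 x i16> %v to <8 x i8>
 // masks each word to its low byte and packs; the wider sources scale the
 // same pattern up.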
3073
3074 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3075 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3076 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3077 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3078 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3079 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3080 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3081 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3082 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3083 };
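 // Sketch: with F16C the scalar conversions above are single instructions,
 //   %e = fpext half %h to float     ; vcvtph2ps, cost 1
 //   %r = fptrunc float %f to half   ; vcvtps2ph, cost 1
 // while the f16 -> f64 entries chain vcvtph2ps+vcvtps2pd as noted.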
3084
3085 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3086 EVT SrcTy = TLI->getValueType(DL, Src);
3087 EVT DstTy = TLI->getValueType(DL, Dst);
3088
3089 // The function getSimpleVT only handles simple value types.
3090 if (SrcTy.isSimple() && DstTy.isSimple()) {
3091 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3092 MVT SimpleDstTy = DstTy.getSimpleVT();
3093
3094 if (ST->useAVX512Regs()) {
3095 if (ST->hasBWI())
3096 if (const auto *Entry = ConvertCostTableLookup(
3097 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3098 if (auto KindCost = Entry->Cost[CostKind])
3099 return *KindCost;
3100
3101 if (ST->hasDQI())
3102 if (const auto *Entry = ConvertCostTableLookup(
3103 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3104 if (auto KindCost = Entry->Cost[CostKind])
3105 return *KindCost;
3106
3107 if (ST->hasAVX512())
3108 if (const auto *Entry = ConvertCostTableLookup(
3109 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3110 if (auto KindCost = Entry->Cost[CostKind])
3111 return *KindCost;
3112 }
3113
3114 if (ST->hasBWI())
3115 if (const auto *Entry = ConvertCostTableLookup(
3116 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3117 if (auto KindCost = Entry->Cost[CostKind])
3118 return *KindCost;
3119
3120 if (ST->hasDQI())
3121 if (const auto *Entry = ConvertCostTableLookup(
3122 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3123 if (auto KindCost = Entry->Cost[CostKind])
3124 return *KindCost;
3125
3126 if (ST->hasAVX512())
3127 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3128 SimpleDstTy, SimpleSrcTy))
3129 if (auto KindCost = Entry->Cost[CostKind])
3130 return *KindCost;
3131
3132 if (ST->hasAVX2()) {
3133 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3134 SimpleDstTy, SimpleSrcTy))
3135 if (auto KindCost = Entry->Cost[CostKind])
3136 return *KindCost;
3137 }
3138
3139 if (ST->hasAVX()) {
3140 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3141 SimpleDstTy, SimpleSrcTy))
3142 if (auto KindCost = Entry->Cost[CostKind])
3143 return *KindCost;
3144 }
3145
3146 if (ST->hasF16C()) {
3147 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3148 SimpleDstTy, SimpleSrcTy))
3149 if (auto KindCost = Entry->Cost[CostKind])
3150 return *KindCost;
3151 }
3152
3153 if (ST->hasSSE41()) {
3154 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3155 SimpleDstTy, SimpleSrcTy))
3156 if (auto KindCost = Entry->Cost[CostKind])
3157 return *KindCost;
3158 }
3159
3160 if (ST->hasSSE2()) {
3161 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3162 SimpleDstTy, SimpleSrcTy))
3163 if (auto KindCost = Entry->Cost[CostKind])
3164 return *KindCost;
3165 }
3166
3167 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3168 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3169 // fp16 conversions not covered by any table entries require a libcall.
3170 // Return a large (arbitrary) number to model this.
3171 return InstructionCost(64);
3172 }
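 // Hedged illustration: without F16C or AVX512, a cast such as
 //   %e = fpext half %h to float
 // reaches this point and lowers to a compiler-rt libcall (e.g.
 // __extendhfsf2), which the deliberately large cost of 64 approximates.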
3173 }
3174
3175 // Fall back to legalized types.
3176 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3177 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3178
3179 // If we're truncating to the same legalized type, just assume it's free.
3180 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3181 return TTI::TCC_Free;
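 // Sketch of the free-truncate case (illustrative types): both sides can
 // legalize to the same MVT, e.g. <4 x i31> promotes to <4 x i32>, so
 //   %t = trunc <4 x i32> %v to <4 x i31>
 // needs no instructions and is reported as free.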
3182
3183 if (ST->useAVX512Regs()) {
3184 if (ST->hasBWI())
3185 if (const auto *Entry = ConvertCostTableLookup(
3186 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3187 if (auto KindCost = Entry->Cost[CostKind])
3188 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3189
3190 if (ST->hasDQI())
3191 if (const auto *Entry = ConvertCostTableLookup(
3192 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3193 if (auto KindCost = Entry->Cost[CostKind])
3194 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3195
3196 if (ST->hasAVX512())
3197 if (const auto *Entry = ConvertCostTableLookup(
3198 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3201 }
3202
3203 if (ST->hasBWI())
3204 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3205 LTDest.second, LTSrc.second))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3208
3209 if (ST->hasDQI())
3210 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3211 LTDest.second, LTSrc.second))
3212 if (auto KindCost = Entry->Cost[CostKind])
3213 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3214
3215 if (ST->hasAVX512())
3216 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3217 LTDest.second, LTSrc.second))
3218 if (auto KindCost = Entry->Cost[CostKind])
3219 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3220
3221 if (ST->hasAVX2())
3222 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3223 LTDest.second, LTSrc.second))
3224 if (auto KindCost = Entry->Cost[CostKind])
3225 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3226
3227 if (ST->hasAVX())
3228 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3229 LTDest.second, LTSrc.second))
3230 if (auto KindCost = Entry->Cost[CostKind])
3231 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3232
3233 if (ST->hasF16C()) {
3234 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3235 LTDest.second, LTSrc.second))
3236 if (auto KindCost = Entry->Cost[CostKind])
3237 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3238 }
3239
3240 if (ST->hasSSE41())
3241 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3242 LTDest.second, LTSrc.second))
3243 if (auto KindCost = Entry->Cost[CostKind])
3244 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3245
3246 if (ST->hasSSE2())
3247 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3248 LTDest.second, LTSrc.second))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3251
3252 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source
3253 // to i32 and then use sitofp.
3254 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3255 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3256 Type *ExtSrc = Src->getWithNewBitWidth(32);
3257 unsigned ExtOpc =
3258 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3259
3260 // For scalar loads the extend would be free.
3261 InstructionCost ExtCost = 0;
3262 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3263 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3264
3265 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3266 TTI::CastContextHint::None, CostKind);
3267 }
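 // Worked example (sketch): costing
 //   %f = uitofp <4 x i16> %v to <4 x float>
 // through this fallback sums the zext <4 x i16> -> <4 x i32> cost with the
 // sitofp <4 x i32> -> <4 x float> cost from the tables above; the extend is
 // treated as free when the source is a scalar integer load.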
3268
3269 // Fallback for fptosi/fptoui i8/i16 cases: convert via fptosi to i32 and
3270 // then truncate.
3271 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3272 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3273 Type *TruncDst = Dst->getWithNewBitWidth(32);
3274 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3275 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3276 TTI::CastContextHint::None, CostKind);
3277 }
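 // Sketch: likewise
 //   %t = fptoui <4 x float> %v to <4 x i8>
 // is modeled as an fptosi to <4 x i32> followed by a trunc
 // <4 x i32> -> <4 x i8>.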
3278
3279 // TODO: Allow non-throughput costs that aren't binary.
3280 auto AdjustCost = [&CostKind](InstructionCost Cost,
3281 InstructionCost N = 1) -> InstructionCost {
3282 if (CostKind != TTI::TCK_RecipThroughput)
3283 return Cost == 0 ? 0 : N;
3284 return Cost * N;
3285 };
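 // Reading of AdjustCost (a sketch): only throughput scales linearly with
 // the legalization count N; for the other cost kinds a nonzero base cost is
 // clamped to N, keeping them effectively binary per the TODO above.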
3286 return AdjustCost(
3287 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3288}
3289
3290 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3291 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3292 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3293 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3294 // Early out if this type isn't scalar/vector integer/float.
3295 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3296 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3297 Op1Info, Op2Info, I);
3298
3299 // Legalize the type.
3300 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3301
3302 MVT MTy = LT.second;
3303
3304 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3305 assert(ISD && "Invalid opcode");
3306
3307 InstructionCost ExtraCost = 0;
3308 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3309 // Some vector comparison predicates cost extra instructions.
3310 // TODO: Adjust ExtraCost based on CostKind?
3311 // TODO: Should we invert this and assume worst case cmp costs
3312 // and reduce for particular predicates?
3313 if (MTy.isVector() &&
3314 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3315 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3316 ST->hasBWI())) {
3317 // Fallback to I if a specific predicate wasn't specified.
3318 CmpInst::Predicate Pred = VecPred;
3319 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3320 Pred == CmpInst::BAD_FCMP_PREDICATE))
3321 Pred = cast<CmpInst>(I)->getPredicate();
3322
3323 bool CmpWithConstant = false;
3324 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3325 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3326
3327 switch (Pred) {
3328 case CmpInst::Predicate::ICMP_NE:
3329 // xor(cmpeq(x,y),-1)
3330 ExtraCost = CmpWithConstant ? 0 : 1;
3331 break;
3332 case CmpInst::Predicate::ICMP_SGE:
3333 case CmpInst::Predicate::ICMP_SLE:
3334 // xor(cmpgt(x,y),-1)
3335 ExtraCost = CmpWithConstant ? 0 : 1;
3336 break;
3337 case CmpInst::Predicate::ICMP_ULT:
3338 case CmpInst::Predicate::ICMP_UGT:
3339 // cmpgt(xor(x,signbit),xor(y,signbit))
3340 // xor(cmpeq(pmaxu(x,y),x),-1)
3341 ExtraCost = CmpWithConstant ? 1 : 2;
3342 break;
3343 case CmpInst::Predicate::ICMP_ULE:
3344 case CmpInst::Predicate::ICMP_UGE:
3345 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3346 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3347 // cmpeq(psubus(x,y),0)
3348 // cmpeq(pminu(x,y),x)
3349 ExtraCost = 1;
3350 } else {
3351 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3352 ExtraCost = CmpWithConstant ? 2 : 3;
3353 }
3354 break;
3355 case CmpInst::Predicate::FCMP_ONE:
3356 case CmpInst::Predicate::FCMP_UEQ:
3357 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3358 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3359 if (CondTy && !ST->hasAVX())
3360 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3361 CmpInst::Predicate::FCMP_UNO, CostKind,
3362 Op1Info, Op2Info) +
3363 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3364 CmpInst::Predicate::FCMP_OEQ, CostKind,
3365 Op1Info, Op2Info) +
3366 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3367
3368 break;
3369 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3370 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3371 // Assume worst case scenario and add the maximum extra cost.
3372 ExtraCost = 3;
3373 break;
3374 default:
3375 break;
3376 }
3377 }
3378 }
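 // Worked example (sketch): on plain SSE2 an unsigned compare like
 //   %c = icmp ugt <16 x i8> %x, %y
 // has no native predicate, so the cmpgt(xor(x,signbit),xor(y,signbit))
 // expansion above adds ExtraCost = 2 on top of the base SETCC cost looked
 // up below.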
3379
3380 static const CostKindTblEntry SLMCostTbl[] = {
3381 // slm pcmpeq/pcmpgt throughput is 2
3382 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3383 // slm pblendvb/blendvpd/blendvps throughput is 4
3384 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3385 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3386 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3387 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3388 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3389 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3390 };
3391
3392 static const CostKindTblEntry AVX512BWCostTbl[] = {
3393 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3394 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3395 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3396 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3397
3398 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3399 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3400 };
3401
3402 static const CostKindTblEntry AVX512CostTbl[] = {
3403 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3404 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3405 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3406 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3407
3408 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3409 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3410 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3411 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3412 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3413 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3414 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3415
3416 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3417 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3418 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3419 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3420 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3421 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3422 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3423 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3424 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3425 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3426 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3427 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3428 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3429 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3430
3431 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3432 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3433 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3434 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3435 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3436 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3437 };
3438
3439 static const CostKindTblEntry AVX2CostTbl[] = {
3440 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3441 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3442 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3443 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3444 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3445 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3446
3447 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3448 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3449 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3450 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3451
3452 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3453 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3454 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3455 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3456 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3457 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3458 };
3459
3460 static const CostKindTblEntry XOPCostTbl[] = {
3461 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3462 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3463 };
3464
3465 static const CostKindTblEntry AVX1CostTbl[] = {
3466 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3467 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3468 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3469 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3470 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3471 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3472
3473 // AVX1 does not support 8-wide integer compare.
3474 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3475 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3476 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3477 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3478
3479 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3480 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3481 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3482 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3483 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3484 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3485 };
3486
3487 static const CostKindTblEntry SSE42CostTbl[] = {
3488 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3489 };
3490
3491 static const CostKindTblEntry SSE41CostTbl[] = {
3492 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3493 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3494
3495 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3496 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3497 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3498 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3499 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3500 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3501 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3502 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3503 };
3504
3505 static const CostKindTblEntry SSE2CostTbl[] = {
3506 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3507 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3508
3509 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3510 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3511 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3512 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3513
3514 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3515 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3516 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3517 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3518 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3519 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3520 };
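 // Sketch: the { 5, 4, 5, 5 } v2i64 SETCC entry reflects that SSE2 lacks
 // 64-bit pcmpeq/pcmpgt, so e.g.
 //   %c = icmp eq <2 x i64> %x, %y
 // expands to a 32-bit pcmpeqd plus shuffle/logic ops to combine the dword
 // results within each lane.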
3521
3522 static const CostKindTblEntry SSE1CostTbl[] = {
3523 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3524 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3525
3526 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3527 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3528 };
3529
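// Consult the tables from the most feature-specific subtarget down to SSE1;
// the first entry that defines a value for the requested cost kind wins.
// The result is scaled by LT.first (the number of legal-type pieces the
// operation was split into), with ExtraCost covering any predicate fixups
// computed earlier in this function.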
3530 if (ST->useSLMArithCosts())
3531 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3532 if (auto KindCost = Entry->Cost[CostKind])
3533 return LT.first * (ExtraCost + *KindCost);
3534
3535 if (ST->hasBWI())
3536 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3537 if (auto KindCost = Entry->Cost[CostKind])
3538 return LT.first * (ExtraCost + *KindCost);
3539
3540 if (ST->hasAVX512())
3541 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3542 if (auto KindCost = Entry->Cost[CostKind])
3543 return LT.first * (ExtraCost + *KindCost);
3544
3545 if (ST->hasAVX2())
3546 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3547 if (auto KindCost = Entry->Cost[CostKind])
3548 return LT.first * (ExtraCost + *KindCost);
3549
3550 if (ST->hasXOP())
3551 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3552 if (auto KindCost = Entry->Cost[CostKind])
3553 return LT.first * (ExtraCost + *KindCost);
3554
3555 if (ST->hasAVX())
3556 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3557 if (auto KindCost = Entry->Cost[CostKind])
3558 return LT.first * (ExtraCost + *KindCost);
3559
3560 if (ST->hasSSE42())
3561 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3562 if (auto KindCost = Entry->Cost[CostKind])
3563 return LT.first * (ExtraCost + *KindCost);
3564
3565 if (ST->hasSSE41())
3566 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3567 if (auto KindCost = Entry->Cost[CostKind])
3568 return LT.first * (ExtraCost + *KindCost);
3569
3570 if (ST->hasSSE2())
3571 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3572 if (auto KindCost = Entry->Cost[CostKind])
3573 return LT.first * (ExtraCost + *KindCost);
3574
3575 if (ST->hasSSE1())
3576 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3577 if (auto KindCost = Entry->Cost[CostKind])
3578 return LT.first * (ExtraCost + *KindCost);
3579
3580 // Assume a 3cy latency for fp select ops.
3581 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3582 if (ValTy->getScalarType()->isFloatingPointTy())
3583 return 3;
3584
3585 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3586 Op1Info, Op2Info, I);
3587}
3588
3589 InstructionCost
3590 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3591 TTI::TargetCostKind CostKind) {
3594 // Costs should match the codegen from:
3595 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3596 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3597 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3598 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3599 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3600
3601 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3602 // specialized in these tables yet.
3603 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
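// VBMI2 provides VPSHLD/VPSHRD(V), so funnel shifts and the word-sized
// rotates below map to a single instruction.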
3604 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3605 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3606 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3607 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3608 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3609 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3610 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3611 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3612 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3613 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3614 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3615 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3616 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3617 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3618 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3619 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3620 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3621 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3622 };
3623 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
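// BITALG provides VPOPCNTB/VPOPCNTW for single-instruction byte/word popcounts.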
3624 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3625 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3626 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3627 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3628 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3629 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3630 };
3631 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
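// VPOPCNTDQ provides VPOPCNTD/VPOPCNTQ for single-instruction dword/qword popcounts.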
3632 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3633 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3634 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3635 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3636 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3637 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3638 };
3639 static const CostKindTblEntry AVX512CDCostTbl[] = {
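// CDI provides VPLZCNTD/VPLZCNTQ; the CTTZ rows price lzcnt of the isolated
// lowest set bit (X & -X).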
3640 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3641 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3642 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3643 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3644 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3645 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3646 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3647 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3648 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3649 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3650 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3651 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3652
3653 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3654 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3655 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3656 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3657 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3658 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3659 };
3660 static const CostKindTblEntry AVX512BWCostTbl[] = {
3661 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3662 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3663 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3664 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3665 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3666 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3667 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3668 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3669 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3670 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3671 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3672 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3673 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3674 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3675 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3676 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3677 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3678 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3679 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3680 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3681 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3682 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3683 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3684 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3685 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3686 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3687 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3688 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3689 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3690 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3691 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3692 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3693 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3694 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3695 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3696 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3697 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3698 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3699 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3700 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3701 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3702 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3703 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3704 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3705 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3706 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3707 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3708 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3709 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3710 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3711 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3712 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3713 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3714 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3715 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3716 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3717 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3718 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3719 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3720 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3721 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3722 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3723 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3724 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3725 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3726 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3727 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3728 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3729 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3730 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3731 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3732 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3733 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3734 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3735 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3736 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3737 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3738 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3739 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3740 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3741 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3742 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3743 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3744 };
3745 static const CostKindTblEntry AVX512CostTbl[] = {
3746 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3747 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3748 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3749 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3750 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3751 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3752 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3753 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3754 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3755 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3756 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3757 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3758 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3759 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3760 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3761 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3762 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3763 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3764 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3765 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3766 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3767 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3768 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3769 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3770 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3771 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3772 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3773 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3774 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3775 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3776 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3777 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3778 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3779 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3780 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3781 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3782 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3783 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3784 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3785 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3786 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3787 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3788 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3789 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3790 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3791 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3792 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3793 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3794 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3795 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3796 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3797 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3798 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3799 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3800 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3801 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3802 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3803 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3804 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3805 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3806 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3807 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3808 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3809 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3810 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3811 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3812 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3813 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3814 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3815 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3816 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3817 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3818 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3819 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3820 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3821 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3822 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3823 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3824 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3825 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3826 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3827 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3828 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3829 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3830 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3831 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3832 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3833 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3834 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3835 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3836 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3837 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3838 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3839 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3840 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3841 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3842 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3843 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3844 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3845 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3846 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3847 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3848 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3849 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3850 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3851 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3852 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3853 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3854 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3855 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3856 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3857 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3858 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3859 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3860 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3861 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3862 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3863 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3864 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3865 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3866 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3867 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3868 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3869 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3870 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3871 };
3872 static const CostKindTblEntry XOPCostTbl[] = {
3873 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3874 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3875 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3876 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3877 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3878 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3879 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3880 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3881 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3882 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3883 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3884 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3885 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
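// The extra SUB to negate the rotation amount is why the ROTR rows cost more
// than their ROTL counterparts.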
3886 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3887 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3888 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3889 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3890 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3891 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3892 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3893 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3894 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3895 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3896 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3897 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3898 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3899 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3900 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3901 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3902 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3903 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3904 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3905 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3906 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3907 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3908 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3909 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3910 };
3911 static const CostKindTblEntry AVX2CostTbl[] = {
3912 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3913 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3914 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3915 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3916 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3917 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3918 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3919 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3920 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3921 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3922 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3923 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3924 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3925 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3926 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3927 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3928 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3929 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3930 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3931 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3932 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3933 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3934 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3935 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3936 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3937 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3938 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3939 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3940 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3941 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3942 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3943 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3944 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3945 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3946 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3947 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3948 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3949 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3950 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3951 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3952 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3953 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3954 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3955 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3956 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3957 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3958 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3959 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3960 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3961 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3962 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3963 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3964 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3965 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3966 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3967 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3968 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3969 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3970 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3971 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3972 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3973 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3974 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3975 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3976 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3977 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3978 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3979 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3980 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3981 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3982 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3983 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3984 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3985 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3986 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3987 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3988 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3989 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3990 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3991 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3992 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3993 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3994 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3995 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3996 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3997 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3998 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3999 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4000 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4001 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4002 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4003 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4004 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4005 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4006 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4007 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4008 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4009 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4010 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4011 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4012 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4013 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4014 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4015 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4016 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4017 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4018 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4019 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4020 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4021 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4022 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4023 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4024 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4025 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4026 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4027 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4028 };
4029 static const CostKindTblEntry AVX1CostTbl[] = {
4030 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4031 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4032 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4033 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4034 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4035 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4036 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4037 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4038 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4039 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4040 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4041 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4042 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4043 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4044 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4045 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4046 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4047 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4048 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4049 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4050 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4051 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4052 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4053 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4054 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4055 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4056 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4057 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4058 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4059 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4060 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4061 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4062 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4063 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4064 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4065 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4066 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4067 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4068 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4069 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4070 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4071 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4072 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4073 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4074 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4075 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4076 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4077 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4079 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4081 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4083 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4084 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4088 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4089 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4090 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4091 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4092 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4093 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4094 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4095 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4096 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4097 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4099 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4101 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4103 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4105 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4107 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4109 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4111 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4112 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4113 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4116 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4117 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4118 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4119 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4120 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4121 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4122 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4123 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4124 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4125 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4127 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4129 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4130 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4131 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4132 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4133 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4134 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4135 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4136 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4137 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4138 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4139 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4140 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4141 };
4142 static const CostKindTblEntry GFNICostTbl[] = {
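// A single GF2P8AFFINEQB with a constant bit-permutation matrix reverses the
// bits within each byte; wider elements add a byte shuffle, and scalar types
// add a GPR<->XMM round trip.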
4143 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4144 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4145 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4146 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4147 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4148 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4149 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4150 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4151 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4152 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4153 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4154 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4155 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4156 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4157 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4158 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4159 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4160 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4161 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4162 };
4163 static const CostKindTblEntry GLMCostTbl[] = {
4164 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4165 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4166 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4167 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4168 };
4169 static const CostKindTblEntry SLMCostTbl[] = {
4170 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4171 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4172 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4173 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4174 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4175 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4176 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4177 };
4178 static const CostKindTblEntry SSE42CostTbl[] = {
4179 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4180 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4181 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4182 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4183 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4184 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4185 };
4186 static const CostKindTblEntry SSE41CostTbl[] = {
4187 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4188 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4189 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4190 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4191 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4192 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4193 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4194 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4195 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4196 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4197 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4198 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4199 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4200 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4201 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4202 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4203 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4204 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4205 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4206 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4207 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4208 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4209 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4210 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4211 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4212 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4213 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4214 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4215 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4216 };
4217 static const CostKindTblEntry SSSE3CostTbl[] = {
4218 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4219 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4220 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4221 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4222 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4223 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4224 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4225 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4226 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4227 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4228 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4229 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4230 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4231 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4232 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4233 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4234 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4235 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4236 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4237 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4238 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4239 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4240 };
4241 static const CostKindTblEntry SSE2CostTbl[] = {
4242 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4243 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4244 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4245 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4246 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4247 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4248 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4249 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4250 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4251 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4252 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4253 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4254 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4255 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4256 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4257 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4258 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4259 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4260 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4261 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4262 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4263 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4264 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4265 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4266 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4267 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4268 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4269 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4270 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4271 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4272 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4273 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4274 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4275 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4276 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4277 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4278 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4279 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4280 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4281 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4282 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4283 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4284 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4285 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4286 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4287 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4288 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4289 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4290 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4291 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4292 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4293 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4294 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4295 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4296 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4297 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4298 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4299 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4300 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4301 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4302 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4303 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4304 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4305 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4306 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4307 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4308 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4309 };
4310 static const CostKindTblEntry SSE1CostTbl[] = {
4311 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4312 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4313 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4314 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4315 };
4316 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
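// BMI TZCNT is defined for a zero input, so no zero check is needed.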
4317 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4318 };
4319 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4320 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4321 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4322 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4323 };
4324 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
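// LZCNT is likewise defined for a zero input, unlike BSR.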
4325 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4326 };
4327 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4328 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4329 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4330 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4331 };
4332 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4333 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4334 };
4335 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4336 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4337 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4338 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4339 };
4340 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4341 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4342 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4343 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4344 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4345 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4346 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4347 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4348 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4349 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4350 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4351 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4352 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4353 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4354 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4355 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4356 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4357 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4358 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4359 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4360 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4361 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4362 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4363 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4364 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4365 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4366 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4367 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4368 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4369 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4370 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4371 };
4372 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4373 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4374 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4375 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4376 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4377 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4378 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4379 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4380 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4381 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4382 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4383 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4384 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4385 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4386 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4387 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4388 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4389 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4390 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4391 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4392 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4393 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4394 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4395 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4396 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4397 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4398 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4399 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4400 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4401 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4402 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4403 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4404 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4405 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4406 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4407 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4408 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4409 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4410 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4411 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4412 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4413 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4414 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4415 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4416 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4417 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4418 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4419 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4420 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4421 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4422 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4423 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4424 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4425 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4426 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4427 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4428 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4429 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4430 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4431 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4432 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4433 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4434 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4435 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4436 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4437 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4438 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4439 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4440 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4441 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4442 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4443 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4444 };
4445
4446 Type *RetTy = ICA.getReturnType();
4447 Type *OpTy = RetTy;
4448 Intrinsic::ID IID = ICA.getID();
4449 unsigned ISD = ISD::DELETED_NODE;
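// Map the intrinsic to the ISD opcode used as the cost-table key; if ISD is
// still DELETED_NODE afterwards there is no specialized table and we fall
// back to the base implementation.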
4450 switch (IID) {
4451 default:
4452 break;
4453 case Intrinsic::abs:
4454 ISD = ISD::ABS;
4455 break;
4456 case Intrinsic::bitreverse:
4457 ISD = ISD::BITREVERSE;
4458 break;
4459 case Intrinsic::bswap:
4460 ISD = ISD::BSWAP;
4461 break;
4462 case Intrinsic::ctlz:
4463 ISD = ISD::CTLZ;
4464 break;
4465 case Intrinsic::ctpop:
4466 ISD = ISD::CTPOP;
4467 break;
4468 case Intrinsic::cttz:
4469 ISD = ISD::CTTZ;
4470 break;
4471 case Intrinsic::fshl:
4472 ISD = ISD::FSHL;
4473 if (!ICA.isTypeBasedOnly()) {
4474 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4475 if (Args[0] == Args[1]) {
4476 ISD = ISD::ROTL;
4477 // Handle uniform constant rotation amounts.
4478 // TODO: Handle funnel-shift cases.
4479 const APInt *Amt;
4480 if (Args[2] &&
4481 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4482 ISD = X86ISD::VROTLI;
4483 }
4484 }
4485 break;
4486 case Intrinsic::fshr:
4487 // FSHR has same costs so don't duplicate.
4488 ISD = ISD::FSHL;
4489 if (!ICA.isTypeBasedOnly()) {
4490 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4491 if (Args[0] == Args[1]) {
4492 ISD = ISD::ROTR;
4493 // Handle uniform constant rotation amount.
4494 // TODO: Handle funnel-shift cases.
4495 const APInt *Amt;
4496 if (Args[2] &&
4497 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4498 ISD = X86ISD::VROTLI;
4499 }
4500 }
4501 break;
4502 case Intrinsic::lrint:
4503 case Intrinsic::llrint: {
4504 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4505 // have the same costs as the CVTTP2SI (fptosi) instructions
4506 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4507 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4508 TTI::CastContextHint::None, CostKind);
4509 }
4510 case Intrinsic::maxnum:
4511 case Intrinsic::minnum:
4512 // FMINNUM has same costs so don't duplicate.
4513 ISD = ISD::FMAXNUM;
4514 break;
4515 case Intrinsic::sadd_sat:
4516 ISD = ISD::SADDSAT;
4517 break;
4518 case Intrinsic::smax:
4519 ISD = ISD::SMAX;
4520 break;
4521 case Intrinsic::smin:
4522 ISD = ISD::SMIN;
4523 break;
4524 case Intrinsic::ssub_sat:
4525 ISD = ISD::SSUBSAT;
4526 break;
4527 case Intrinsic::uadd_sat:
4528 ISD = ISD::UADDSAT;
4529 break;
4530 case Intrinsic::umax:
4531 ISD = ISD::UMAX;
4532 break;
4533 case Intrinsic::umin:
4534 ISD = ISD::UMIN;
4535 break;
4536 case Intrinsic::usub_sat:
4537 ISD = ISD::USUBSAT;
4538 break;
4539 case Intrinsic::sqrt:
4540 ISD = ISD::FSQRT;
4541 break;
4542 case Intrinsic::sadd_with_overflow:
4543 case Intrinsic::ssub_with_overflow:
4544 // SSUBO has same costs so don't duplicate.
4545 ISD = ISD::SADDO;
4546 OpTy = RetTy->getContainedType(0);
4547 break;
4548 case Intrinsic::uadd_with_overflow:
4549 case Intrinsic::usub_with_overflow:
4550 // USUBO has same costs so don't duplicate.
4551 ISD = ISD::UADDO;
4552 OpTy = RetTy->getContainedType(0);
4553 break;
4554 case Intrinsic::smul_with_overflow:
4555 ISD = ISD::SMULO;
4556 OpTy = RetTy->getContainedType(0);
4557 break;
4558 case Intrinsic::umul_with_overflow:
4559 ISD = ISD::UMULO;
4560 OpTy = RetTy->getContainedType(0);
4561 break;
4562 }
4563
4564 if (ISD != ISD::DELETED_NODE) {
4565 auto adjustTableCost = [&](int ISD, unsigned Cost,
4566 std::pair<InstructionCost, MVT> LT,
4567 FastMathFlags FMF) {
4568 InstructionCost LegalizationCost = LT.first;
4569 MVT MTy = LT.second;
4570
4571 // If there are no NANs to deal with, then these are reduced to a
4572 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4573 // assume is used in the non-fast case.
4574 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4575 if (FMF.noNaNs())
4576 return LegalizationCost * 1;
4577 }
4578
4579 // For cases where some ops can be folded into a load/store, assume free.
4580 if (MTy.isScalarInteger()) {
4581 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4582 if (const Instruction *II = ICA.getInst()) {
4583 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4584 return TTI::TCC_Free;
4585 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4586 if (LI->hasOneUse())
4587 return TTI::TCC_Free;
4588 }
4589 }
4590 }
4591 }
4592
4593 return LegalizationCost * (int)Cost;
4594 };
4595
4596 // Legalize the type.
4597 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4598 MVT MTy = LT.second;
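// LT.first counts the legal-type pieces: e.g. a v8i32 ctpop on an SSE2-only
// target splits into two v4i32 ops, so the matching table cost is doubled.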
4599
4600 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4601 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4602 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4603 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
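// The second operand of cttz/ctlz is the zero-is-poison flag; a constant
// true (an all-ones i1) lets us cost the cheaper *_ZERO_UNDEF form instead.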
4604 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4605 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4606 if (Cst->isAllOnesValue())
4607 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4608 }
4609
4610 // FSQRT is a single instruction.
4611 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4612 return LT.first;
4613
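// As with getCmpSelInstrCost above, consult the most feature-specific tables
// first; the first hit for the requested cost kind wins.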
4614 if (ST->useGLMDivSqrtCosts())
4615 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4616 if (auto KindCost = Entry->Cost[CostKind])
4617 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4618
4619 if (ST->useSLMArithCosts())
4620 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4621 if (auto KindCost = Entry->Cost[CostKind])
4622 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4623
4624 if (ST->hasVBMI2())
4625 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4626 if (auto KindCost = Entry->Cost[CostKind])
4627 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4628
4629 if (ST->hasBITALG())
4630 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4631 if (auto KindCost = Entry->Cost[CostKind])
4632 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4633
4634 if (ST->hasVPOPCNTDQ())
4635 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4636 if (auto KindCost = Entry->Cost[CostKind])
4637 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4638
4639 if (ST->hasGFNI())
4640 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4641 if (auto KindCost = Entry->Cost[CostKind])
4642 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4643
4644 if (ST->hasCDI())
4645 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4646 if (auto KindCost = Entry->Cost[CostKind])
4647 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4648
4649 if (ST->hasBWI())
4650 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4651 if (auto KindCost = Entry->Cost[CostKind])
4652 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4653
4654 if (ST->hasAVX512())
4655 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4656 if (auto KindCost = Entry->Cost[CostKind])
4657 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4658
4659 if (ST->hasXOP())
4660 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4661 if (auto KindCost = Entry->Cost[CostKind])
4662 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4663
4664 if (ST->hasAVX2())
4665 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4666 if (auto KindCost = Entry->Cost[CostKind])
4667 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4668
4669 if (ST->hasAVX())
4670 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4671 if (auto KindCost = Entry->Cost[CostKind])
4672 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4673
4674 if (ST->hasSSE42())
4675 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4676 if (auto KindCost = Entry->Cost[CostKind])
4677 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4678
4679 if (ST->hasSSE41())
4680 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4681 if (auto KindCost = Entry->Cost[CostKind])
4682 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4683
4684 if (ST->hasSSSE3())
4685 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4686 if (auto KindCost = Entry->Cost[CostKind])
4687 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688
4689 if (ST->hasSSE2())
4690 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4691 if (auto KindCost = Entry->Cost[CostKind])
4692 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4693
4694 if (ST->hasSSE1())
4695 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4696 if (auto KindCost = Entry->Cost[CostKind])
4697 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4698
4699 if (ST->hasBMI()) {
4700 if (ST->is64Bit())
4701 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4702 if (auto KindCost = Entry->Cost[CostKind])
4703 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4704
4705 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4706 if (auto KindCost = Entry->Cost[CostKind])
4707 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4708 }
4709
4710 if (ST->hasLZCNT()) {
4711 if (ST->is64Bit())
4712 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4713 if (auto KindCost = Entry->Cost[CostKind])
4714 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4715
4716 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4717 if (auto KindCost = Entry->Cost[CostKind])
4718 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4719 }
4720
4721 if (ST->hasPOPCNT()) {
4722 if (ST->is64Bit())
4723 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4724 if (auto KindCost = Entry->Cost[CostKind])
4725 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4726
4727 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4728 if (auto KindCost = Entry->Cost[CostKind])
4729 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4730 }
4731
4732 if (ST->is64Bit())
4733 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4738 if (auto KindCost = Entry->Cost[CostKind])
4739 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4740
4741 // Without arg data, we need to compute the expanded costs of custom lowered
4742 // intrinsics to prevent use of the (very low) default costs.
4743 if (ICA.isTypeBasedOnly() &&
4744 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4745 Type *CondTy = RetTy->getWithNewBitWidth(1);
4746 InstructionCost Cost = 0;
4747 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4748 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4749 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4750 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4751 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4752 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4753 CmpInst::ICMP_EQ, CostKind);
4754 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4755 CmpInst::ICMP_EQ, CostKind);
4756 return Cost;
4757 }
4758 }
4759
4760 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4761}
4762
4763InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4764 TTI::TargetCostKind CostKind,
4765 unsigned Index, Value *Op0,
4766 Value *Op1) {
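 // Element extraction is notably slow on Silvermont-class cores; these costs
 // are applied below when useSLMArithCosts() is set.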
4767 static const CostTblEntry SLMCostTbl[] = {
4768 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4769 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4770 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4771 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4772 };
4773
4774 assert(Val->isVectorTy() && "This must be a vector type");
4775 Type *ScalarType = Val->getScalarType();
4776 InstructionCost RegisterFileMoveCost = 0;
4777
4778 // Non-immediate extraction/insertion can be handled as a sequence of
4779 // aliased loads+stores via the stack.
4780 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4781 Opcode == Instruction::InsertElement)) {
4782 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4783 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4784
4785 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4786 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4787 Align VecAlign = DL.getPrefTypeAlign(Val);
4788 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4789
4790 // Extract - store vector to stack, load scalar.
4791 if (Opcode == Instruction::ExtractElement) {
4792 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4793 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4794 CostKind);
4795 }
4796 // Insert - store vector to stack, store scalar, load vector.
4797 if (Opcode == Instruction::InsertElement) {
4798 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4799 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4800 CostKind) +
4801 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4802 }
4803 }
4804
4805 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4806 Opcode == Instruction::InsertElement)) {
4807 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4808 if (Opcode == Instruction::ExtractElement &&
4809 ScalarType->getScalarSizeInBits() == 1 &&
4810 cast<FixedVectorType>(Val)->getNumElements() > 1)
4811 return 1;
4812
4813 // Legalize the type.
4814 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4815
4816 // This type is legalized to a scalar type.
4817 if (!LT.second.isVector())
4818 return TTI::TCC_Free;
4819
4820 // The type may be split. Normalize the index to the new type.
4821 unsigned SizeInBits = LT.second.getSizeInBits();
4822 unsigned NumElts = LT.second.getVectorNumElements();
4823 unsigned SubNumElts = NumElts;
4824 Index = Index % NumElts;
4825
4826 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4827 // For inserts, we also need to insert the subvector back.
4828 if (SizeInBits > 128) {
4829 assert((SizeInBits % 128) == 0 && "Illegal vector");
4830 unsigned NumSubVecs = SizeInBits / 128;
4831 SubNumElts = NumElts / NumSubVecs;
4832 if (SubNumElts <= Index) {
4833 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4834 Index %= SubNumElts;
4835 }
4836 }
4837
4838 MVT MScalarTy = LT.second.getScalarType();
4839 auto IsCheapPInsrPExtrInsertPS = [&]() {
4840 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4841 // Inserting f32 into index0 is just movss.
4842 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4843 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4844 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4845 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4846 Opcode == Instruction::InsertElement) ||
4847 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4848 Opcode == Instruction::InsertElement);
4849 };
4850
4851 if (Index == 0) {
4852 // Floating point scalars are already located in index #0.
4853 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4854 // that is true for all of them.
4855 if (ScalarType->isFloatingPointTy() &&
4856 (Opcode != Instruction::InsertElement || !Op0 ||
4857 isa<UndefValue>(Op0)))
4858 return RegisterFileMoveCost;
4859
4860 if (Opcode == Instruction::InsertElement &&
4861 isa_and_nonnull<UndefValue>(Op0)) {
4862 // Consider the gather cost to be cheap.
4863 if (isa_and_nonnull<LoadInst>(Op1))
4864 return RegisterFileMoveCost;
4865 if (!IsCheapPInsrPExtrInsertPS()) {
4866 // mov constant-to-GPR + movd/movq GPR -> XMM.
4867 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4868 return 2 + RegisterFileMoveCost;
4869 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4870 return 1 + RegisterFileMoveCost;
4871 }
4872 }
4873
4874 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4875 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4876 return 1 + RegisterFileMoveCost;
4877 }
4878
4879 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4880 assert(ISD && "Unexpected vector opcode");
4881 if (ST->useSLMArithCosts())
4882 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4883 return Entry->Cost + RegisterFileMoveCost;
4884
4885 // Consider cheap cases.
4886 if (IsCheapPInsrPExtrInsertPS())
4887 return 1 + RegisterFileMoveCost;
4888
4889 // For extractions we just need to shuffle the element to index 0, which
4890 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4891 // the elements to their destination. In both cases we must handle the
4892 // subvector move(s).
4893 // If the vector type is already less than 128-bits then don't reduce it.
4894 // TODO: Under what circumstances should we shuffle using the full width?
4895 InstructionCost ShuffleCost = 1;
4896 if (Opcode == Instruction::InsertElement) {
4897 auto *SubTy = cast<VectorType>(Val);
4898 EVT VT = TLI->getValueType(DL, Val);
4899 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4900 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4901 ShuffleCost =
4902 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4903 }
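 // Integer elements need an extra GPR <-> XMM transfer that floating-point
 // elements, which already live in XMM registers, avoid.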
4904 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4905 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4906 }
4907
4908 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4909 RegisterFileMoveCost;
4910}
4911
4912InstructionCost X86TTIImpl::getScalarizationOverhead(
4913 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4914 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
4915 assert(DemandedElts.getBitWidth() ==
4916 cast<FixedVectorType>(Ty)->getNumElements() &&
4917 "Vector size mismatch");
4918
4919 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4920 MVT MScalarTy = LT.second.getScalarType();
4921 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4922 InstructionCost Cost = 0;
4923
4924 constexpr unsigned LaneBitWidth = 128;
4925 assert((LegalVectorBitWidth < LaneBitWidth ||
4926 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4927 "Illegal vector");
4928
4929 const int NumLegalVectors = *LT.first.getValue();
4930 assert(NumLegalVectors >= 0 && "Negative cost!");
4931
4932 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4933 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4934 if (Insert) {
4935 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4936 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4937 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4938 // For types we can insert directly, insertion into 128-bit subvectors is
4939 // cheap, followed by a cheap chain of concatenations.
4940 if (LegalVectorBitWidth <= LaneBitWidth) {
4941 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4942 /*Extract*/ false, CostKind);
4943 } else {
4944 // In each 128-bit lane, if at least one index is demanded but not all
4945 // indices are demanded and this 128-bit lane is not the first 128-bit
4946 // lane of the legalized vector, then this lane needs an extracti128; if
4947 // in each 128-bit lane there is at least one demanded index, this lane
4948 // needs an inserti128.
4949
4950 // The following cases will help you build a better understanding:
4951 // Assume we insert several elements into a v8i32 vector in avx2,
4952 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4953 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4954 // inserti128.
4955 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4956 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4957 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4958 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4959 unsigned NumLegalElts =
4960 LT.second.getVectorNumElements() * NumLegalVectors;
4961 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4962 "Vector has been legalized to smaller element count");
4963 assert((NumLegalElts % NumLanesTotal) == 0 &&
4964 "Unexpected elts per lane");
4965 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4966
4967 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4968 auto *LaneTy =
4969 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4970
4971 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4972 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4973 NumEltsPerLane, NumEltsPerLane * I);
4974 if (LaneEltMask.isZero())
4975 continue;
4976 // FIXME: we don't need to extract if all non-demanded elements
4977 // are legalization-inserted padding.
4978 if (!LaneEltMask.isAllOnes())
4979 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4980 I * NumEltsPerLane, LaneTy);
4981 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4982 /*Extract*/ false, CostKind);
4983 }
4984
4985 APInt AffectedLanes =
4986 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4987 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4988 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4989 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4990 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4991 unsigned I = NumLegalLanes * LegalVec + Lane;
4992 // No need to insert unaffected lane; or lane 0 of each legal vector
4993 // iff ALL lanes of that vector were affected and will be inserted.
4994 if (!AffectedLanes[I] ||
4995 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4996 continue;
4997 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
4998 I * NumEltsPerLane, LaneTy);
4999 }
5000 }
5001 }
5002 } else if (LT.second.isVector()) {
5003 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5004 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5005 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5006 // considered cheap.
5007 if (Ty->isIntOrIntVectorTy())
5008 Cost += DemandedElts.popcount();
5009
5010 // Get the smaller of the legalized or original pow2-extended number of
5011 // vector elements, which represents the number of unpacks we'll end up
5012 // performing.
5013 unsigned NumElts = LT.second.getVectorNumElements();
5014 unsigned Pow2Elts =
5015 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5016 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5017 }
5018 }
5019
5020 if (Extract) {
5021 // vXi1 can be efficiently extracted with MOVMSK.
5022 // TODO: AVX512 predicate mask handling.
5023 // NOTE: This doesn't work well for roundtrip scalarization.
5024 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5025 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5026 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
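 // Each (v)pmovmskb covers up to 16 (SSE2) or 32 (AVX2) i1 elements, so
 // round up to the number of mask-extraction ops needed.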
5027 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5028 return MOVMSKCost;
5029 }
5030
5031 if (LT.second.isVector()) {
5032 unsigned NumLegalElts =
5033 LT.second.getVectorNumElements() * NumLegalVectors;
5034 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5035 "Vector has been legalized to smaller element count");
5036
5037 // If we're extracting elements from a 128-bit subvector lane,
5038 // we only need to extract each lane once, not for every element.
5039 if (LegalVectorBitWidth > LaneBitWidth) {
5040 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5041 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5042 assert((NumLegalElts % NumLanesTotal) == 0 &&
5043 "Unexpected elts per lane");
5044 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5045
5046 // Add cost for each demanded 128-bit subvector extraction.
5047 // Luckily this is a lot easier than for insertion.
5048 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5049 auto *LaneTy =
5050 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5051
5052 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5053 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5054 NumEltsPerLane, I * NumEltsPerLane);
5055 if (LaneEltMask.isZero())
5056 continue;
5057 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5058 I * NumEltsPerLane, LaneTy);
5059 Cost += BaseT::getScalarizationOverhead(
5060 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5061 }
5062
5063 return Cost;
5064 }
5065 }
5066
5067 // Fallback to default extraction.
5068 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5069 Extract, CostKind);
5070 }
5071
5072 return Cost;
5073}
5074
5075InstructionCost
5076X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5077 int VF, const APInt &DemandedDstElts,
5078 TTI::TargetCostKind CostKind) {
5079 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5080 // We don't differentiate element types here, only element bit width.
5081 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5082
5083 auto bailout = [&]() {
5084 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5085 DemandedDstElts, CostKind);
5086 };
5087
5088 // For now, only deal with AVX512 cases.
5089 if (!ST->hasAVX512())
5090 return bailout();
5091
5092 // Do we have a native shuffle for this element type, or should we promote?
5093 unsigned PromEltTyBits = EltTyBits;
5094 switch (EltTyBits) {
5095 case 32:
5096 case 64:
5097 break; // AVX512F.
5098 case 16:
5099 if (!ST->hasBWI())
5100 PromEltTyBits = 32; // promote to i32, AVX512F.
5101 break; // AVX512BW
5102 case 8:
5103 if (!ST->hasVBMI())
5104 PromEltTyBits = 32; // promote to i32, AVX512F.
5105 break; // AVX512VBMI
5106 case 1:
5107 // There is no support for shuffling i1 elements. We *must* promote.
5108 if (ST->hasBWI()) {
5109 if (ST->hasVBMI())
5110 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5111 else
5112 PromEltTyBits = 16; // promote to i16, AVX512BW.
5113 break;
5114 }
5115 PromEltTyBits = 32; // promote to i32, AVX512F.
5116 break;
5117 default:
5118 return bailout();
5119 }
5120 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5121
5122 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5123 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5124
5125 int NumDstElements = VF * ReplicationFactor;
5126 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5127 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5128
5129 // Legalize the types.
5130 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5131 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5132 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5133 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5134 // They should have legalized into vector types.
5135 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5136 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5137 return bailout();
5138
5139 if (PromEltTyBits != EltTyBits) {
5140 // If we have to perform the shuffle with wider elt type than our data type,
5141 // then we will first need to anyext (we don't care about the new bits)
5142 // the source elements, and then truncate Dst elements.
5143 InstructionCost PromotionCost;
5144 PromotionCost += getCastInstrCost(
5145 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5146 TTI::CastContextHint::None, CostKind);
5147 PromotionCost +=
5148 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5149 /*Src=*/PromDstVecTy,
5150 TTI::CastContextHint::None, CostKind);
5151 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5152 ReplicationFactor, VF,
5153 DemandedDstElts, CostKind);
5154 }
5155
5156 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5157 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5158 "We expect that the legalization doesn't affect the element width, "
5159 "doesn't coalesce/split elements.");
5160
5161 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5162 unsigned NumDstVectors =
5163 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5164
5165 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5166
5167 // Not all the produced Dst elements may be demanded. In our case,
5168 // given that a single Dst vector is formed by a single shuffle,
5169 // if all elements that will form a single Dst vector aren't demanded,
5170 // then we won't need to do that shuffle, so adjust the cost accordingly.
5171 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5172 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5173 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5174
5175 InstructionCost SingleShuffleCost = getShuffleCost(
5176 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5177 /*Index=*/0, /*SubTp=*/nullptr);
5178 return NumDstVectorsDemanded * SingleShuffleCost;
5179}
5180
5181InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5182 MaybeAlign Alignment,
5183 unsigned AddressSpace,
5184 TTI::TargetCostKind CostKind,
5185 TTI::OperandValueInfo OpInfo,
5186 const Instruction *I) {
5187 // TODO: Handle other cost kinds.
5188 if (CostKind != TTI::TCK_RecipThroughput) {
5189 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5190 // Store instruction with index and scale costs 2 Uops.
5191 // Check the preceding GEP to identify non-const indices.
5192 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5193 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5194 return TTI::TCC_Basic * 2;
5195 }
5196 }
5197 return TTI::TCC_Basic;
5198 }
5199
5200 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5201 "Invalid Opcode");
5202 // Type legalization can't handle structs
5203 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5204 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5205 CostKind, OpInfo, I);
5206
5207 // Legalize the type.
5208 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5209
5210 auto *VTy = dyn_cast<FixedVectorType>(Src);
5211
5212 InstructionCost Cost = 0;
5213
5214 // Add a cost for constant load to vector.
5215 if (Opcode == Instruction::Store && OpInfo.isConstant())
5216 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5217 /*AddressSpace=*/0, CostKind, OpInfo);
5218
5219 // Handle the simple case of non-vectors.
5220 // NOTE: this assumes that legalization never creates a vector from scalars!
5221 if (!VTy || !LT.second.isVector()) {
5222 // Each load/store unit costs 1.
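 // Only FP scalars keep the constant-materialization cost added above;
 // integer immediates can be stored directly without a constant-pool load.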
5223 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5224 }
5225
5226 bool IsLoad = Opcode == Instruction::Load;
5227
5228 Type *EltTy = VTy->getElementType();
5229
5230 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5231
5232 // Source of truth: how many elements were there in the original IR vector?
5233 const unsigned SrcNumElt = VTy->getNumElements();
5234
5235 // How far have we gotten?
5236 int NumEltRemaining = SrcNumElt;
5237 // Note that we intentionally capture by reference, as NumEltRemaining changes.
5238 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5239
5240 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5241
5242 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5243 const unsigned XMMBits = 128;
5244 if (XMMBits % EltTyBits != 0)
5245 // Vector size must be a multiple of the element size. I.e. no padding.
5246 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5247 CostKind, OpInfo, I);
5248 const int NumEltPerXMM = XMMBits / EltTyBits;
5249
5250 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5251
5252 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5253 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5254 // How many elements would a single op deal with at once?
5255 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5256 // Vector size must be a multiple of the element size. I.e. no padding.
5257 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5258 CostKind, OpInfo, I);
5259 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5260
5261 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5262 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5263 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5264 "Unless we haven't halved the op size yet, "
5265 "we have less than two op's sized units of work left.");
5266
5267 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5268 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5269 : XMMVecTy;
5270
5271 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5272 "After halving sizes, the vector elt count is no longer a multiple "
5273 "of number of elements per operation?");
5274 auto *CoalescedVecTy =
5275 CurrNumEltPerOp == 1
5276 ? CurrVecTy
5277 : FixedVectorType::get(
5278 IntegerType::get(Src->getContext(),
5279 EltTyBits * CurrNumEltPerOp),
5280 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5281 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5282 DL.getTypeSizeInBits(CurrVecTy) &&
5283 "coalesciing elements doesn't change vector width.");
5284
5285 while (NumEltRemaining > 0) {
5286 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5287
5288 // Can we use this vector size, as per the remaining element count?
5289 // Iff the vector is naturally aligned, we can do a wide load regardless.
5290 if (NumEltRemaining < CurrNumEltPerOp &&
5291 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5292 CurrOpSizeBytes != 1)
5293 break; // Try smaller vector size.
5294
5295 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5296 // as a proxy for a double-pumped AVX memory interface such as on
5297 // Sandybridge.
5298 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5299 // will be scalarized.
5300 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5301 Cost += 2;
5302 else if (CurrOpSizeBytes < 4)
5303 Cost += 2;
5304 else
5305 Cost += 1;
5306
5307 // If we're loading a uniform value, then we don't need to split the load,
5308 // loading just a single (widest) vector can be reused by all splits.
5309 if (IsLoad && OpInfo.isUniform())
5310 return Cost;
5311
5312 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5313
5314 // If we have fully processed the previous reg, we need to replenish it.
5315 if (SubVecEltsLeft == 0) {
5316 SubVecEltsLeft += CurrVecTy->getNumElements();
5317 // And that's free only for the 0'th subvector of a legalized vector.
5318 if (!Is0thSubVec)
5321 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5322 }
5323
5324 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5325 // for smaller widths (32/16/8) we have to insert/extract them separately.
5326 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5327 // but let's pretend that it is also true for 16/8 bit wide ops...)
5328 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5329 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5330 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5331 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5332 APInt DemandedElts =
5333 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5334 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5335 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5336 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5337 !IsLoad, CostKind);
5338 }
5339
5340 SubVecEltsLeft -= CurrNumEltPerOp;
5341 NumEltRemaining -= CurrNumEltPerOp;
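 // The next op starts CurrOpSizeBytes further in, so the best alignment we
 // can still assume is the common alignment of the two.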
5342 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5343 }
5344 }
5345
5346 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5347
5348 return Cost;
5349}
5350
5351InstructionCost
5352X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5353 unsigned AddressSpace,
5354 TTI::TargetCostKind CostKind) {
5355 bool IsLoad = (Instruction::Load == Opcode);
5356 bool IsStore = (Instruction::Store == Opcode);
5357
5358 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5359 if (!SrcVTy)
5360 // To compute the scalar cost, take the regular memory cost without a mask.
5361 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5362
5363 unsigned NumElem = SrcVTy->getNumElements();
5364 auto *MaskTy =
5365 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5366 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5367 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5368 // Scalarization
5369 APInt DemandedElts = APInt::getAllOnes(NumElem);
5370 InstructionCost MaskSplitCost = getScalarizationOverhead(
5371 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5372 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5373 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5374 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5375 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5376 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5377 InstructionCost ValueSplitCost = getScalarizationOverhead(
5378 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5379 InstructionCost MemopCost =
5380 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5381 Alignment, AddressSpace, CostKind);
5382 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5383 }
5384
5385 // Legalize the type.
5386 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5387 auto VT = TLI->getValueType(DL, SrcVTy);
5388 InstructionCost Cost = 0;
5389 MVT Ty = LT.second;
5390 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5391 // APX masked load/store for scalar is cheap.
5392 return Cost + LT.first;
5393
5394 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5395 LT.second.getVectorNumElements() == NumElem)
5396 // Promotion requires extend/truncate for data and a shuffle for mask.
5397 Cost +=
5398 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0,
5399 nullptr) +
5400 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5401
5402 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5403 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5404 Ty.getVectorNumElements());
5405 // Expanding requires filling the mask with zeroes.
5406 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5407 MaskTy);
5408 }
5409
5410 // Pre-AVX512 - each maskmov load costs ~2; each maskmov store costs ~8.
5411 if (!ST->hasAVX512())
5412 return Cost + LT.first * (IsLoad ? 2 : 8);
5413
5414 // AVX-512 masked load/store is cheaper
5415 return Cost + LT.first;
5416}
5417
5418InstructionCost
5419X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5420 const Value *Base,
5421 const TTI::PointersChainInfo &Info,
5422 Type *AccessTy, TTI::TargetCostKind CostKind) {
5423 if (Info.isSameBase() && Info.isKnownStride()) {
5424 // If all the pointers have known stride, all the differences are translated
5425 // into constants. X86 memory addressing allows encoding it into
5426 // displacement. So we just need to take the base GEP cost.
5427 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5428 SmallVector<const Value *> Indices(BaseGEP->indices());
5429 return getGEPCost(BaseGEP->getSourceElementType(),
5430 BaseGEP->getPointerOperand(), Indices, nullptr,
5431 CostKind);
5432 }
5433 return TTI::TCC_Free;
5434 }
5435 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5436}
5437
5438InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5439 ScalarEvolution *SE,
5440 const SCEV *Ptr) {
5441 // Address computations in vectorized code with non-consecutive addresses will
5442 // likely result in more instructions compared to scalar code where the
5443 // computation can more often be merged into the index mode. The resulting
5444 // extra micro-ops can significantly decrease throughput.
5445 const unsigned NumVectorInstToHideOverhead = 10;
5446
5447 // Cost modeling of Strided Access Computation is hidden by the indexing
5448 // modes of X86 regardless of the stride value. We don't believe that there
5449 // is a difference between constant strided access in general and a constant
5450 // stride value which is less than or equal to 64.
5451 // Even in the case of (loop invariant) stride whose value is not known at
5452 // compile time, the address computation will not incur more than one extra
5453 // ADD instruction.
5454 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5455 // TODO: AVX2 is the current cut-off because we don't have correct
5456 // interleaving costs for prior ISA's.
5457 if (!BaseT::isStridedAccess(Ptr))
5458 return NumVectorInstToHideOverhead;
5459 if (!BaseT::getConstantStrideStep(SE, Ptr))
5460 return 1;
5461 }
5462
5463 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5464}
5465
5466InstructionCost
5467X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5468 std::optional<FastMathFlags> FMF,
5469 TTI::TargetCostKind CostKind) {
5470 if (TTI::requiresOrderedReduction(FMF))
5471 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5472
5473 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5474 // throughput and use that as the cost.
5475
5476 static const CostTblEntry SLMCostTbl[] = {
5477 { ISD::FADD, MVT::v2f64, 3 },
5478 { ISD::ADD, MVT::v2i64, 5 },
5479 };
5480
5481 static const CostTblEntry SSE2CostTbl[] = {
5482 { ISD::FADD, MVT::v2f64, 2 },
5483 { ISD::FADD, MVT::v2f32, 2 },
5484 { ISD::FADD, MVT::v4f32, 4 },
5485 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5486 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5487 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5488 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5489 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5490 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5491 { ISD::ADD, MVT::v2i8, 2 },
5492 { ISD::ADD, MVT::v4i8, 2 },
5493 { ISD::ADD, MVT::v8i8, 2 },
5494 { ISD::ADD, MVT::v16i8, 3 },
5495 };
5496
5497 static const CostTblEntry AVX1CostTbl[] = {
5498 { ISD::FADD, MVT::v4f64, 3 },
5499 { ISD::FADD, MVT::v4f32, 3 },
5500 { ISD::FADD, MVT::v8f32, 4 },
5501 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5502 { ISD::ADD, MVT::v4i64, 3 },
5503 { ISD::ADD, MVT::v8i32, 5 },
5504 { ISD::ADD, MVT::v16i16, 5 },
5505 { ISD::ADD, MVT::v32i8, 4 },
5506 };
5507
5508 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5509 assert(ISD && "Invalid opcode");
5510
5511 // Before legalizing the type, give a chance to look up illegal narrow types
5512 // in the table.
5513 // FIXME: Is there a better way to do this?
5514 EVT VT = TLI->getValueType(DL, ValTy);
5515 if (VT.isSimple()) {
5516 MVT MTy = VT.getSimpleVT();
5517 if (ST->useSLMArithCosts())
5518 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5519 return Entry->Cost;
5520
5521 if (ST->hasAVX())
5522 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5523 return Entry->Cost;
5524
5525 if (ST->hasSSE2())
5526 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5527 return Entry->Cost;
5528 }
5529
5530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5531
5532 MVT MTy = LT.second;
5533
5534 auto *ValVTy = cast<FixedVectorType>(ValTy);
5535
5536 // Special case: vXi8 mul reductions are performed as vXi16.
5537 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5538 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5539 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5540 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5541 TTI::CastContextHint::None,
5542 CostKind) +
5543 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5544 }
5545
5546 InstructionCost ArithmeticCost = 0;
5547 if (LT.first != 1 && MTy.isVector() &&
5548 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5549 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5550 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5551 MTy.getVectorNumElements());
5552 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5553 ArithmeticCost *= LT.first - 1;
5554 }
5555
5556 if (ST->useSLMArithCosts())
5557 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5558 return ArithmeticCost + Entry->Cost;
5559
5560 if (ST->hasAVX())
5561 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5562 return ArithmeticCost + Entry->Cost;
5563
5564 if (ST->hasSSE2())
5565 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5566 return ArithmeticCost + Entry->Cost;
5567
5568 // FIXME: These assume a naive kshift+binop lowering, which is probably
5569 // conservative in most cases.
5570 static const CostTblEntry AVX512BoolReduction[] = {
5571 { ISD::AND, MVT::v2i1, 3 },
5572 { ISD::AND, MVT::v4i1, 5 },
5573 { ISD::AND, MVT::v8i1, 7 },
5574 { ISD::AND, MVT::v16i1, 9 },
5575 { ISD::AND, MVT::v32i1, 11 },
5576 { ISD::AND, MVT::v64i1, 13 },
5577 { ISD::OR, MVT::v2i1, 3 },
5578 { ISD::OR, MVT::v4i1, 5 },
5579 { ISD::OR, MVT::v8i1, 7 },
5580 { ISD::OR, MVT::v16i1, 9 },
5581 { ISD::OR, MVT::v32i1, 11 },
5582 { ISD::OR, MVT::v64i1, 13 },
5583 };
5584
5585 static const CostTblEntry AVX2BoolReduction[] = {
5586 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5587 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5588 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5589 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5590 };
5591
5592 static const CostTblEntry AVX1BoolReduction[] = {
5593 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5594 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5595 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5596 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5597 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5598 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5599 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5600 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5601 };
5602
5603 static const CostTblEntry SSE2BoolReduction[] = {
5604 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5605 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5606 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5607 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5608 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5609 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5610 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5611 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5612 };
5613
5614 // Handle bool allof/anyof patterns.
5615 if (ValVTy->getElementType()->isIntegerTy(1)) {
5616 InstructionCost ArithmeticCost = 0;
5617 if (LT.first != 1 && MTy.isVector() &&
5618 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5619 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5620 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5621 MTy.getVectorNumElements());
5622 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5623 ArithmeticCost *= LT.first - 1;
5624 }
5625
5626 if (ST->hasAVX512())
5627 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5628 return ArithmeticCost + Entry->Cost;
5629 if (ST->hasAVX2())
5630 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5631 return ArithmeticCost + Entry->Cost;
5632 if (ST->hasAVX())
5633 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5634 return ArithmeticCost + Entry->Cost;
5635 if (ST->hasSSE2())
5636 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5637 return ArithmeticCost + Entry->Cost;
5638
5639 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5640 }
5641
5642 unsigned NumVecElts = ValVTy->getNumElements();
5643 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5644
5645 // Special case power of 2 reductions where the scalar type isn't changed
5646 // by type legalization.
5647 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5648 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5649
5650 InstructionCost ReductionCost = 0;
5651
5652 auto *Ty = ValVTy;
5653 if (LT.first != 1 && MTy.isVector() &&
5654 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5655 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5656 Ty = FixedVectorType::get(ValVTy->getElementType(),
5657 MTy.getVectorNumElements());
5658 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5659 ReductionCost *= LT.first - 1;
5660 NumVecElts = MTy.getVectorNumElements();
5661 }
5662
5663 // Now handle reduction with the legal type, taking into account size changes
5664 // at each level.
5665 while (NumVecElts > 1) {
5666 // Determine the size of the remaining vector we need to reduce.
5667 unsigned Size = NumVecElts * ScalarSize;
5668 NumVecElts /= 2;
5669 // If we're reducing from 256/512 bits, use an extract_subvector.
5670 if (Size > 128) {
5671 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5672 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5673 CostKind, NumVecElts, SubTy);
5674 Ty = SubTy;
5675 } else if (Size == 128) {
5676 // Reducing from 128 bits is a permute of v2f64/v2i64.
5677 FixedVectorType *ShufTy;
5678 if (ValVTy->isFloatingPointTy())
5679 ShufTy =
5680 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5681 else
5682 ShufTy =
5683 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5684 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5685 CostKind, 0, nullptr);
5686 } else if (Size == 64) {
5687 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5688 FixedVectorType *ShufTy;
5689 if (ValVTy->isFloatingPointTy())
5690 ShufTy =
5691 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5692 else
5693 ShufTy =
5694 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5695 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5696 CostKind, 0, nullptr);
5697 } else {
5698 // Reducing from smaller size is a shift by immediate.
5699 auto *ShiftTy = FixedVectorType::get(
5700 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5701 ReductionCost += getArithmeticInstrCost(
5702 Instruction::LShr, ShiftTy, CostKind,
5703 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5704 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5705 }
5706
5707 // Add the arithmetic op for this level.
5708 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5709 }
5710
5711 // Add the final extract element to the cost.
5712 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5713 CostKind, 0, nullptr, nullptr);
5714}
5715
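// Helper: costs one min/max reduction step as a single two-operand intrinsic
// call of the given type.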
5716InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5717 TTI::TargetCostKind CostKind,
5718 FastMathFlags FMF) {
5719 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5720 return getIntrinsicInstrCost(ICA, CostKind);
5721}
5722
5723InstructionCost
5724X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5725 FastMathFlags FMF,
5726 TTI::TargetCostKind CostKind) {
5727 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5728
5729 MVT MTy = LT.second;
5730
5731 int ISD;
5732 if (ValTy->isIntOrIntVectorTy()) {
5733 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5734 : ISD::SMIN;
5735 } else {
5736 assert(ValTy->isFPOrFPVectorTy() &&
5737 "Expected float point or integer vector type.");
5738 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5739 ? ISD::FMINNUM
5740 : ISD::FMINIMUM;
5741 }
5742
5743 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5744 // throughput and use that as the cost.
5745
5746 static const CostTblEntry SSE2CostTbl[] = {
5747 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5748 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5749 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5750 };
5751
5752 static const CostTblEntry SSE41CostTbl[] = {
5753 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5754 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5755 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5756 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5757 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5758 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5759 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5760 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5761 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5762 {ISD::SMIN, MVT::v16i8, 6},
5763 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5764 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5765 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5766 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5767 };
5768
5769 static const CostTblEntry AVX1CostTbl[] = {
5770 {ISD::SMIN, MVT::v16i16, 6},
5771 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5772 {ISD::SMIN, MVT::v32i8, 8},
5773 {ISD::UMIN, MVT::v32i8, 8},
5774 };
5775
5776 static const CostTblEntry AVX512BWCostTbl[] = {
5777 {ISD::SMIN, MVT::v32i16, 8},
5778 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5779 {ISD::SMIN, MVT::v64i8, 10},
5780 {ISD::UMIN, MVT::v64i8, 10},
5781 };
5782
5783 // Before legalizing the type, give a chance to look up illegal narrow types
5784 // in the table.
5785 // FIXME: Is there a better way to do this?
5786 EVT VT = TLI->getValueType(DL, ValTy);
5787 if (VT.isSimple()) {
5788 MVT MTy = VT.getSimpleVT();
5789 if (ST->hasBWI())
5790 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5791 return Entry->Cost;
5792
5793 if (ST->hasAVX())
5794 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5795 return Entry->Cost;
5796
5797 if (ST->hasSSE41())
5798 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5799 return Entry->Cost;
5800
5801 if (ST->hasSSE2())
5802 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5803 return Entry->Cost;
5804 }
5805
5806 auto *ValVTy = cast<FixedVectorType>(ValTy);
5807 unsigned NumVecElts = ValVTy->getNumElements();
5808
5809 auto *Ty = ValVTy;
5810 InstructionCost MinMaxCost = 0;
5811 if (LT.first != 1 && MTy.isVector() &&
5812 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5813 // Type needs to be split. We need LT.first - 1 operations.
5814 Ty = FixedVectorType::get(ValVTy->getElementType(),
5815 MTy.getVectorNumElements());
5816 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5817 MinMaxCost *= LT.first - 1;
5818 NumVecElts = MTy.getVectorNumElements();
5819 }
5820
5821 if (ST->hasBWI())
5822 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5823 return MinMaxCost + Entry->Cost;
5824
5825 if (ST->hasAVX())
5826 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5827 return MinMaxCost + Entry->Cost;
5828
5829 if (ST->hasSSE41())
5830 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5831 return MinMaxCost + Entry->Cost;
5832
5833 if (ST->hasSSE2())
5834 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5835 return MinMaxCost + Entry->Cost;
5836
5837 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5838
5839 // Special case power of 2 reductions where the scalar type isn't changed
5840 // by type legalization.
5841 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5842 ScalarSize != MTy.getScalarSizeInBits())
5843 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5844
5845 // Now handle reduction with the legal type, taking into account size changes
5846 // at each level.
5847 while (NumVecElts > 1) {
5848 // Determine the size of the remaining vector we need to reduce.
5849 unsigned Size = NumVecElts * ScalarSize;
5850 NumVecElts /= 2;
5851 // If we're reducing from 256/512 bits, use an extract_subvector.
5852 if (Size > 128) {
5853 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5854 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5855 NumVecElts, SubTy);
5856 Ty = SubTy;
5857 } else if (Size == 128) {
5858 // Reducing from 128 bits is a permute of v2f64/v2i64.
5859 VectorType *ShufTy;
5860 if (ValTy->isFloatingPointTy())
5861 ShufTy =
5862 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5863 else
5864 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5865 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5866 CostKind, 0, nullptr);
5867 } else if (Size == 64) {
5868 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5869 FixedVectorType *ShufTy;
5870 if (ValTy->isFloatingPointTy())
5871 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5872 else
5873 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5874 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5875 CostKind, 0, nullptr);
5876 } else {
5877 // Reducing from smaller size is a shift by immediate.
5878 auto *ShiftTy = FixedVectorType::get(
5879 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5880 MinMaxCost += getArithmeticInstrCost(
5881 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5882 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5883 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5884 }
5885
5886 // Add the arithmetic op for this level.
5887 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5888 }
5889
5890 // Add the final extract element to the cost.
5891 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5892 CostKind, 0, nullptr, nullptr);
5893}
5894
5895/// Calculate the cost of materializing a 64-bit value. This helper
5896/// method might only calculate a fraction of a larger immediate. Therefore it
5897/// is valid to return a cost of ZERO.
5898InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5899 if (Val == 0)
5900 return TTI::TCC_Free;
5901
5902 if (isInt<32>(Val))
5903 return TTI::TCC_Basic;
5904
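 // Otherwise this requires a full 64-bit (movabs-style) materialization,
 // modeled as two basic instructions.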
5905 return 2 * TTI::TCC_Basic;
5906}
5907
5908InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5909 TTI::TargetCostKind CostKind) {
5910 assert(Ty->isIntegerTy());
5911
5912 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5913 if (BitSize == 0)
5914 return ~0U;
5915
5916 // Never hoist constants larger than 128bit, because this might lead to
5917 // incorrect code generation or assertions in codegen.
5918 // FIXME: Create a cost model for types larger than i128 once the codegen
5919 // issues have been fixed.
5920 if (BitSize > 128)
5921 return TTI::TCC_Free;
5922
5923 if (Imm == 0)
5924 return TTI::TCC_Free;
5925
5926 // Sign-extend all constants to a multiple of 64-bit.
5927 APInt ImmVal = Imm;
5928 if (BitSize % 64 != 0)
5929 ImmVal = Imm.sext(alignTo(BitSize, 64));
5930
5931 // Split the constant into 64-bit chunks and calculate the cost for each
5932 // chunk.
5933 InstructionCost Cost = 0;
5934 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5935 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5936 int64_t Val = Tmp.getSExtValue();
5937 Cost += getIntImmCost(Val);
5938 }
5939 // We need at least one instruction to materialize the constant.
5940 return std::max<InstructionCost>(1, Cost);
5941}
5942
5943InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5944 const APInt &Imm, Type *Ty,
5945 TTI::TargetCostKind CostKind,
5946 Instruction *Inst) {
5947 assert(Ty->isIntegerTy());
5948
5949 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5950 unsigned ImmBitWidth = Imm.getBitWidth();
5951
5952 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5953 // here, so that constant hoisting will ignore this constant.
5954 if (BitSize == 0)
5955 return TTI::TCC_Free;
5956
5957 unsigned ImmIdx = ~0U;
5958 switch (Opcode) {
5959 default:
5960 return TTI::TCC_Free;
5961 case Instruction::GetElementPtr:
5962 // Always hoist the base address of a GetElementPtr. This prevents the
5963 // creation of new constants for every base constant that gets constant
5964 // folded with the offset.
5965 if (Idx == 0)
5966 return 2 * TTI::TCC_Basic;
5967 return TTI::TCC_Free;
5968 case Instruction::Store:
5969 ImmIdx = 0;
5970 break;
5971 case Instruction::ICmp:
5972 // This is an imperfect hack to prevent constant hoisting of
5973 // compares that might be trying to check if a 64-bit value fits in
5974 // 32-bits. The backend can optimize these cases using a right shift by 32.
5975 // Ideally we would check the compare predicate here. There are also other
5976 // similar immediates the backend can use shifts for.
5977 if (Idx == 1 && ImmBitWidth == 64) {
5978 uint64_t ImmVal = Imm.getZExtValue();
5979 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5980 return TTI::TCC_Free;
5981 }
5982 ImmIdx = 1;
5983 break;
5984 case Instruction::And:
5985 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5986 // by using a 32-bit operation with implicit zero extension. Detect such
5987 // immediates here as the normal path expects bit 31 to be sign extended.
5988 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5989 return TTI::TCC_Free;
5990 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
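 // BZHI (BMI2) needs only an 8-bit length control, while BEXTR takes a
 // 16-bit control word, hence the two different immediate costs below.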
5991 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5992 Imm.isMask())
5993 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5994 ImmIdx = 1;
5995 break;
5996 case Instruction::Add:
5997 case Instruction::Sub:
5998 // For add/sub, we can use the opposite instruction for INT32_MIN.
5999 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6000 return TTI::TCC_Free;
6001 ImmIdx = 1;
6002 break;
6003 case Instruction::UDiv:
6004 case Instruction::SDiv:
6005 case Instruction::URem:
6006 case Instruction::SRem:
6007 // Division by constant is typically expanded later into a different
6008 // instruction sequence. This completely changes the constants.
6009 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6010 return TTI::TCC_Free;
6011 case Instruction::Mul:
6012 case Instruction::Or:
6013 case Instruction::Xor:
6014 ImmIdx = 1;
6015 break;
6016 // Always return TCC_Free for the shift value of a shift instruction.
6017 case Instruction::Shl:
6018 case Instruction::LShr:
6019 case Instruction::AShr:
6020 if (Idx == 1)
6021 return TTI::TCC_Free;
6022 break;
6023 case Instruction::Trunc:
6024 case Instruction::ZExt:
6025 case Instruction::SExt:
6026 case Instruction::IntToPtr:
6027 case Instruction::PtrToInt:
6028 case Instruction::BitCast:
6029 case Instruction::PHI:
6030 case Instruction::Call:
6031 case Instruction::Select:
6032 case Instruction::Ret:
6033 case Instruction::Load:
6034 break;
6035 }
6036
6037 if (Idx == ImmIdx) {
6038 uint64_t NumConstants = divideCeil(BitSize, 64);
6039 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6040 return (Cost <= NumConstants * TTI::TCC_Basic)
6041 ? static_cast<int>(TTI::TCC_Free)
6042 : Cost;
6043 }
6044
6045 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6046}
6047
6048InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6049 const APInt &Imm, Type *Ty,
6050 TTI::TargetCostKind CostKind) {
6051 assert(Ty->isIntegerTy());
6052
6053 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6054 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6055 // here, so that constant hoisting will ignore this constant.
6056 if (BitSize == 0)
6057 return TTI::TCC_Free;
6058
6059 switch (IID) {
6060 default:
6061 return TTI::TCC_Free;
6062 case Intrinsic::sadd_with_overflow:
6063 case Intrinsic::uadd_with_overflow:
6064 case Intrinsic::ssub_with_overflow:
6065 case Intrinsic::usub_with_overflow:
6066 case Intrinsic::smul_with_overflow:
6067 case Intrinsic::umul_with_overflow:
6068 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6069 return TTI::TCC_Free;
6070 break;
6071 case Intrinsic::experimental_stackmap:
6072 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6073 return TTI::TCC_Free;
6074 break;
6075 case Intrinsic::experimental_patchpoint_void:
6076 case Intrinsic::experimental_patchpoint:
6077 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6078 return TTI::TCC_Free;
6079 break;
6080 }
6081 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6082}
6083
6084InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6085 TTI::TargetCostKind CostKind,
6086 const Instruction *I) {
6087 if (CostKind != TTI::TCK_RecipThroughput)
6088 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6089 // Branches are assumed to be predicted.
6090 return TTI::TCC_Free;
6091}
6092
6093int X86TTIImpl::getGatherOverhead() const {
6094 // Some CPUs have more overhead for gather. The specified overhead is relative
6095 // to the Load operation. "2" is the number provided by Intel architects. This
6096 // parameter is used for cost estimation of Gather Op and comparison with
6097 // other alternatives.
6098 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6099 // enable gather with a -march.
6100 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6101 return 2;
6102
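 // Otherwise make gathers prohibitively expensive so they get scalarized.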
6103 return 1024;
6104}
6105
6106int X86TTIImpl::getScatterOverhead() const {
6107 if (ST->hasAVX512())
6108 return 2;
6109
6110 return 1024;
6111}
6112
6113// Return an average cost of a Gather / Scatter instruction; may be refined later.
6114InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6115 TTI::TargetCostKind CostKind,
6116 Type *SrcVTy, const Value *Ptr,
6117 Align Alignment,
6118 unsigned AddressSpace) {
6119
6120 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6121 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6122
6123 // Try to reduce index size from 64 bit (default for GEP)
6124 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6125 // operation will use 16 x 64 indices which do not fit in a zmm register and
6126 // need to be split. Also check that the base pointer is the same for all
6127 // lanes, and that there's at most one variable index.
6128 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6129 unsigned IndexSize = DL.getPointerSizeInBits();
6130 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6131 if (IndexSize < 64 || !GEP)
6132 return IndexSize;
6133
6134 unsigned NumOfVarIndices = 0;
6135 const Value *Ptrs = GEP->getPointerOperand();
6136 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6137 return IndexSize;
6138 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6139 if (isa<Constant>(GEP->getOperand(I)))
6140 continue;
6141 Type *IndxTy = GEP->getOperand(I)->getType();
6142 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6143 IndxTy = IndexVTy->getElementType();
6144 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6145 !isa<SExtInst>(GEP->getOperand(I))) ||
6146 ++NumOfVarIndices > 1)
6147 return IndexSize; // 64
6148 }
6149 return (unsigned)32;
6150 };
6151
6152 // Try to reduce IndexSize to 32 bits for vectors of 16 elements.
6153 // By default the IndexSize is equal to pointer size.
6154 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6155 ? getIndexSizeInBits(Ptr, DL)
6156 : DL.getPointerSizeInBits();
6157
6158 auto *IndexVTy = FixedVectorType::get(
6159 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6160 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6161 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6162 InstructionCost::CostType SplitFactor =
6163 *std::max(IdxsLT.first, SrcLT.first).getValue();
6164 if (SplitFactor > 1) {
6165 // Handle splitting of vector of pointers
6166 auto *SplitSrcTy =
6167 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6168 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6169 Alignment, AddressSpace);
6170 }
6171
6172 // If we didn't split, this will be a single gather/scatter instruction.
6173 if (CostKind != TTI::TCK_RecipThroughput)
6174 return 1;
6175
6176 // The gather / scatter cost is given by Intel architects. It is a rough
6177 // number since we are looking at one instruction at a time.
6178 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6179 : getScatterOverhead();
6180 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6181 MaybeAlign(Alignment), AddressSpace,
6182 CostKind);
6183}
6184
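// Illustrative sketch, not part of the LLVM source: why the index-width
// reduction above matters for VF = 16 with AVX-512. Sixteen 64-bit indices
// occupy 16 * 64 = 1024 bits (two zmm registers, so the gather is split),
// while sixteen 32-bit indices occupy 16 * 32 = 512 bits and fit one zmm.
namespace {
constexpr unsigned sketchIndexVectorBits(unsigned VF, unsigned IndexSize) {
  return VF * IndexSize;
}
static_assert(sketchIndexVectorBits(16, 64) == 1024, "needs a split");
static_assert(sketchIndexVectorBits(16, 32) == 512, "fits a single zmm");
} // namespace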
6185/// Calculate the cost of Gather / Scatter operation
6186InstructionCost X86TTIImpl::getGatherScatterOpCost(
6187 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6188 Align Alignment, TTI::TargetCostKind CostKind,
6189 const Instruction *I = nullptr) {
6190 if ((Opcode == Instruction::Load &&
6191 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6192 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6193 Align(Alignment)))) ||
6194 (Opcode == Instruction::Store &&
6195 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6196 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6197 Align(Alignment)))))
6198 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6199 Alignment, CostKind, I);
6200
6201 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6202 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6203 if (!PtrTy && Ptr->getType()->isVectorTy())
6204 PtrTy = dyn_cast<PointerType>(
6205 cast<VectorType>(Ptr->getType())->getElementType());
6206 assert(PtrTy && "Unexpected type for Ptr argument");
6207 unsigned AddressSpace = PtrTy->getAddressSpace();
6208 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6209 AddressSpace);
6210}
6211
6212bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6213 const TargetTransformInfo::LSRCost &C2) {
6214 // X86-specific here: instruction count takes first priority.
6215 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6216 C1.NumIVMuls, C1.NumBaseAdds,
6217 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6218 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6219 C2.NumIVMuls, C2.NumBaseAdds,
6220 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6221}
6222
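// Illustrative sketch, not part of the LLVM source (values are hypothetical):
// std::tie above yields a lexicographic "instruction count first" order, so a
// formula with fewer instructions wins even if it needs more registers.
//   C1 = {Insns = 3, NumRegs = 5, ...}
//   C2 = {Insns = 4, NumRegs = 2, ...}
//   isLSRCostLess(C1, C2) == true, because 3 < 4; NumRegs and the remaining
//   fields are only consulted to break ties.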
6223bool X86TTIImpl::canMacroFuseCmp() {
6224 return ST->hasMacroFusion() || ST->hasBranchFusion();
6225}
6226
6227bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6228 Type *ScalarTy = DataTy->getScalarType();
6229
6230 // The backend can't handle a single element vector w/o CFCMOV.
6231 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6232 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6233
6234 if (!ST->hasAVX())
6235 return false;
6236
6237 if (ScalarTy->isPointerTy())
6238 return true;
6239
6240 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6241 return true;
6242
6243 if (ScalarTy->isHalfTy() && ST->hasBWI())
6244 return true;
6245
6246 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6247 return true;
6248
6249 if (!ScalarTy->isIntegerTy())
6250 return false;
6251
6252 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6253 return IntWidth == 32 || IntWidth == 64 ||
6254 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6255}
6256
6257bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6258 return isLegalMaskedLoad(DataType, Alignment);
6259}
6260
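// Illustrative examples, not part of the LLVM source, of how the rules above
// classify a few (hypothetical) types:
//   <8 x float>  -> legal with AVX
//   <4 x i64>    -> legal with AVX (i32/i64 elements only require AVX)
//   <16 x i16>   -> additionally requires AVX512BW (BWI)
//   <8 x bfloat> -> requires BF16 support
//   <1 x i64>    -> only with CF and conditional load/store for the type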
6261bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6262 unsigned DataSize = DL.getTypeStoreSize(DataType);
6263 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6264 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6265 // (the equivalent stores only require AVX).
6266 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6267 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6268
6269 return false;
6270}
6271
6272bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6273 unsigned DataSize = DL.getTypeStoreSize(DataType);
6274
6275 // SSE4A supports nontemporal stores of float and double at arbitrary
6276 // alignment.
6277 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6278 return true;
6279
6280 // Besides the SSE4A subtarget exception above, only aligned stores are
6281 // available nontemporally on any other subtarget. And only stores with a size
6282 // of 4..32 bytes (powers of 2, only) are permitted.
6283 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6284 !isPowerOf2_32(DataSize))
6285 return false;
6286
6287 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6288 // loads require AVX2).
6289 if (DataSize == 32)
6290 return ST->hasAVX();
6291 if (DataSize == 16)
6292 return ST->hasSSE1();
6293 return true;
6294}
6295
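// Illustrative walk-through, not part of the LLVM source, of the checks
// above on a few hypothetical cases:
//   double with SSE4A, any alignment          -> legal (MOVNTSD)
//   <8 x float>, 32 bytes, align >= 32, AVX   -> legal (VMOVNTPS ymm)
//   <4 x i32>, 16 bytes, align >= 16, SSE1+   -> legal (MOVNTPS/MOVNTDQ)
//   <4 x i32>, align 8                        -> rejected (underaligned)
//   24-byte type                              -> rejected (not a power of 2)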
6296bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6297 ElementCount NumElements) const {
6298 // movddup
6299 return ST->hasSSE3() && !NumElements.isScalable() &&
6300 NumElements.getFixedValue() == 2 &&
6301 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6302}
6303
6304bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6305 if (!isa<VectorType>(DataTy))
6306 return false;
6307
6308 if (!ST->hasAVX512())
6309 return false;
6310
6311 // The backend can't handle a single element vector.
6312 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6313 return false;
6314
6315 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6316
6317 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6318 return true;
6319
6320 if (!ScalarTy->isIntegerTy())
6321 return false;
6322
6323 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6324 return IntWidth == 32 || IntWidth == 64 ||
6325 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6326}
6327
6328bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6329 return isLegalMaskedExpandLoad(DataTy, Alignment);
6330}
6331
6332bool X86TTIImpl::supportsGather() const {
6333 // Some CPUs have better gather performance than others.
6334 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6335 // enable gather with a -march.
6336 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6337}
6338
6339bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6340 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6341 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6342 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6343 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6344 // Check, maybe the gather/scatter instruction is better in the VariableMask
6345 // case.
6346 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6347 return NumElts == 1 ||
6348 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6349}
6350
6351bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6352 Type *ScalarTy = DataTy->getScalarType();
6353 if (ScalarTy->isPointerTy())
6354 return true;
6355
6356 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6357 return true;
6358
6359 if (!ScalarTy->isIntegerTy())
6360 return false;
6361
6362 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6363 return IntWidth == 32 || IntWidth == 64;
6364}
6365
6366bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6367 if (!supportsGather() || !ST->preferGather())
6368 return false;
6369 return isLegalMaskedGatherScatter(DataTy, Alignment);
6370}
6371
6372bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6373 unsigned Opcode1,
6374 const SmallBitVector &OpcodeMask) const {
6375 // ADDSUBPS 4xf32 SSE3
6376 // VADDSUBPS 4xf32 AVX
6377 // VADDSUBPS 8xf32 AVX2
6378 // ADDSUBPD 2xf64 SSE3
6379 // VADDSUBPD 2xf64 AVX
6380 // VADDSUBPD 4xf64 AVX2
6381
6382 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6383 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6384 if (!isPowerOf2_32(NumElements))
6385 return false;
6386 // Check the opcode pattern. We apply the mask on the opcode arguments and
6387 // then check if it is what we expect.
6388 for (int Lane : seq<int>(0, NumElements)) {
6389 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6390 // We expect FSub for even lanes and FAdd for odd lanes.
6391 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6392 return false;
6393 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6394 return false;
6395 }
6396 // Now check that the pattern is supported by the target ISA.
6397 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6398 if (ElemTy->isFloatTy())
6399 return ST->hasSSE3() && NumElements % 4 == 0;
6400 if (ElemTy->isDoubleTy())
6401 return ST->hasSSE3() && NumElements % 2 == 0;
6402 return false;
6403}
6404
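// Illustrative sketch, not part of the LLVM source: an OpcodeMask that
// matches the pattern above for a hypothetical v4f32 group with
// Opcode0 = FSub and Opcode1 = FAdd. OpcodeMask = {0, 1, 0, 1} selects
//   lane 0: a0 - b0, lane 1: a1 + b1, lane 2: a2 - b2, lane 3: a3 + b3
// which is exactly what (V)ADDSUBPS computes; an all-add or all-sub mask is
// rejected by the lane check.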
6405bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6406 // AVX2 doesn't support scatter
6407 if (!ST->hasAVX512() || !ST->preferScatter())
6408 return false;
6409 return isLegalMaskedGatherScatter(DataType, Alignment);
6410}
6411
6412bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6413 EVT VT = TLI->getValueType(DL, DataType);
6414 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6415}
6416
6417bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6418 // FDIV is always expensive, even if it has a very low uop count.
6419 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6420 if (I->getOpcode() == Instruction::FDiv)
6421 return true;
6422
6423 return BaseT::isExpensiveToSpeculativelyExecute(I);
6424}
6425
6426bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6427 return false;
6428}
6429
6430bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6431 const Function *Callee) const {
6432 const TargetMachine &TM = getTLI()->getTargetMachine();
6433
6434 // Work this as a subsetting of subtarget features.
6435 const FeatureBitset &CallerBits =
6436 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6437 const FeatureBitset &CalleeBits =
6438 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6439
6440 // Check whether features are the same (apart from the ignore list).
6441 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6442 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6443 if (RealCallerBits == RealCalleeBits)
6444 return true;
6445
6446 // If the features are a subset, we need to additionally check for calls
6447 // that may become ABI-incompatible as a result of inlining.
6448 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6449 return false;
6450
6451 for (const Instruction &I : instructions(Callee)) {
6452 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6453 // Having more target features is fine for inline ASM.
6454 if (CB->isInlineAsm())
6455 continue;
6456
6457 SmallVector<Type *, 8> Types;
6458 for (Value *Arg : CB->args())
6459 Types.push_back(Arg->getType());
6460 if (!CB->getType()->isVoidTy())
6461 Types.push_back(CB->getType());
6462
6463 // Simple types are always ABI compatible.
6464 auto IsSimpleTy = [](Type *Ty) {
6465 return !Ty->isVectorTy() && !Ty->isAggregateType();
6466 };
6467 if (all_of(Types, IsSimpleTy))
6468 continue;
6469
6470 if (Function *NestedCallee = CB->getCalledFunction()) {
6471 // Assume that intrinsics are always ABI compatible.
6472 if (NestedCallee->isIntrinsic())
6473 continue;
6474
6475 // Do a precise compatibility check.
6476 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6477 return false;
6478 } else {
6479 // We don't know the target features of the callee,
6480 // assume it is incompatible.
6481 return false;
6482 }
6483 }
6484 }
6485 return true;
6486}
6487
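// Illustrative sketch, not part of the LLVM source (feature sets are
// hypothetical): the subset test above in isolation. The caller's features
// must be a superset of the callee's.
namespace {
constexpr bool sketchIsFeatureSubset(unsigned CallerBits, unsigned CalleeBits) {
  // Same idiom as "(RealCallerBits & RealCalleeBits) == RealCalleeBits".
  return (CallerBits & CalleeBits) == CalleeBits;
}
static_assert(sketchIsFeatureSubset(0b111, 0b001), "caller superset: OK");
static_assert(!sketchIsFeatureSubset(0b011, 0b100), "callee needs more: reject");
} // namespace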
6488bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6489 const Function *Callee,
6490 const ArrayRef<Type *> &Types) const {
6491 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6492 return false;
6493
6494 // If we get here, we know the target features match. If one function
6495 // considers 512-bit vectors legal and the other does not, consider them
6496 // incompatible.
6497 const TargetMachine &TM = getTLI()->getTargetMachine();
6498
6499 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6500 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6501 return true;
6502
6503 // Consider the arguments compatible if they aren't vectors or aggregates.
6504 // FIXME: Look at the size of vectors.
6505 // FIXME: Look at the element types of aggregates to see if there are vectors.
6506 return llvm::none_of(Types,
6507 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6508}
6509
6510TTI::MemCmpExpansionOptions
6511X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6512 TTI::MemCmpExpansionOptions Options;
6513 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6514 Options.NumLoadsPerBlock = 2;
6515 // All GPR and vector loads can be unaligned.
6516 Options.AllowOverlappingLoads = true;
6517 if (IsZeroCmp) {
6518 // Only enable vector loads for equality comparison. Right now the vector
6519 // version is not as fast for three way compare (see #33329).
6520 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6521 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6522 Options.LoadSizes.push_back(64);
6523 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6524 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6525 }
6526 if (ST->is64Bit()) {
6527 Options.LoadSizes.push_back(8);
6528 }
6529 Options.LoadSizes.push_back(4);
6530 Options.LoadSizes.push_back(2);
6531 Options.LoadSizes.push_back(1);
6532 return Options;
6533}
6534
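// Illustrative expansion, not part of the LLVM source (the call is
// hypothetical): on a 64-bit AVX2 target the equality-compare load sizes
// above are {32, 16, 8, 4, 2, 1} and overlapping loads are allowed, so
// "memcmp(a, b, 31) == 0" can be expanded to two overlapping 16-byte vector
// loads (at offsets 0 and 15) whose comparisons are combined, instead of a
// libcall.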
6535bool X86TTIImpl::prefersVectorizedAddressing() const {
6536 return supportsGather();
6537}
6538
6539bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6540 return false;
6541}
6542
6543bool X86TTIImpl::enableInterleavedAccessVectorization() {
6544 // TODO: We expect this to be beneficial regardless of arch,
6545 // but there are currently some unexplained performance artifacts on Atom.
6546 // As a temporary solution, disable on Atom.
6547 return !(ST->isAtom());
6548}
6549
6550// Get an estimate for interleaved load/store operations and strided loads.
6551// \p Indices contains the indices for a strided load.
6552// \p Factor is the interleaving factor.
6553// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6554InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6555 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6556 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6557 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6558 // VecTy for interleave memop is <VF*Factor x Elt>.
6559 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6560 // VecTy = <12 x i32>.
6561
6562 // Calculate the number of memory operations (NumOfMemOps), required
6563 // for load/store the VecTy.
6564 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6565 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6566 unsigned LegalVTSize = LegalVT.getStoreSize();
6567 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6568
6569 // Get the cost of one memory operation.
6570 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6571 LegalVT.getVectorNumElements());
6572 InstructionCost MemOpCost;
6573 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6574 if (UseMaskedMemOp)
6575 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6576 AddressSpace, CostKind);
6577 else
6578 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6579 AddressSpace, CostKind);
6580
6581 unsigned VF = VecTy->getNumElements() / Factor;
6582 MVT VT =
6583 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6584
6585 InstructionCost MaskCost;
6586 if (UseMaskedMemOp) {
6587 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6588 for (unsigned Index : Indices) {
6589 assert(Index < Factor && "Invalid index for interleaved memory op");
6590 for (unsigned Elm = 0; Elm < VF; Elm++)
6591 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6592 }
6593
6594 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6595
6596 MaskCost = getReplicationShuffleCost(
6597 I1Type, Factor, VF,
6598 UseMaskForGaps ? DemandedLoadStoreElts
6600 CostKind);
6601
6602 // The Gaps mask is invariant and created outside the loop, therefore the
6603 // cost of creating it is not accounted for here. However if we have both
6604 // a MaskForGaps and some other mask that guards the execution of the
6605 // memory access, we need to account for the cost of And-ing the two masks
6606 // inside the loop.
6607 if (UseMaskForGaps) {
6608 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6609 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6610 }
6611 }
6612
6613 if (Opcode == Instruction::Load) {
6614 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6615 // contain the cost of the optimized shuffle sequence that the
6616 // X86InterleavedAccess pass will generate.
6617 // The cost of loads and stores are computed separately from the table.
6618
6619 // X86InterleavedAccess support only the following interleaved-access group.
6620 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6621 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6622 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6623 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6624 };
6625
6626 if (const auto *Entry =
6627 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6628 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6629 // If an entry does not exist, fall back to the default implementation.
6630
6631 // The kind of shuffle depends on the number of loaded values.
6632 // If we load the entire data in one register, we can use a 1-src shuffle.
6633 // Otherwise, we'll merge 2 sources in each operation.
6634 TTI::ShuffleKind ShuffleKind =
6635 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6636
6637 InstructionCost ShuffleCost =
6638 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6639
6640 unsigned NumOfLoadsInInterleaveGrp =
6641 Indices.size() ? Indices.size() : Factor;
6642 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6643 VecTy->getNumElements() / Factor);
6644 InstructionCost NumOfResults =
6645 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6646
6647 // About half of the loads may be folded in shuffles when we have only
6648 // one result. If we have more than one result, or the loads are masked,
6649 // we do not fold loads at all.
6650 unsigned NumOfUnfoldedLoads =
6651 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6652
6653 // Get a number of shuffle operations per result.
6654 unsigned NumOfShufflesPerResult =
6655 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6656
6657 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6658 // When we have more than one destination, we need additional instructions
6659 // to keep sources.
6660 InstructionCost NumOfMoves = 0;
6661 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6662 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6663
6664 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6665 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6666 NumOfMoves;
6667
6668 return Cost;
6669 }
6670
6671 // Store.
6672 assert(Opcode == Instruction::Store &&
6673 "Expected Store Instruction at this point");
6674 // X86InterleavedAccess support only the following interleaved-access group.
6675 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6676 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6677 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6678 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6679
6680 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6681 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6682 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6683 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6684 };
6685
6686 if (const auto *Entry =
6687 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6688 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6689 // If an entry does not exist, fall back to the default implementation.
6690
6691 // There are no strided stores for now, and a store can't be folded into a
6692 // shuffle.
6693 unsigned NumOfSources = Factor; // The number of values to be merged.
6694 InstructionCost ShuffleCost = getShuffleCost(
6695 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6696 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6697
6698 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6699 // We need additional instructions to keep sources.
6700 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6701 InstructionCost Cost =
6702 MaskCost +
6703 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6704 NumOfMoves;
6705 return Cost;
6706}
6707
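// Illustrative sketch, not part of the LLVM source (the group shape is
// hypothetical): the NumOfMemOps computation above for VF = 16, Factor = 3,
// i32 elements. VecTy = <48 x i32> stores 192 bytes and the legal type
// v16i32 stores 64, so ceil(192 / 64) = 3 memory operations are costed.
namespace {
constexpr unsigned sketchNumMemOps(unsigned VecBytes, unsigned LegalBytes) {
  return (VecBytes + LegalBytes - 1) / LegalBytes; // same as divideCeil
}
static_assert(sketchNumMemOps(192, 64) == 3, "three v16i32 memory ops");
} // namespace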
6708InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6709 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6710 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6711 bool UseMaskForCond, bool UseMaskForGaps) {
6712 auto *VecTy = cast<FixedVectorType>(BaseTy);
6713
6714 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6715 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6716 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6717 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6718 return true;
6719 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6720 return ST->hasBWI();
6721 if (EltTy->isBFloatTy())
6722 return ST->hasBF16();
6723 return false;
6724 };
6725 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6726 return getInterleavedMemoryOpCostAVX512(
6727 Opcode, VecTy, Factor, Indices, Alignment,
6728 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6729
6730 if (UseMaskForCond || UseMaskForGaps)
6731 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6732 Alignment, AddressSpace, CostKind,
6733 UseMaskForCond, UseMaskForGaps);
6734
6735 // Get estimation for interleaved load/store operations for SSE-AVX2.
6736 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6737 // computing the cost using a generic formula as a function of generic
6738 // shuffles. We therefore use a lookup table instead, filled according to
6739 // the instruction sequences that codegen currently generates.
6740
6741 // VecTy for interleave memop is <VF*Factor x Elt>.
6742 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6743 // VecTy = <12 x i32>.
6744 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6745
6746 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6747 // the VF=2, while v2i128 is an unsupported MVT vector type
6748 // (see MachineValueType.h::getVectorVT()).
6749 if (!LegalVT.isVector())
6750 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6751 Alignment, AddressSpace, CostKind);
6752
6753 unsigned VF = VecTy->getNumElements() / Factor;
6754 Type *ScalarTy = VecTy->getElementType();
6755 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6756 if (!ScalarTy->isIntegerTy())
6757 ScalarTy =
6758 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6759
6760 // Get the cost of all the memory operations.
6761 // FIXME: discount dead loads.
6762 InstructionCost MemOpCosts = getMemoryOpCost(
6763 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6764
6765 auto *VT = FixedVectorType::get(ScalarTy, VF);
6766 EVT ETy = TLI->getValueType(DL, VT);
6767 if (!ETy.isSimple())
6768 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6769 Alignment, AddressSpace, CostKind);
6770
6771 // TODO: Complete for other data-types and strides.
6772 // Each combination of Stride, element bit width and VF results in a different
6773 // sequence; The cost tables are therefore accessed with:
6774 // Factor (stride) and VectorType=VFxiN.
6775 // The Cost accounts only for the shuffle sequence;
6776 // The cost of the loads/stores is accounted for separately.
6777 //
6778 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6779 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6780 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6781 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6782 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6783 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6784
6785 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6786 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6787 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6788
6789 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6790 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6791 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6792
6793 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6794 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6795 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6796 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6797
6798 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6799 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6800 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6801 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6802 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6803
6804 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6805 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6806 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6807 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6808 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6809
6810 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6811 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6812 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6813 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6814 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6815
6816 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6817 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6818 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6819 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6820
6821 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6822 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6823 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6824 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6825 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6826
6827 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6828 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6829 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6830 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6831 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6832
6833 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6834 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6835 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6836 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6837 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6838
6839 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6840 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6841 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6842 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6843
6844 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6845 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6846 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6847 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6848 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6849
6850 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6851 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6852 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6853 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6854 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6855
6856 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6857 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6858 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6859 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6860
6861 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6862 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6863 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6864
6865 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6866 };
6867
6868 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6869 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6870 };
6871
6872 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6873 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6874 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6875
6876 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6877 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6878
6879 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6880 };
6881
6882 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6883 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6884 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6885
6886 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6887 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6888 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6889
6890 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6891 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6892 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6893 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6894
6895 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6896 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6897 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6898 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6899 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6900
6901 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6902 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6903 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6904 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6905 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6906
6907 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6908 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6909 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6910 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6911 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6912
6913 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6914 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6915 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6916 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6917 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6918
6919 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6920 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6921 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6922 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6923
6924 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6925 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6926 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6927 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6928 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6929
6930 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6931 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6932 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6933 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6934 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6935
6936 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6937 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6938 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6939 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6940 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6941
6942 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6943 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6944 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6945 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6946
6947 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6948 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6949 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6950 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6951 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6952
6953 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6954 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6955 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6956 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6957 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6958
6959 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6960 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6961 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6962 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6963
6964 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6965 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6966 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6967 };
6968
6969 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6970 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6971 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6972 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6973
6974 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6975 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6976
6977 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6978 };
6979
6980 if (Opcode == Instruction::Load) {
6981 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6982 MemOpCosts](const CostTblEntry *Entry) {
6983 // NOTE: this is just an approximation!
6984 // It can over- or under-estimate the cost!
6985 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6986 };
6987
6988 if (ST->hasAVX2())
6989 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6990 ETy.getSimpleVT()))
6991 return GetDiscountedCost(Entry);
6992
6993 if (ST->hasSSSE3())
6994 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6995 ETy.getSimpleVT()))
6996 return GetDiscountedCost(Entry);
6997
6998 if (ST->hasSSE2())
6999 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7000 ETy.getSimpleVT()))
7001 return GetDiscountedCost(Entry);
7002 } else {
7003 assert(Opcode == Instruction::Store &&
7004 "Expected Store Instruction at this point");
7005 assert((!Indices.size() || Indices.size() == Factor) &&
7006 "Interleaved store only supports fully-interleaved groups.");
7007 if (ST->hasAVX2())
7008 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7009 ETy.getSimpleVT()))
7010 return MemOpCosts + Entry->Cost;
7011
7012 if (ST->hasSSE2())
7013 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7014 ETy.getSimpleVT()))
7015 return MemOpCosts + Entry->Cost;
7016 }
7017
7018 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7019 Alignment, AddressSpace, CostKind,
7020 UseMaskForCond, UseMaskForGaps);
7021}
7022
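// Illustrative use of the load tables above, not part of the LLVM source
// (the group is hypothetical): an AVX2 load group with Factor = 3, VF = 8,
// i32 elements hits the {3, MVT::v8i32, 7} entry; if only 2 of the 3 members
// are used, GetDiscountedCost yields
//   MemOpCosts + ceil(2 * 7 / 3) = MemOpCosts + 5.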
7023InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7024 StackOffset BaseOffset,
7025 bool HasBaseReg, int64_t Scale,
7026 unsigned AddrSpace) const {
7027 // Scaling factors are not free at all.
7028 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7029 // will take 2 allocations in the out of order engine instead of 1
7030 // for plain addressing mode, i.e. inst (reg1).
7031 // E.g.,
7032 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7033 // Requires two allocations (one for the load, one for the computation)
7034 // whereas:
7035 // vaddps (%rsi), %ymm0, %ymm1
7036 // Requires just 1 allocation, i.e., freeing allocations for other operations
7037 // and having fewer micro-operations to execute.
7038 //
7039 // For some X86 architectures, this is even worse because for instance for
7040 // stores, the complex addressing mode forces the instruction to use the
7041 // "load" ports instead of the dedicated "store" port.
7042 // E.g., on Haswell:
7043 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7044 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7045 TargetLoweringBase::AddrMode AM;
7046 AM.BaseGV = BaseGV;
7047 AM.BaseOffs = BaseOffset.getFixed();
7048 AM.HasBaseReg = HasBaseReg;
7049 AM.Scale = Scale;
7050 AM.ScalableOffset = BaseOffset.getScalable();
7051 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7052 // Scale represents reg2 * scale, thus account for 1
7053 // as soon as we use a second register.
7054 return AM.Scale != 0;
7055 return -1;
7056}
7057
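// Illustrative outcomes of the check above, not part of the LLVM source
// (addressing modes are hypothetical):
//   vaddps (%rsi), %ymm0, %ymm1       -> Scale == 0, cost 0
//   vaddps (%rsi,%rdx), %ymm0, %ymm1  -> Scale == 1, cost 1 (second alloc)
//   Scale == 3                        -> not encodable on x86, returns -1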
7058unsigned X86TTIImpl::getBranchMispredictPenalty() const {
7059 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7060 return 14;
7061}
7062
7063bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7064 unsigned Bits = Ty->getScalarSizeInBits();
7065
7066 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7067 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7068 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7069 return false;
7070
7071 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7072 // shifts just as cheap as scalar ones.
7073 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7074 return false;
7075
7076 // AVX512BW has shifts such as vpsllvw.
7077 if (ST->hasBWI() && Bits == 16)
7078 return false;
7079
7080 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7081 // fully general vector.
7082 return true;
7083}
7084
7085unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7086 Type *ScalarValTy) const {
7087 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7088 return 4;
7089 }
7090 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7091}
7092
7093bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7094 SmallVectorImpl<Use *> &Ops) const {
7095 using namespace llvm::PatternMatch;
7096
7097 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7098 if (!VTy)
7099 return false;
7100
7101 if (I->getOpcode() == Instruction::Mul &&
7102 VTy->getElementType()->isIntegerTy(64)) {
7103 for (auto &Op : I->operands()) {
7104 // Make sure we are not already sinking this operand
7105 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7106 continue;
7107
7108 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7109 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7110 if (ST->hasSSE41() &&
7111 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7112 m_SpecificInt(32)))) {
7113 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7114 Ops.push_back(&Op);
7115 } else if (ST->hasSSE2() &&
7116 match(Op.get(),
7117 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7118 Ops.push_back(&Op);
7119 }
7120 }
7121
7122 return !Ops.empty();
7123 }
7124
7125 // A uniform shift amount in a vector shift or funnel shift may be much
7126 // cheaper than a generic variable vector shift, so make that pattern visible
7127 // to SDAG by sinking the shuffle instruction next to the shift.
7128 int ShiftAmountOpNum = -1;
7129 if (I->isShift())
7130 ShiftAmountOpNum = 1;
7131 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7132 if (II->getIntrinsicID() == Intrinsic::fshl ||
7133 II->getIntrinsicID() == Intrinsic::fshr)
7134 ShiftAmountOpNum = 2;
7135 }
7136
7137 if (ShiftAmountOpNum == -1)
7138 return false;
7139
7140 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7141 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7142 isVectorShiftByScalarCheap(I->getType())) {
7143 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7144 return true;
7145 }
7146
7147 return false;
7148}
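// Illustrative IR, not part of the LLVM source, for the splat-shift case
// above on a subtarget where isVectorShiftByScalarCheap() is true (e.g.
// plain SSE2 with i32 elements); names are hypothetical:
//   %amt.ins = insertelement <4 x i32> poison, i32 %a, i64 0
//   %amt     = shufflevector <4 x i32> %amt.ins, <4 x i32> poison,
//                            <4 x i32> zeroinitializer
//   %r       = shl <4 x i32> %x, %amt
// Sinking the shufflevector next to the shl lets SDAG select a single
// shift-by-scalar (PSLLD) instead of a general variable vector shift.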
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:478
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:396
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:670
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:703
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
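A small sketch exercising the Type factory and query methods above; Ctx is an assumed LLVMContext:

  Type *F32 = Type::getFloatTy(Ctx);
  Type *I64 = Type::getInt64Ty(Ctx);
  assert(F32->isFloatTy() && F32->isFloatingPointTy());
  assert(I64->isIntegerTy() && I64->getScalarSizeInBits() == 64);
  assert(I64->getScalarType() == I64); // non-vector types return themselves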
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
Definition: DerivedTypes.h:487
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:665
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same element type.
Definition: DerivedTypes.h:541
Type * getElementType() const
Definition: DerivedTypes.h:460
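Sketch of the two widening helpers; Ctx is an assumed LLVMContext:

  auto *V4I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 4); // <4 x i16>
  // Same element count, integer elements twice as wide: <4 x i32>.
  VectorType *Wide = VectorType::getExtendedElementVectorType(V4I16);
  // Twice the element count, same element type: <8 x i16>.
  VectorType *Long = VectorType::getDoubleElementsVectorType(V4I16);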
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
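These predicates gate which cost table is consulted. A sketch of the idiom used throughout this file, checking the most specific feature level first; ST, AVX2Tbl, SSE42Tbl, ISD, and LT are illustrative names:

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2Tbl, ISD, LT.second))
      return LT.first * Entry->Cost;
  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42Tbl, ISD, LT.second))
      return LT.first * Entry->Cost;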
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of a Gather / Scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isVectorShiftByScalarCheap(Type *Ty) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
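Sketch of the query side of TypeSize:

  TypeSize TS = TypeSize::getScalable(256);
  uint64_t Min = TS.getKnownMinValue(); // 256, valid even when scalable
  if (!TS.isScalable()) {
    uint64_t Exact = TS.getFixedValue(); // asserts on scalable sizes
    (void)Exact;
  }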
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware nodes for multiplication, analogous to [SU]ADDO above.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination type.
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:347
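These ISD opcodes are the keys into the cost tables. A sketch of the standard bridge from an IR opcode; TLI (a TargetLoweringBase pointer) and Opcode are assumed in-scope names:

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  // ISD now indexes table entries such as { ISD::ADD, MVT::v8i32, Cost }.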
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
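Sketch of the matchers above in use; V is an assumed Value pointer:

  using namespace llvm::PatternMatch;
  Value *X;
  const APInt *ShAmt;
  // Recognize a single-use shift-left by a (possibly splatted) constant.
  if (match(V, m_OneUse(m_Shl(m_Value(X), m_APIntAllowPoison(ShAmt))))) {
    // X is the shifted operand; ShAmt the matched shift amount.
  }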
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
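An illustrative lookup in this file's style; the table entries are invented for the sketch, not real measured costs:

  static const CostTblEntry ExampleTbl[] = {
    { ISD::ADD, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v8i32, 2 },
  };
  unsigned Cost = 0;
  if (const auto *Entry = CostTableLookup(ExampleTbl, ISD::SHL, MVT::v8i32))
    Cost = Entry->Cost; // 2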
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.
Definition: STLExtras.h:2448
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:556
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
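The integer helpers above at work (all values chosen arbitrarily):

  unsigned Down = alignDown(37u, 8u);    // 32
  uint64_t Up   = alignTo(37, Align(8)); // 40
  uint64_t Pow  = PowerOf2Ceil(37);      // 64
  bool IsPow    = isPowerOf2_32(64);     // true
  unsigned Ceil = divideCeil(37u, 8u);   // 5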
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
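The conversion-table variant keys on both destination and source types; an illustrative lookup with an invented entry:

  static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    // { ISD, Dst, Src, Cost }
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  };
  unsigned ConvCost = 0;
  if (const auto *Entry = ConvertCostTableLookup(
          ExampleConvTbl, ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16))
    ConvCost = Entry->Cost; // 1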
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
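Sketch of the EVT queries; TLI, DL, and Ty are assumed in-scope names:

  EVT VT = TLI->getValueType(DL, Ty);
  if (VT.isSimple() && VT.isVector()) {
    MVT M = VT.getSimpleVT();     // only valid after isSimple() succeeds
    EVT Elt = VT.getScalarType(); // element type for vectors
    (void)M; (void)Elt;
  }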
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55