//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// do Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem,
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst case cost, although we may discard an outlying worst
/// cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                    divss    sqrtss   rsqrtss
///   AMD K7           11-16    19       3
///   Piledriver       9-24     13-15    5
///   Jaguar           14       16       2
///   Pentium II,III   18       30       2
///   Nehalem          7-14     7-18     3
///   Haswell          10-13    11       5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
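//
// For example (illustrative, using the AVX512 cost table below): the FDIV f32
// entry { 3, 11, 1, 1 } reads as reciprocal throughput 3, latency 11, a code
// size of 1 instruction and 1 micro-op (Skylake numbers).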

#include "X86TargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
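// Example (illustrative): a table entry { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }
// encodes reciprocal throughput 1, latency 10, code size 1 and size+latency 2;
// Entry->Cost[CostKind] returns std::nullopt for any kind left at ~0U.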

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
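  // For example, a conditional i32 load that would otherwise need a branch
  // can be emitted as a single 32-bit CFCMOV that suppresses the memory fault
  // when the predicate is false (illustrative; the exact lowering is chosen
  // later during instruction selection).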
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
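    // For instance (illustrative): a v4i32 multiply whose operands both fit
    // in i15 is re-costed here as a v8i16 multiply, matching the PMADDWD
    // lowering.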

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
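  // e.g. (illustrative) mul %x, 8 becomes shl %x, 3, and mul %x, -8 becomes
  // sub 0, (shl %x, 3).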
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

  // On X86, vector signed division by a power-of-two constant is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
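  // Worked example (illustrative) for v4i32 X sdiv 8:
  //   %sgn  = ashr %X, 31    ; splat the sign bit
  //   %bias = lshr %sgn, 29  ; 7 for negative lanes, 0 otherwise
  //   %tmp  = add  %X, %bias
  //   %res  = ashr %tmp, 3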
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
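  // e.g. (illustrative) udiv %x, 16 becomes lshr %x, 4, and urem %x, 16
  // becomes and %x, 15.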
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
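  // gf2p8affineqb multiplies each byte by a constant 8x8 bit-matrix over
  // GF(2), so a per-byte shift by a uniform constant amount can be encoded
  // as a single instruction with a suitable matrix operand.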

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
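    // e.g. (illustrative) a variable lshr lowers to vpshlq %x, (sub 0, %y),
    // since XOP shifts treat negative per-element amounts as right shifts.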
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
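    // e.g. (illustrative) shl <4 x i32> %x, <i32 2, i32 3, i32 0, i32 1>
    // becomes mul %x, <i32 4, i32 8, i32 1, i32 2>.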
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies(3),
    // shifts(3) and adds(2).
    // SLM muldq throughput is 2 and addq throughput is 4, thus:
    // 3*2 (muldq throughput) + 3*1 (shift throughput) +
    // 3*4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // SLM addq/subq throughput is 4.
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1191 static const CostKindTblEntry AVX1CostTable[] = {
1192 // We don't have to scalarize unsupported ops. We can issue two half-sized
1193 // operations and we only need to extract the upper YMM half.
1194 // Two ops + 1 extract + 1 insert = 4.
1195 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1196 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1197 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1198 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1199 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1200 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1201
1202 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1203 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1204 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1205 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1206
1207 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1208 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1209 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1210 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1211
1212 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1213 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1214 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1215 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1216
1217 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1218 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1219 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1220 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1221 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1222 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1223 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1224 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1225 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1226 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1227
1228 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1229 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1230 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1231 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1232 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1233 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1234 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1235 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1236
1237 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1238 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1239 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1240 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1241 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1242 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1243 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1244 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1245
1246 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1247 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1248 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1249 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1250 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1251 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1253 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1254
1255 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1256 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1257
1258 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1259 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1260 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1261 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1262 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1263 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1264
1265 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271
1272 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1273 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1274 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1275 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1276 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1277 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1278
1279 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1280 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1281 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1282 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1283 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1284 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1285 };
1286
1287 if (ST->hasAVX())
1288 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1289 if (auto KindCost = Entry->Cost[CostKind])
1290 return LT.first * *KindCost;
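// Example: lookups against these tables resolve per cost kind. On an
// AVX1-only subtarget, an ISD::SHL on MVT::v8i32 with CostKind == TCK_CodeSize
// hits the { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } } entry above and
// Cost[CostKind] yields 12; with v8i32 legal (LT.first == 1) the returned
// cost is 1 * 12. A minimal sketch of the equivalent external query, assuming
// a TargetTransformInfo instance TTI and an LLVMContext Ctx are in scope:
//
//   auto *VTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8);
//   InstructionCost Cost = TTI.getArithmeticInstrCost(
//       Instruction::Shl, VTy, TargetTransformInfo::TCK_CodeSize);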
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480 // It is not a good idea to vectorize division. We have to scalarize it and
1481 // in the process we will often end up having to spill regular
1482 // registers. The overhead of division is going to dominate most kernels
1483 // anyway, so try hard to prevent vectorization of division - it is
1484 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485 // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489 InstructionCost ScalarCost =
1490 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491 Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
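// E.g. for a <4 x i32> sdiv whose scalar sdiv throughput cost is, say, 15,
// the estimate above becomes 20 * 1 (LT.first) * 4 (lanes) * 15 = 1200,
// large enough that the vectorizers will essentially never pick it.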
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1508 }
1509 }
1510
1511 // Fall back to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
1515
1516InstructionCost
1517X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519 TTI::TargetCostKind CostKind) const {
1520 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521 return TTI::TCC_Basic;
1522 return InstructionCost::getInvalid();
1523}
1524
1525InstructionCost X86TTIImpl::getShuffleCost(
1526 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535 // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
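// E.g. a shufflevector whose source operands are both constants folds away
// entirely, so any mask over two constant <4 x i32> vectors is reported as
// TCC_Free here.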
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544 return getShuffleCost(TTI::SK_InsertSubvector,
1545 VectorType::getDoubleElementsVectorType(BaseTp), {},
1546 CostKind, Mask.size() / 2, BaseTp);
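// E.g. shuffling two <4 x i32> sources with the identity mask
// <0,1,2,3,4,5,6,7> simply concatenates them into <8 x i32>, so it is costed
// as inserting the second source into the upper half of the double-width
// vector.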
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553 // For Broadcasts we are splatting the first element from the first input
1554 // register, so we only need to reference that input; all the output
1555 // registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
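// E.g. on AVX2 a splat of a single-use scalar load lowers to a single
// vpbroadcastb/w/d/q or vbroadcastss/sd directly from memory, so it is
// costed as free; plain AVX only gets this for 32-bit or wider elements.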
1565
1566 // Attempt to detect a cheaper in-lane shuffle, avoiding a 128-bit subvector
1567 // permutation.
1568 bool IsInLaneShuffle = false;
1569 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1570 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1571 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1572 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1573 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1574 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1575 if ((Mask.size() % NumLanes) == 0)
1576 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1577 return P.value() == PoisonMaskElem ||
1578 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1579 (P.index() / NumEltsPerLane);
1580 });
1581 }
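// Worked example: for v8i32 (two 128-bit lanes, four elements per lane) the
// two-source mask <0,2,1,3,12,14,13,15> keeps every element inside its own
// lane ((Value % 8) / 4 equals (Index / 4) for each entry), so
// IsInLaneShuffle is set; a lane-crossing mask such as <0,4,8,12,...> is not.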
1582
1583 // Treat <X x bfloat> shuffles as <X x half>.
1584 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1585 LT.second = LT.second.changeVectorElementType(MVT::f16);
1586
1587 // Subvector extractions are free if they start at the beginning of a
1588 // vector and cheap if the subvectors are aligned.
1589 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1590 int NumElts = LT.second.getVectorNumElements();
1591 if ((Index % NumElts) == 0)
1592 return TTI::TCC_Free;
1593 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1594 if (SubLT.second.isVector()) {
1595 int NumSubElts = SubLT.second.getVectorNumElements();
1596 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1597 return SubLT.first;
1598 // Handle some cases for widening legalization. For now we only handle
1599 // cases where the original subvector was naturally aligned and evenly
1600 // fit in its legalized subvector type.
1601 // FIXME: Remove some of the alignment restrictions.
1602 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1603 // vectors.
1604 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1605 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1606 (NumSubElts % OrigSubElts) == 0 &&
1607 LT.second.getVectorElementType() ==
1608 SubLT.second.getVectorElementType() &&
1609 LT.second.getVectorElementType().getSizeInBits() ==
1610 SubLT.second.getVectorElementType().getSizeInBits()) {
1611 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1612 "Unexpected number of elements!");
1613 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1614 LT.second.getVectorNumElements());
1615 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1616 SubLT.second.getVectorNumElements());
1617 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1618 InstructionCost ExtractCost = getShuffleCost(
1619 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1620
1621 // If the original size is 32 bits or more, we can use pshufd. Otherwise,
1622 // if we have SSSE3, we can use pshufb.
1623 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1624 return ExtractCost + 1; // pshufd or pshufb
1625
1626 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1627 "Unexpected vector size");
1628
1629 return ExtractCost + 2; // worst case pshufhw + pshufd
1630 }
1631 }
1632 // If the extract subvector is not optimal, treat it as a single-op shuffle.
1633 Kind = TTI::SK_PermuteSingleSrc;
1634 }
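// E.g. with AVX, extracting <4 x i32> from <8 x i32> at Index 0 is free (the
// low half is already in place after legalization), while Index 4 costs
// SubLT.first, i.e. a single vextractf128-style instruction.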
1635
1636 // Subvector insertions are cheap if the subvectors are aligned.
1637 // Note that in general, the insertion starting at the beginning of a vector
1638 // isn't free, because we need to preserve the rest of the wide vector,
1639 // but if the destination vector legalizes to the same width as the subvector
1640 // then the insertion will simplify to a (free) register copy.
1641 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1642 int NumElts = LT.second.getVectorNumElements();
1643 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1644 if (SubLT.second.isVector()) {
1645 int NumSubElts = SubLT.second.getVectorNumElements();
1646 bool MatchingTypes =
1647 NumElts == NumSubElts &&
1648 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1649 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1650 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1651 }
1652
1653 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1654 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1655 // v1f32 (legalized to f32) into a v4f32.
1656 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1657 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1658 return 1;
1659
1660 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1661 Kind = TTI::SK_PermuteTwoSrc;
1662 }
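// E.g. inserting a single f32 (a v1f32 subvector legalized to f32) into
// v4f32 costs 1: lane 0 maps to MOVSS, and with SSE4.1 any lane maps to
// INSERTPS.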
1663
1664 // Handle some common (illegal) sub-vector types as they are often very cheap
1665 // to shuffle even on targets without PSHUFB.
1666 EVT VT = TLI->getValueType(DL, BaseTp);
1667 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1668 !ST->hasSSSE3()) {
1669 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1670 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1671 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1672 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1673 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1674 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1675
1676 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1677 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1678 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1679 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1680
1681 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1682 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1683 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1684 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1685
1686 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1687 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1688 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1689 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1690 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1691
1692 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1693 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1694 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1695 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1696 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1697 };
1698
1699 if (ST->hasSSE2())
1700 if (const auto *Entry =
1701 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1702 return Entry->Cost;
1703 }
1704
1705 // We are going to permute multiple sources and the result will be in multiple
1706 // destinations. We provide an accurate cost only for splits where the element
1707 // type remains the same.
1708 if (LT.first != 1) {
1709 MVT LegalVT = LT.second;
1710 if (LegalVT.isVector() &&
1711 LegalVT.getVectorElementType().getSizeInBits() ==
1712 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1713 LegalVT.getVectorNumElements() <
1714 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1715 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1716 unsigned LegalVTSize = LegalVT.getStoreSize();
1717 // Number of source vectors after legalization:
1718 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1719 // Number of destination vectors after legalization:
1720 InstructionCost NumOfDests = LT.first;
1721
1722 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1723 LegalVT.getVectorNumElements());
1724
1725 if (!Mask.empty() && NumOfDests.isValid()) {
1726 // Try to perform better estimation of the permutation.
1727 // 1. Split the source/destination vectors into real registers.
1728 // 2. Do the mask analysis to identify which real registers are
1729 // permuted. If more than one source register is used to build a
1730 // destination register, the cost for this destination register is
1731 // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1732 // source register is used, build the mask and calculate the cost as
1733 // the cost of PermuteSingleSrc.
1734 // Also, for the single register permute we try to identify if the
1735 // destination register is just a copy of the source register or the
1736 // copy of the previous destination register (the cost is
1737 // TTI::TCC_Basic). If the source register is just reused, the cost for
1738 // this operation is TTI::TCC_Free.
1739 NumOfDests =
1740 getTypeLegalizationCost(
1741 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1742 .first;
1743 unsigned E = *NumOfDests.getValue();
1744 unsigned NormalizedVF =
1745 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1746 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1747 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1748 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1749 copy(Mask, NormalizedMask.begin());
1750 unsigned PrevSrcReg = 0;
1751 ArrayRef<int> PrevRegMask;
1752 InstructionCost Cost = 0;
1753 processShuffleMasks(
1754 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1755 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1756 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1757 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1758 // Check if the previous register can be just copied to the next
1759 // one.
1760 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1761 PrevRegMask != RegMask)
1762 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1763 RegMask, CostKind, 0, nullptr);
1764 else
1765 // Just a copy of previous destination register.
1766 Cost += TTI::TCC_Basic;
1767 return;
1768 }
1769 if (SrcReg != DestReg &&
1770 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1771 // Just a copy of the source register.
1772 Cost += TTI::TCC_Basic;
1773 }
1774 PrevSrcReg = SrcReg;
1775 PrevRegMask = RegMask;
1776 },
1777 [this, SingleOpTy, CostKind,
1778 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1779 unsigned /*Unused*/, bool /*Unused*/) {
1780 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1781 CostKind, 0, nullptr);
1782 });
1783 return Cost;
1784 }
1785
1786 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1787 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1788 {}, CostKind, 0, nullptr);
1789 }
1790
1791 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1792 }
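// Worked example: a <16 x i32> shuffle on an AVX target splits into two
// v8i32 registers (LT.first == 2). With a mask available, the analysis above
// charges nothing for identity sub-masks, one SK_PermuteSingleSrc per
// sub-mask reading a single source register, and one SK_PermuteTwoSrc per
// sub-mask mixing two; without a mask it falls back to the
// (NumOfSrcs - 1) * NumOfDests estimate.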
1793
1794 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1795 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1796 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1797
1798 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1799 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1800
1801 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1802 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1803 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1804 };
1805
1806 if (ST->hasVBMI())
1807 if (const auto *Entry =
1808 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1809 return LT.first * Entry->Cost;
1810
1811 static const CostTblEntry AVX512BWShuffleTbl[] = {
1812 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1813 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1814 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1815
1816 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1817 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1818 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1819 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1820
1821 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1822 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1823 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1824 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1825 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1826
1827 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1828 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1829 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1830 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1831 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1832
1833 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1834 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1835
1836 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1837 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1838 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1839 };
1840
1841 if (ST->hasBWI())
1842 if (const auto *Entry =
1843 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1844 return LT.first * Entry->Cost;
1845
1846 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1847 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1848 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1849 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1850 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1851 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1852 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1853 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1854
1855 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1856 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1857 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1858 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1859 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1860 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1861 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1862
1863 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1864 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1865 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1866 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1867 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1868 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1869 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1870 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1871 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1872 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1873 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1874
1875 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1876 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1877 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1878 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1879 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1880 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1881 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1882 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1883 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1884 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1885 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1886 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1887 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1888
1889 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1890 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1891 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1892 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1893 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1894 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1895 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1896 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1897 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1898 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1899 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1900 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1901
1902 // FIXME: This just applies the type legalization cost rules above
1903 // assuming these completely split.
1904 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1905 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1906 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1907 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1908 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1909 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1910
1911 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1912 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1913 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1914 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1915 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1916 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1917 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1918 };
1919
1920 if (ST->hasAVX512())
1921 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1922 if (auto KindCost = Entry->Cost[CostKind])
1923 return LT.first * *KindCost;
1924
1925 static const CostTblEntry AVX2InLaneShuffleTbl[] = {
1926 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpshufb
1927 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 1}, // vpshufb
1928 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpshufb
1929
1930 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
1931 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
1932 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpshufd + vpblendd
1933 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpshufd + vpblendd
1934 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // 2*vpshufb + vpor
1935 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // 2*vpshufb + vpor
1936 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // 2*vpshufb + vpor
1937 };
1938
1939 if (IsInLaneShuffle && ST->hasAVX2())
1940 if (const auto *Entry =
1941 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1942 return LT.first * Entry->Cost;
1943
1944 static const CostTblEntry AVX2ShuffleTbl[] = {
1945 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1946 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1947 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1948 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1949 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1950 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1951 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1952
1953 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1954 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1955 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1956 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1957 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1958 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1959 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1960
1961 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1962 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1963 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1964
1965 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1966 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1967 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1968 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1969 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1970
1971 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1972 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1973 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1974 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1975 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1976 // + vpblendvb
1977 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1978 // + vpblendvb
1979 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1980 // + vpblendvb
1981
1982 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1983 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1984 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1985 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1986 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1987 // + vpblendvb
1988 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1989 // + vpblendvb
1990 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1991 // + vpblendvb
1992 };
1993
1994 if (ST->hasAVX2())
1995 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1996 return LT.first * Entry->Cost;
1997
1998 static const CostTblEntry XOPShuffleTbl[] = {
1999 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
2000 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
2001 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
2002 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
2003 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
2004 // + vinsertf128
2005 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
2006 // + vinsertf128
2007
2008 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
2009 // + vinsertf128
2010 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
2011 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
2012 // + vinsertf128
2013 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
2014 };
2015
2016 if (ST->hasXOP())
2017 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2018 return LT.first * Entry->Cost;
2019
2020 static const CostTblEntry AVX1InLaneShuffleTbl[] = {
2021 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermilpd
2022 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermilpd
2023 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermilps
2024 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermilps
2025
2026 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2027 // + vpor + vinsertf128
2028 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2029 // + vpor + vinsertf128
2030 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2031 // + vpor + vinsertf128
2032
2033 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
2034 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
2035 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpermilpd + vblendpd
2036 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpermilps + vblendps
2037 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 4*pshufb
2038 // + 2*vpor + vinsertf128
2039 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 9}, // 2*vextractf128 + 4*pshufb
2040 // + 2*vpor + vinsertf128
2041 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 4*pshufb
2042 // + 2*vpor + vinsertf128
2043 };
2044
2045 if (IsInLaneShuffle && ST->hasAVX())
2046 if (const auto *Entry =
2047 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2048 return LT.first * Entry->Cost;
2049
2050 static const CostTblEntry AVX1ShuffleTbl[] = {
2051 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2052 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2053 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2054 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2055 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
2056 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
2057 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
2058
2059 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2060 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2061 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2062 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2063 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2064 // + vinsertf128
2065 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2066 // + vinsertf128
2067 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2068 // + vinsertf128
2069
2070 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
2071 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
2072 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2073 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2074 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2075 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2076 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2077
2078 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2079 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2080 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2081 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2082 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2083 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2084 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2085
2086 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2087 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2088 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2089 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2090 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2091 // + 2*por + vinsertf128
2092 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2093 // + 2*por + vinsertf128
2094 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2095 // + 2*por + vinsertf128
2096
2097 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2098 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2099 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2100 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2101 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2102 // + 4*por + vinsertf128
2103 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2104 // + 4*por + vinsertf128
2105 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2106 // + 4*por + vinsertf128
2107 };
2108
2109 if (ST->hasAVX())
2110 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2111 return LT.first * Entry->Cost;
2112
2113 static const CostTblEntry SSE41ShuffleTbl[] = {
2114 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2115 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2116 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2117 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2118 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2119 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2120 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2121 };
2122
2123 if (ST->hasSSE41())
2124 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2125 return LT.first * Entry->Cost;
2126
2127 static const CostTblEntry SSSE3ShuffleTbl[] = {
2128 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2129 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2130 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2131
2132 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2133 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2134 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2135
2136 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2137 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2138 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2139
2140 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2141 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2142 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2143 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2144 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2145
2146 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2147 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2148 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2149
2150 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2151 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2152 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2153 };
2154
2155 if (ST->hasSSSE3())
2156 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2157 return LT.first * Entry->Cost;
2158
2159 static const CostTblEntry SSE2ShuffleTbl[] = {
2160 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2161 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2162 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2163 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2164 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2165 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2166
2167 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2168 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2169 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2170 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2171 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2172 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2173 // + 2*pshufd + 2*unpck + packus
2174
2175 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2176 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2177 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2178 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2179 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2180 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2181
2182 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2183 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2184 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2185 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2186 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2187 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2188
2189 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2190 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2191 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2192 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2193 // + pshufd/unpck
2194 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2195 // + pshufd/unpck
2196 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2197 // + 2*pshufd + 2*unpck + 2*packus
2198
2199 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2200 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2201 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2202 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2203 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2204 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2205 };
2206
2207 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2208 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2209 };
2210
2211 if (ST->hasSSE2()) {
2212 bool IsLoad =
2213 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2214 if (ST->hasSSE3() && IsLoad)
2215 if (const auto *Entry =
2216 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2217 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2218 LT.second.getVectorElementCount()) &&
2219 "Table entry missing from isLegalBroadcastLoad()");
2220 return LT.first * Entry->Cost;
2221 }
2222
2223 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2224 return LT.first * Entry->Cost;
2225 }
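// E.g. a v2f64 splat whose operand is a load costs 0 here with SSE3: the
// broadcast folds into a single movddup straight from memory.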
2226
2227 static const CostTblEntry SSE1ShuffleTbl[] = {
2228 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2229 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2230 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2231 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2232 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2233 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2234 };
2235
2236 if (ST->hasSSE1()) {
2237 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2238 // SHUFPS: each pair of result elements must come from one source register.
2239 auto MatchSHUFPS = [](int X, int Y) {
2240 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2241 };
2242 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2243 return 1;
2244 }
2245 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2246 return LT.first * Entry->Cost;
2247 }
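// Worked example: mask <0,3,4,6> reads each result pair from one source
// (indices 0-3 select the LHS, 4-7 the RHS):
//   MatchSHUFPS(0, 3) && MatchSHUFPS(4, 6) // true -> single shufps
// whereas an unpcklps-style mask fails the check and uses the table:
//   MatchSHUFPS(0, 4)                      // false -> SSE1ShuffleTbl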
2248
2249 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2250}
2251
2252InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2253 Type *Src,
2254 TTI::CastContextHint CCH,
2255 TTI::TargetCostKind CostKind,
2256 const Instruction *I) {
2257 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2258 assert(ISD && "Invalid opcode");
2259
2260 // The cost tables include both specific, custom (non-legal) src/dst type
2261 // conversions and generic, legalized types. We test for the custom cases
2262 // first, before falling back to legalization.
2263 // FIXME: Need a better design of the cost table to handle non-simple types
2264 // and the potentially massive number of combinations (elem_num x src_type x dst_type).
2265 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2266 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2267 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2268
2269 // Mask sign extend has an instruction.
2270 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2271 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2272 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2273 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2274 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2275 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2276 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2277 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2278 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2279 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2280 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2281 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2282 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2283 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2284 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2285 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2286 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2287
2288 // Mask zero extend is a sext + shift.
2289 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2290 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2291 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2292 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2293 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2294 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2295 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2296 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2297 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2298 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2299 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2300 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2301 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2302 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2303 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2304 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2305 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2306
2307 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2308 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2309 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2310 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2311 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2312 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2313 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2314 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2315 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2316 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2317 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2318 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2319 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2320 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2321 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2322 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2323 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2324
2325 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2326 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2327 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2328 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2329 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2330 };
2331
2332 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2333 // Mask sign extend has an instruction.
2334 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2342
2343 // Mask zero extend is a sext + shift.
2344 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2345 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2346 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2347 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2348 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2349 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2350 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2351 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2352
2353 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2354 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2355 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2356 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2357 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2358 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2359 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2360 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2361
2362 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2363 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2364
2365 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2366 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2367
2368 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2369 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2370
2371 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2372 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2373 };
2374
2375 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2376 // 256-bit wide vectors.
2377
2378 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2379 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2380 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2381 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2382 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2383 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2384 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2385 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2386
2387 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2388 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2389 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2390 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2391 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2392 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2393 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2394 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2395 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2396 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2397 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2398 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2399 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2400 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2401 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2402 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2403 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2404 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2405 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2406 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2407 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2408 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2409 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2410 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2411 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2412 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2413 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2414 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2415 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2416 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2417 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2418 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2419 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2420 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2421
2422 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2423 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2424 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2425
2426 // Sign extend is zmm vpternlogd+vptruncdb.
2427 // Zero extend is zmm broadcast load+vptruncdw.
2428 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2429 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2430 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2431 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2432 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2433 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2434 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2435 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2436
2437 // Sign extend is zmm vpternlogd+vptruncdw.
2438 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2439 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2440 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2441 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2442 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2443 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2444 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2446 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2447
2448 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2449 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2450 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2451 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2452 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2453 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2454 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2455 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2456 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2457 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2458
2459 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2460 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2461 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2462 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2463
2464 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2465 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2466 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2467 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2468 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2469 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2470 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2471 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2472 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2473 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2474
2475 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2476 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2477
2478 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2479 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2480 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2481 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2482 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2483 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2484 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2485 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2486
2487 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2488 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2489 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2490 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2491 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2492 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2493 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2494 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2495 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2496 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2497
2498 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2499 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2500 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2501 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2502 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2503 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2504 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2505 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2506 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2507 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2508 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2509
2510 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2511 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2512 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2513 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2514 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2515 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2516 };
2517
2518 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2519 // Mask sign extend has an instruction.
2520 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2522 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2524 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2526 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2528 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2529 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2530 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2531 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2533 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2535 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2536 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2537
2538 // Mask zero extend is a sext + shift.
2539 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2540 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2542 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2544 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2546 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2547 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2548 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2549 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2550 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2551 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2552 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2553 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2554 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2555 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2556
2557 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2558 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2559 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2560 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2561 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2562 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2563 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2564 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2565 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2566 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2567 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2568 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2569 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2570 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2571 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2572 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2573 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2574
2575 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2576 };
2577
2578 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2579 // Mask sign extend has an instruction.
2580 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2588
2589 // Mask zero extend is a sext + shift.
2590 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2591 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2592 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2593 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2594 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2598
2599 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2600 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2601 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2602 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2603 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2604 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2605 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2606 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2607
2608 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2609 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2610 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2611 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2612
2613 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2614 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2615 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2616 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2617
2618 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2619 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2620 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2621 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2622
2623 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2624 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2625 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2626 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2627 };
2628
2629 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2630 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2631 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2632 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2633 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2634 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2635 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2636 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2637 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2638 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2639 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2640 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2641 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2642 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2643 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2644 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2645 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2646 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2647 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2648
2649 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2650 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2651 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2653 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2655 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2659
2660 // sign extend is vpcmpeq+maskedmove+vpmovdw
2661 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2662 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2664 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2665 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2666 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2667 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2668 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2669 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2670
2671 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2672 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2675 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2676 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2677 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2678 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2679
2680 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2681 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2682 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2683 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2684
2685 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2686 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2687 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2688 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2689 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2690 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2691 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2692 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2693 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2694 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2695 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2696 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2697
2698 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2699 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2700 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2701 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2702
2703 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2704 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2705 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2706 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2707 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2708 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2710 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2711 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2712 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2714 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2715 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2716
2717 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2718 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2719 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2720
2721 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2722 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2723 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2724 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2725 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2726 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2727 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2728 };
2729
2730 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2731 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2732 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2733 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2734 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2735 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2736 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2737
2738 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2739 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2740 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2748 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2749 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2752
2753 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2754
2755 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2756 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2757 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2758 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2759 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2760 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2761 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2762 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2763 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2764 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2765 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2766 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2767
2768 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2769 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2770
2771 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2772 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2773 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2774 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2775
2776 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2781 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2783 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2784
2785 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2786 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2787 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2788 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2789 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2790 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2791 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2792
2793 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2796 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2797 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2798 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2799 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2800 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2801 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2802 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2803 };
2804
2805 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2806 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2807 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2808 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2809 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2810 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2811 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2812
2813 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2814 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2815 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2816 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2817 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2818 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2819 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2820 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2821 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2822 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2823 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2825
2826 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2827 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2828 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2829 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2830 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2831
2832 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2833 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2834 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2835 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2836 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2837 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2838 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2839 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2840
2841 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2847 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2849 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2850 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2851 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2852 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2853
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2864 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2865 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2866 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2867 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2868 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2869 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2870 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2871
2872 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2875 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2876 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2877 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2878 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2879 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2880 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2881 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2882 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2883
2884 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2886 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2888 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2889 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2890 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2891 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2892 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2893 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2894 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2895 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2896 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2897
2898 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2899 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2900 };
2901
2902 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2903 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2904 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2905 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2906 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2907 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2908 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2909 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2910 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2911 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2912 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2913 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2914 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2915
2916 // These truncates end up widening elements.
2917 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2918 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2919 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2920
2921 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2922 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2923 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2924
2925 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2926 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2927 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2928 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2929 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2930 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2931 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2932 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2933 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2934 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2935 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2936
2937 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2938 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2939 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2940 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2941 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2942 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2943 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2944 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2945 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2946 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2947 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2948 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2949 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2950 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2951
2952 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2953 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2955 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2956 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2957 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2958 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2959 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2960 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2961 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2962
2963 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2964 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2965 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2966 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2967 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2968 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2969 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2970 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2971 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2972 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2973 };
2974
2975 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2976 // These are somewhat magic numbers justified by comparing the
2977 // output of llvm-mca for our various supported scheduler models
2978 // and basing it off the worst case scenario.
2979 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2980 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2991
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3005
3006 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3007 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3016
3017 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3018 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3027
3028 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3029 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3030 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3031 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3032 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3033 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3034 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3035 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3036 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3037 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3038 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3039 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3040
3041 // These truncates are really widening elements.
3042 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3043 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3044 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3045 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3046 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3047 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3048
3049 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3050 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3051 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3052 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3053 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3054 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3055 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3056 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3057 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3058 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3059 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3060 };
3061
3062 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3063 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3064 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3065 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3066 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3067 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3068 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3069 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3070 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3071 };
3072
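 // Note on lookup order: the tables above are consulted from the most
 // specific feature set to the least (AVX512BW/DQ/F on 512-bit registers,
 // then their VL variants, AVX2, AVX, F16C, SSE4.1, SSE2), so the first
 // matching entry for the current subtarget wins.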
3073 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3074 EVT SrcTy = TLI->getValueType(DL, Src);
3075 EVT DstTy = TLI->getValueType(DL, Dst);
3076
3077 // The function getSimpleVT only handles simple value types.
3078 if (SrcTy.isSimple() && DstTy.isSimple()) {
3079 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3080 MVT SimpleDstTy = DstTy.getSimpleVT();
3081
3082 if (ST->useAVX512Regs()) {
3083 if (ST->hasBWI())
3084 if (const auto *Entry = ConvertCostTableLookup(
3085 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3086 if (auto KindCost = Entry->Cost[CostKind])
3087 return *KindCost;
3088
3089 if (ST->hasDQI())
3090 if (const auto *Entry = ConvertCostTableLookup(
3091 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3092 if (auto KindCost = Entry->Cost[CostKind])
3093 return *KindCost;
3094
3095 if (ST->hasAVX512())
3096 if (const auto *Entry = ConvertCostTableLookup(
3097 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3098 if (auto KindCost = Entry->Cost[CostKind])
3099 return *KindCost;
3100 }
3101
3102 if (ST->hasBWI())
3103 if (const auto *Entry = ConvertCostTableLookup(
3104 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3105 if (auto KindCost = Entry->Cost[CostKind])
3106 return *KindCost;
3107
3108 if (ST->hasDQI())
3109 if (const auto *Entry = ConvertCostTableLookup(
3110 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3111 if (auto KindCost = Entry->Cost[CostKind])
3112 return *KindCost;
3113
3114 if (ST->hasAVX512())
3115 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3116 SimpleDstTy, SimpleSrcTy))
3117 if (auto KindCost = Entry->Cost[CostKind])
3118 return *KindCost;
3119
3120 if (ST->hasAVX2()) {
3121 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3122 SimpleDstTy, SimpleSrcTy))
3123 if (auto KindCost = Entry->Cost[CostKind])
3124 return *KindCost;
3125 }
3126
3127 if (ST->hasAVX()) {
3128 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3129 SimpleDstTy, SimpleSrcTy))
3130 if (auto KindCost = Entry->Cost[CostKind])
3131 return *KindCost;
3132 }
3133
3134 if (ST->hasF16C()) {
3135 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3136 SimpleDstTy, SimpleSrcTy))
3137 if (auto KindCost = Entry->Cost[CostKind])
3138 return *KindCost;
3139 }
3140
3141 if (ST->hasSSE41()) {
3142 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3143 SimpleDstTy, SimpleSrcTy))
3144 if (auto KindCost = Entry->Cost[CostKind])
3145 return *KindCost;
3146 }
3147
3148 if (ST->hasSSE2()) {
3149 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3150 SimpleDstTy, SimpleSrcTy))
3151 if (auto KindCost = Entry->Cost[CostKind])
3152 return *KindCost;
3153 }
3154
3155 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3156 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3157 // fp16 conversions not covered by any table entries require a libcall.
3158 // Return a large (arbitrary) number to model this.
3159 return InstructionCost(64);
3160 }
3161 }
3162
3163 // Fall back to legalized types.
3164 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3165 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3166
3167 // If we're truncating to the same legalized type - just assume it's free.
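 // (For example, a trunc between two illegal narrow element types that both
 // promote to the same wider vector type leaves no work to do.)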
3168 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3169 return TTI::TCC_Free;
3170
3171 if (ST->useAVX512Regs()) {
3172 if (ST->hasBWI())
3173 if (const auto *Entry = ConvertCostTableLookup(
3174 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3175 if (auto KindCost = Entry->Cost[CostKind])
3176 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3177
3178 if (ST->hasDQI())
3179 if (const auto *Entry = ConvertCostTableLookup(
3180 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3181 if (auto KindCost = Entry->Cost[CostKind])
3182 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3183
3184 if (ST->hasAVX512())
3185 if (const auto *Entry = ConvertCostTableLookup(
3186 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3187 if (auto KindCost = Entry->Cost[CostKind])
3188 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3189 }
3190
3191 if (ST->hasBWI())
3192 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3193 LTDest.second, LTSrc.second))
3194 if (auto KindCost = Entry->Cost[CostKind])
3195 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3196
3197 if (ST->hasDQI())
3198 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3199 LTDest.second, LTSrc.second))
3200 if (auto KindCost = Entry->Cost[CostKind])
3201 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3202
3203 if (ST->hasAVX512())
3204 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3205 LTDest.second, LTSrc.second))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3208
3209 if (ST->hasAVX2())
3210 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3211 LTDest.second, LTSrc.second))
3212 if (auto KindCost = Entry->Cost[CostKind])
3213 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3214
3215 if (ST->hasAVX())
3216 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3217 LTDest.second, LTSrc.second))
3218 if (auto KindCost = Entry->Cost[CostKind])
3219 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3220
3221 if (ST->hasF16C()) {
3222 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3223 LTDest.second, LTSrc.second))
3224 if (auto KindCost = Entry->Cost[CostKind])
3225 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3226 }
3227
3228 if (ST->hasSSE41())
3229 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3230 LTDest.second, LTSrc.second))
3231 if (auto KindCost = Entry->Cost[CostKind])
3232 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3233
3234 if (ST->hasSSE2())
3235 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3236 LTDest.second, LTSrc.second))
3237 if (auto KindCost = Entry->Cost[CostKind])
3238 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3239
3240 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source
3241 // to i32 first and cost the conversion from i32 instead.
3242 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3243 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3244 Type *ExtSrc = Src->getWithNewBitWidth(32);
3245 unsigned ExtOpc =
3246 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3247
3248 // For scalar loads the extend would be free.
3249 InstructionCost ExtCost = 0;
3250 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3251 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3252
3253 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3254 TTI::CastContextHint::None, CostKind);
3255 }
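 // For example, sitofp <4 x i16> -> <4 x float> is costed as
 // sext <4 x i16> -> <4 x i32> plus sitofp <4 x i32> -> <4 x float>.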
3256
3257 // Fallback: for fptosi/fptoui to i8/i16 we convert to i32 first and then
3258 // truncate the result.
3259 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3260 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3261 Type *TruncDst = Dst->getWithNewBitWidth(32);
3262 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3263 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3264 TTI::CastContextHint::None, CostKind);
3265 }
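 // For example, fptosi <4 x float> -> <4 x i8> is costed as
 // fptosi <4 x float> -> <4 x i32> plus trunc <4 x i32> -> <4 x i8>.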
3266
3267 // TODO: Allow non-throughput costs that aren't binary.
3268 auto AdjustCost = [&CostKind](InstructionCost Cost,
3269 InstructionCost N = 1) -> InstructionCost {
3270 if (CostKind != TTI::TCK_RecipThroughput)
3271 return Cost == 0 ? 0 : N;
3272 return Cost * N;
3273 };
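 // Note: for the non-throughput cost kinds this is currently a binary model -
 // any cast the base implementation considers non-free is charged a flat N.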
3274 return AdjustCost(
3275 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3276}
3277
3278InstructionCost X86TTIImpl::getCmpSelInstrCost(
3279 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3280 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3281 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3282 // Early out if this type isn't scalar/vector integer/float.
3283 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3284 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3285 Op1Info, Op2Info, I);
3286
3287 // Legalize the type.
3288 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3289
3290 MVT MTy = LT.second;
3291
3292 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3293 assert(ISD && "Invalid opcode");
3294
3295 InstructionCost ExtraCost = 0;
3296 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3297 // Some vector comparison predicates cost extra instructions.
3298 // TODO: Adjust ExtraCost based on CostKind?
3299 // TODO: Should we invert this and assume worst case cmp costs
3300 // and reduce for particular predicates?
3301 if (MTy.isVector() &&
3302 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3303 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3304 ST->hasBWI())) {
3305 // Fall back to the predicate of I if a specific one wasn't supplied.
3306 CmpInst::Predicate Pred = VecPred;
3307 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3308 Pred == CmpInst::BAD_FCMP_PREDICATE))
3309 Pred = cast<CmpInst>(I)->getPredicate();
3310
3311 bool CmpWithConstant = false;
3312 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3313 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3314
3315 switch (Pred) {
3316 case CmpInst::Predicate::ICMP_NE:
3317 // xor(cmpeq(x,y),-1)
3318 ExtraCost = CmpWithConstant ? 0 : 1;
3319 break;
3320 case CmpInst::Predicate::ICMP_SGE:
3321 case CmpInst::Predicate::ICMP_SLE:
3322 // xor(cmpgt(x,y),-1)
3323 ExtraCost = CmpWithConstant ? 0 : 1;
3324 break;
3325 case CmpInst::Predicate::ICMP_UGT:
3326 case CmpInst::Predicate::ICMP_ULT:
3327 // cmpgt(xor(x,signbit),xor(y,signbit))
3328 // xor(cmpeq(pmaxu(x,y),x),-1)
3329 ExtraCost = CmpWithConstant ? 1 : 2;
3330 break;
3331 case CmpInst::Predicate::ICMP_UGE:
3332 case CmpInst::Predicate::ICMP_ULE:
3333 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3334 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3335 // cmpeq(psubus(x,y),0)
3336 // cmpeq(pminu(x,y),x)
3337 ExtraCost = 1;
3338 } else {
3339 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3340 ExtraCost = CmpWithConstant ? 2 : 3;
3341 }
3342 break;
3343 case CmpInst::Predicate::FCMP_ONE:
3344 case CmpInst::Predicate::FCMP_UEQ:
3345 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3346 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3347 if (CondTy && !ST->hasAVX())
3348 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3349 CmpInst::Predicate::FCMP_UNO, CostKind,
3350 Op1Info, Op2Info) +
3351 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3352 CmpInst::Predicate::FCMP_OEQ, CostKind,
3353 Op1Info, Op2Info) +
3354 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3355
3356 break;
3357 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3358 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3359 // Assume worst case scenario and add the maximum extra cost.
3360 ExtraCost = 3;
3361 break;
3362 default:
3363 break;
3364 }
3365 }
3366 }
3367
3368 static const CostKindTblEntry SLMCostTbl[] = {
3369 // slm pcmpeq/pcmpgt throughput is 2
3370 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3371 // slm pblendvb/blendvpd/blendvps throughput is 4
3372 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3373 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3374 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3375 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3376 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3377 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3378 };
3379
3380 static const CostKindTblEntry AVX512BWCostTbl[] = {
3381 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3382 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3383 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3384 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3385
3386 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3387 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3388 };
3389
3390 static const CostKindTblEntry AVX512CostTbl[] = {
3391 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3392 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3393 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3394 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3395
3396 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3397 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3398 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3399 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3400 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3401 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3402 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3403
3404 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3405 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3406 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3407 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3408 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3409 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3410 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3411 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3412 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3413 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3414 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3415 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3416 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3417 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3418
3419 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3420 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3421 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3422 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3423 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3424 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3425 };
3426
3427 static const CostKindTblEntry AVX2CostTbl[] = {
3428 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3429 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3430 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3431 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3432 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3433 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3434
3435 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3436 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3437 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3438 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3439
3440 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3441 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3442 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3443 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3444 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3445 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3446 };
3447
3448 static const CostKindTblEntry XOPCostTbl[] = {
3449 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3450 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3451 };
3452
3453 static const CostKindTblEntry AVX1CostTbl[] = {
3454 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3455 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3456 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3457 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3458 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3459 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3460
3461 // AVX1 does not support 8-wide integer compare.
3462 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3463 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3464 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3465 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3466
3467 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3468 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3469 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3470 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3471 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3472 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3473 };
3474
3475 static const CostKindTblEntry SSE42CostTbl[] = {
3476 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3477 };
3478
3479 static const CostKindTblEntry SSE41CostTbl[] = {
3480 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3481 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3482
3483 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3484 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3485 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3486 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3487 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3488 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3489 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3490 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3491 };
3492
3493 static const CostKindTblEntry SSE2CostTbl[] = {
3494 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3495 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3496
3497 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3498 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3499 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3500 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3501
3502 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3503 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3504 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3505 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3506 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3507 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3508 };
3509
3510 static const CostKindTblEntry SSE1CostTbl[] = {
3511 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3512 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3513
3514 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3515 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3516 };
3517
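 // As with the conversion tables above, consult the most specific subtarget
 // table first; the per-vector entry (plus any predicate-expansion ExtraCost)
 // is scaled by the number of vectors produced by legalization (LT.first).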
3518 if (ST->useSLMArithCosts())
3519 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3520 if (auto KindCost = Entry->Cost[CostKind])
3521 return LT.first * (ExtraCost + *KindCost);
3522
3523 if (ST->hasBWI())
3524 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3525 if (auto KindCost = Entry->Cost[CostKind])
3526 return LT.first * (ExtraCost + *KindCost);
3527
3528 if (ST->hasAVX512())
3529 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3530 if (auto KindCost = Entry->Cost[CostKind])
3531 return LT.first * (ExtraCost + *KindCost);
3532
3533 if (ST->hasAVX2())
3534 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3535 if (auto KindCost = Entry->Cost[CostKind])
3536 return LT.first * (ExtraCost + *KindCost);
3537
3538 if (ST->hasXOP())
3539 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3540 if (auto KindCost = Entry->Cost[CostKind])
3541 return LT.first * (ExtraCost + *KindCost);
3542
3543 if (ST->hasAVX())
3544 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3545 if (auto KindCost = Entry->Cost[CostKind])
3546 return LT.first * (ExtraCost + *KindCost);
3547
3548 if (ST->hasSSE42())
3549 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3550 if (auto KindCost = Entry->Cost[CostKind])
3551 return LT.first * (ExtraCost + *KindCost);
3552
3553 if (ST->hasSSE41())
3554 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3555 if (auto KindCost = Entry->Cost[CostKind])
3556 return LT.first * (ExtraCost + *KindCost);
3557
3558 if (ST->hasSSE2())
3559 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3560 if (auto KindCost = Entry->Cost[CostKind])
3561 return LT.first * (ExtraCost + *KindCost);
3562
3563 if (ST->hasSSE1())
3564 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3565 if (auto KindCost = Entry->Cost[CostKind])
3566 return LT.first * (ExtraCost + *KindCost);
3567
3568 // Assume a 3cy latency for fp select ops.
3569 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3570 if (ValTy->getScalarType()->isFloatingPointTy())
3571 return 3;
3572
3573 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3574 Op1Info, Op2Info, I);
3575}
3576
3577InstructionCost
3578X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3579                                  TTI::TargetCostKind CostKind) {
3582 // Costs should match the codegen from:
3583 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3584 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3585 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3586 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3587 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3588
3589 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3590 // specialized in these tables yet.
3591 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3592 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3593 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3594 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3595 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3596 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3597 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3598 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3599 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3600 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3601 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3602 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3603 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3604 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3605 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3606 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3607 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3608 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3609 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3610 };
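// Illustrative note: VBMI2's VPSHLDV/VPSHRDV implement funnel shifts
// directly, and a rotate is just a funnel shift with both data operands
// tied, e.g.
//   %r = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a,
//                                        <8 x i16> %s)
// maps to a single instruction, hence the uniform { 1, 1, 1, 1 } rows above.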
3611 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3612 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3613 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3614 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3615 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3616 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3617 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3618 };
3619 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3620 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3621 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3622 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3623 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3624 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3625 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3626 };
3627 static const CostKindTblEntry AVX512CDCostTbl[] = {
3628 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3629 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3630 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3631 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3632 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3633 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3634 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3635 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3636 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3637 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3638 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3639 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3640
3641 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3642 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3643 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3644 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3645 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3646 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3647 };
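// Illustrative note: AVX512CD's VPLZCNTD/Q makes per-element CTLZ on
// i32/i64 a single instruction; the CTTZ rows reuse it on a lowest-set-bit
// mask, roughly cttz(x) = bitwidth - lzcnt(~x & (x - 1)), which accounts
// for their extra cost relative to CTLZ.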
3648 static const CostKindTblEntry AVX512BWCostTbl[] = {
3649 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3650 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3651 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3652 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3653 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3654 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3655 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3656 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3657 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3658 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3659 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3660 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3661 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3662 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3663 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3664 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3665 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3666 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3667 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3668 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3669 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3670 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3671 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3672 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3673 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3674 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3675 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3676 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3677 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3678 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3679 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3680 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3681 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3682 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3683 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3684 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3685 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3686 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3687 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3688 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3689 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3690 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3691 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3692 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3693 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3694 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3695 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3696 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3697 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3698 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3699 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3700 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3701 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3702 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3703 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3704 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3705 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3706 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3707 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3708 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3709 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3710 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3711 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3712 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3713 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3714 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3715 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3716 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3717 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3718 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3719 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3720 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3721 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3722 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3723 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3724 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3725 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3726 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3727 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3728 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3729 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3730 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3731 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3732 };
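// Illustrative sketch (schematic pseudo-ops, not upstream code): lacking
// VPOPCNTDQ/BITALG, the CTPOP rows above follow the classic PSHUFB
// nibble-LUT expansion:
//   lo   = x & 0x0f;  hi = (x >> 4) & 0x0f
//   cnt8 = pshufb(LUT, lo) + pshufb(LUT, hi)   ; per-byte popcount
// plus psadbw/pmaddubsw-style widening steps for i16/i32/i64 elements.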
3733 static const CostKindTblEntry AVX512CostTbl[] = {
3734 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3735 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3736 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3737 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3738 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3739 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3740 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3741 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3742 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3743 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3744 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3745 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3746 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3747 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3748 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3749 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3750 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3751 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3752 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3753 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3754 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3755 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3756 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3757 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3758 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3759 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3760 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3761 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3762 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3763 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3764 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3765 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3766 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3767 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3768 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3769 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3770 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3771 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3772 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3773 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3774 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3775 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3776 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3777 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3778 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3779 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3780 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3781 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3782 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3783 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3784 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3785 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3786 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3787 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3788 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3789 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3790 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3791 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3792 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3793 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3794 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3795 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3796 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3797 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3798 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3799 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3800 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3801 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3802 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3803 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3804 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3805 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3806 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3807 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3808 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3809 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3810 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3811 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3812 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3813 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3814 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3815 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3816 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3817 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3818 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3819 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3820 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3821 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3822 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3823 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3824 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3825 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3826 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3827 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3828 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3829 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3830 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3831 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3832 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3833 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3834 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3835 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3836 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3837 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3838 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3839 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3840 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3841 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3842 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3843 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3844 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3845 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3846 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3847 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3848 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3849 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3850 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3851 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3852 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3853 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3854 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3855 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3856 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3857 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3858 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3859 };
3860 static const CostKindTblEntry XOPCostTbl[] = {
3861 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3862 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3863 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3864 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3865 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3866 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3867 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3868 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3869 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3870 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3871 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3872 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3873 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3874 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3875 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3876 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3877 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3878 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3879 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3880 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3881 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3882 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3883 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3884 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3885 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3886 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3887 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3888 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3889 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3890 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3891 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3892 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3893 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3894 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3895 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3896 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3897 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3898 };
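// Illustrative sketch (registers arbitrary): the variable ROTR rows above
// cost more than ROTL because XOP's VPROT only rotates left by a signed
// per-element amount, so ROTR must negate the amount first, e.g. for v4i32:
//   vpxor   %xmm2, %xmm2, %xmm2
//   vpsubd  %xmm1, %xmm2, %xmm1   ; amt = 0 - amt
//   vprotd  %xmm1, %xmm0, %xmm0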
3899 static const CostKindTblEntry AVX2CostTbl[] = {
3900 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3901 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3902 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3903 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3904 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3905 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3906 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3907 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3908 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3909 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3910 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3911 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3912 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3913 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3914 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3915 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3916 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3917 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3918 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3919 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3920 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3921 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3922 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3923 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3924 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3925 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3926 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3927 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3928 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3929 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3930 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3931 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3932 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3933 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3934 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3935 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3936 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3937 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3938 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3939 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3940 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3941 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3942 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3943 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3944 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3945 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3946 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3947 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3948 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3949 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3950 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3951 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3952 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3953 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3954 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3955 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3956 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3957 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3958 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3959 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3960 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3961 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3962 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3963 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3964 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3965 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3966 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3967 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3968 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3969 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3970 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3971 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3972 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3973 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3974 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3975 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3976 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3977 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3978 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3979 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3980 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3981 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3982 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3983 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3984 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3985 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3986 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3987 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3988 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3989 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3990 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3991 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
3992 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
3993 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
3994 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
3995 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
3996 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
3997 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
3998 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
3999 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4000 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4001 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4002 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4003 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4004 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4005 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4006 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4007 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4008 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4009 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4010 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4011 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4012 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4013 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4014 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4015 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4016 };
4017 static const CostKindTblEntry AVX1CostTbl[] = {
4018 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4019 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4020 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4021 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4022 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4023 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4024 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4025 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4026 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4027 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4028 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4029 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4030 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4031 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4032 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4033 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4034 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4035 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4036 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4037 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4038 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4039 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4040 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4041 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4042 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4043 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4044 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4045 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4046 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4047 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4048 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4049 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4050 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4051 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4052 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4053 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4054 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4055 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4056 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4057 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4058 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4059 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4060 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4061 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4062 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4063 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4064 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4065 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4066 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4067 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4068 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4069 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4070 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4071 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4072 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4073 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4074 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4075 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4076 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4077 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4078 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4079 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4080 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4081 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4082 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4083 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4084 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4088 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4089 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4090 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4091 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4093 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4095 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4097 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4099 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4100 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4101 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4103 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4104 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4105 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4106 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4107 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4108 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4109 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4110 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4111 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4112 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4113 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4116 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4118 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4119 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4120 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4121 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4122 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4123 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4124 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4125 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4126 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4127 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4128 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4129 };
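// Illustrative note: each "2 x 128-bit Op + extract/insert" row above is
// roughly twice the matching 128-bit row plus vextractf128/vinsertf128
// glue, e.g. CTPOP v4i64 throughput 14 == 2 * 7 from the v2i64 row.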
4130 static const CostKindTblEntry GFNICostTbl[] = {
4131 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4132 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4133 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4134 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4135 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4136 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4137 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4138 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4139 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4140 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4141 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4142 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4143 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4144 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4145 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4146 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4147 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4148 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4149 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4150 };
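// Illustrative note: GFNI bit reversal multiplies each byte by a constant
// 8x8 bit matrix via GF2P8AFFINEQB, so vXi8 needs a single instruction;
// wider element types prepend a byte-reversing shuffle, which shows up in
// the slightly larger costs of the i16/i32/i64 rows above.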
4151 static const CostKindTblEntry GLMCostTbl[] = {
4152 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4153 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4154 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4155 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4156 };
4157 static const CostKindTblEntry SLMCostTbl[] = {
4158 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4159 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4160 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4161 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4162 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4163 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4164 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4165 };
4166 static const CostKindTblEntry SSE42CostTbl[] = {
4167 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4168 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4169 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4170 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4171 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4172 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4173 };
4174 static const CostKindTblEntry SSE41CostTbl[] = {
4175 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4176 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4177 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4178 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4179 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4180 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4181 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4182 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4183 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4184 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4185 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4186 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4187 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4188 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4189 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4190 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4191 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4192 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4193 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4194 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4195 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4196 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4197 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4198 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4199 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4200 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4201 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4202 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4203 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4204 };
4205 static const CostKindTblEntry SSSE3CostTbl[] = {
4206 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4207 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4208 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4209 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4210 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4211 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4212 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4213 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4214 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4215 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4216 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4217 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4218 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4219 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4220 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4221 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4222 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4223 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4224 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4225 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4226 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4227 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4228 };
4229 static const CostKindTblEntry SSE2CostTbl[] = {
4230 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4231 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4232 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4233 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4234 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4235 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4236 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4237 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4238 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4239 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4240 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4241 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4242 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4243 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4244 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4245 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4246 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4247 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4248 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4249 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4250 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4251 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4252 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4253 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4254 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4255 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4256 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4257 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4258 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4259 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4260 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4261 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4262 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4263 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4264 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4265 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4266 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4267 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4268 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4269 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4270 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4271 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4272 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4273 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4274 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4275 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4276 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4277 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4278 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4279 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4280 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4281 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4282 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4283 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4284 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4285 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4286 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4287 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4288 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4289 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4290 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4291 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4292 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4293 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4294 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4295 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4296 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4297 };
4298 static const CostKindTblEntry SSE1CostTbl[] = {
4299 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4300 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4301 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4302 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4303 };
4304 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4305 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4306 };
4307 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4308 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4309 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4310 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4311 };
4312 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4313 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4314 };
4315 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4316 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4317 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4318 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4319 };
4320 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4321 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4322 };
4323 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4324 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4325 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4326 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4327 };
4328 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4329 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4330 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4331 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4332 { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4333 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4334 { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
4335 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4336 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4337 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4338 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4339 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4340 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4341 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4342 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4343 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4344 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4345 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4346 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4347 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4348 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4349 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4350 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4351 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4352 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4353 };
4354 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4355 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4356 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4357 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4358 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4359 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4360 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4361 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4362 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4363 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4364 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4365 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4366 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4367 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4368 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4369 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4370 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4371 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4372 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4373 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4374 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4375 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4376 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4377 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4378 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4379 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4380 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4381 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4382 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4383 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4384 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4385 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4386 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4387 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4388 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4389 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4390 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4391 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4392 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4393 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4394 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4395 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4396 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4397 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4398 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4399 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4400 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4401 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4402 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4403 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4404 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4405 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4406 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4407 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4408 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4409 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4410 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4411 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4412 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4413 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4414 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4415 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4416 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4417 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4418 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4419 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4420 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4421 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4422 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4423 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4424 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4425 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4426 };
4427
4428 Type *RetTy = ICA.getReturnType();
4429 Type *OpTy = RetTy;
4430 Intrinsic::ID IID = ICA.getID();
4431 unsigned ISD = ISD::DELETED_NODE;
4432 switch (IID) {
4433 default:
4434 break;
4435 case Intrinsic::abs:
4436 ISD = ISD::ABS;
4437 break;
4438 case Intrinsic::bitreverse:
4439 ISD = ISD::BITREVERSE;
4440 break;
4441 case Intrinsic::bswap:
4442 ISD = ISD::BSWAP;
4443 break;
4444 case Intrinsic::ctlz:
4445 ISD = ISD::CTLZ;
4446 break;
4447 case Intrinsic::ctpop:
4448 ISD = ISD::CTPOP;
4449 break;
4450 case Intrinsic::cttz:
4451 ISD = ISD::CTTZ;
4452 break;
4453 case Intrinsic::fshl:
4454 ISD = ISD::FSHL;
4455 if (!ICA.isTypeBasedOnly()) {
4456 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4457 if (Args[0] == Args[1]) {
4458 ISD = ISD::ROTL;
4459 // Handle uniform constant rotation amounts.
4460 // TODO: Handle funnel-shift cases.
4461 const APInt *Amt;
4462 if (Args[2] &&
4463 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4464 ISD = X86ISD::VROTLI;
4465 }
4466 }
4467 break;
4468 case Intrinsic::fshr:
4469 // FSHR has the same costs, so don't duplicate.
4470 ISD = ISD::FSHL;
4471 if (!ICA.isTypeBasedOnly()) {
4472 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4473 if (Args[0] == Args[1]) {
4474 ISD = ISD::ROTR;
4475 // Handle uniform constant rotation amount.
4476 // TODO: Handle funnel-shift cases.
4477 const APInt *Amt;
4478 if (Args[2] &&
4479 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4480 ISD = X86ISD::VROTLI;
4481 }
4482 }
4483 break;
4484 case Intrinsic::lrint:
4485 case Intrinsic::llrint:
4486 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4487 // have the same costs as the CVTTP2SI (fptosi) instructions.
4488 if (!ICA.isTypeBasedOnly()) {
4489 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4490 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4491 TTI::CastContextHint::None, CostKind);
4492 }
4493 break;
4494 case Intrinsic::maxnum:
4495 case Intrinsic::minnum:
4496 // FMINNUM has the same costs, so don't duplicate.
4497 ISD = ISD::FMAXNUM;
4498 break;
4499 case Intrinsic::sadd_sat:
4500 ISD = ISD::SADDSAT;
4501 break;
4502 case Intrinsic::smax:
4503 ISD = ISD::SMAX;
4504 break;
4505 case Intrinsic::smin:
4506 ISD = ISD::SMIN;
4507 break;
4508 case Intrinsic::ssub_sat:
4509 ISD = ISD::SSUBSAT;
4510 break;
4511 case Intrinsic::uadd_sat:
4512 ISD = ISD::UADDSAT;
4513 break;
4514 case Intrinsic::umax:
4515 ISD = ISD::UMAX;
4516 break;
4517 case Intrinsic::umin:
4518 ISD = ISD::UMIN;
4519 break;
4520 case Intrinsic::usub_sat:
4521 ISD = ISD::USUBSAT;
4522 break;
4523 case Intrinsic::sqrt:
4524 ISD = ISD::FSQRT;
4525 break;
4526 case Intrinsic::sadd_with_overflow:
4527 case Intrinsic::ssub_with_overflow:
4528 // SSUBO has the same costs, so don't duplicate.
4529 ISD = ISD::SADDO;
4530 OpTy = RetTy->getContainedType(0);
4531 break;
4532 case Intrinsic::uadd_with_overflow:
4533 case Intrinsic::usub_with_overflow:
4534 // USUBO has the same costs, so don't duplicate.
4535 ISD = ISD::UADDO;
4536 OpTy = RetTy->getContainedType(0);
4537 break;
4538 case Intrinsic::smul_with_overflow:
4539 ISD = ISD::SMULO;
4540 OpTy = RetTy->getContainedType(0);
4541 break;
4542 case Intrinsic::umul_with_overflow:
4543 ISD = ISD::UMULO;
4544 OpTy = RetTy->getContainedType(0);
4545 break;
4546 }
4547
4548 if (ISD != ISD::DELETED_NODE) {
4549 auto adjustTableCost = [&](int ISD, unsigned Cost,
4550 std::pair<InstructionCost, MVT> LT,
4551 FastMathFlags FMF) {
4552 InstructionCost LegalizationCost = LT.first;
4553 MVT MTy = LT.second;
4554
4555 // If there are no NaNs to deal with, then these are reduced to a
4556 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4557 // assume is used in the non-fast case.
4558 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4559 if (FMF.noNaNs())
4560 return LegalizationCost * 1;
4561 }
4562
4563 // For cases where some ops can be folded into a load/store, assume free.
4564 if (MTy.isScalarInteger()) {
4565 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4566 if (const Instruction *II = ICA.getInst()) {
4567 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4568 return TTI::TCC_Free;
4569 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4570 if (LI->hasOneUse())
4571 return TTI::TCC_Free;
4572 }
4573 }
4574 }
4575 }
4576
4577 return LegalizationCost * (int)Cost;
4578 };
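// Illustrative walk-through: for llvm.ctpop.v8i32 on an AVX2 target with
// TCK_RecipThroughput, type legalization yields LT = {1, v8i32}, the
// AVX2CostTbl entry gives 7, and the lookup chain below returns 1 * 7.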
4579
4580 // Legalize the type.
4581 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4582 MVT MTy = LT.second;
4583
4584 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4585 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4586 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4587 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4588 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4589 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4590 if (Cst->isAllOnesValue())
4591 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4592 }
4593
4594 // FSQRT is a single instruction.
4595 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4596 return LT.first;
4597
4598 if (ST->useGLMDivSqrtCosts())
4599 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4600 if (auto KindCost = Entry->Cost[CostKind])
4601 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4602
4603 if (ST->useSLMArithCosts())
4604 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4605 if (auto KindCost = Entry->Cost[CostKind])
4606 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4607
4608 if (ST->hasVBMI2())
4609 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4610 if (auto KindCost = Entry->Cost[CostKind])
4611 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4612
4613 if (ST->hasBITALG())
4614 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4615 if (auto KindCost = Entry->Cost[CostKind])
4616 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4617
4618 if (ST->hasVPOPCNTDQ())
4619 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4620 if (auto KindCost = Entry->Cost[CostKind])
4621 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4622
4623 if (ST->hasGFNI())
4624 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4625 if (auto KindCost = Entry->Cost[CostKind])
4626 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4627
4628 if (ST->hasCDI())
4629 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4630 if (auto KindCost = Entry->Cost[CostKind])
4631 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4632
4633 if (ST->hasBWI())
4634 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4635 if (auto KindCost = Entry->Cost[CostKind])
4636 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4637
4638 if (ST->hasAVX512())
4639 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4640 if (auto KindCost = Entry->Cost[CostKind])
4641 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4642
4643 if (ST->hasXOP())
4644 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4645 if (auto KindCost = Entry->Cost[CostKind])
4646 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4647
4648 if (ST->hasAVX2())
4649 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4650 if (auto KindCost = Entry->Cost[CostKind])
4651 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4652
4653 if (ST->hasAVX())
4654 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4655 if (auto KindCost = Entry->Cost[CostKind])
4656 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4657
4658 if (ST->hasSSE42())
4659 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4660 if (auto KindCost = Entry->Cost[CostKind])
4661 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4662
4663 if (ST->hasSSE41())
4664 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4665 if (auto KindCost = Entry->Cost[CostKind])
4666 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4667
4668 if (ST->hasSSSE3())
4669 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4670 if (auto KindCost = Entry->Cost[CostKind])
4671 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4672
4673 if (ST->hasSSE2())
4674 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4675 if (auto KindCost = Entry->Cost[CostKind])
4676 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4677
4678 if (ST->hasSSE1())
4679 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4680 if (auto KindCost = Entry->Cost[CostKind])
4681 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4682
4683 if (ST->hasBMI()) {
4684 if (ST->is64Bit())
4685 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4686 if (auto KindCost = Entry->Cost[CostKind])
4687 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688
4689 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4690 if (auto KindCost = Entry->Cost[CostKind])
4691 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4692 }
4693
4694 if (ST->hasLZCNT()) {
4695 if (ST->is64Bit())
4696 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4697 if (auto KindCost = Entry->Cost[CostKind])
4698 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4699
4700 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4701 if (auto KindCost = Entry->Cost[CostKind])
4702 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4703 }
4704
4705 if (ST->hasPOPCNT()) {
4706 if (ST->is64Bit())
4707 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4708 if (auto KindCost = Entry->Cost[CostKind])
4709 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4710
4711 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4712 if (auto KindCost = Entry->Cost[CostKind])
4713 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4714 }
4715
4716 if (ST->is64Bit())
4717 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4718 if (auto KindCost = Entry->Cost[CostKind])
4719 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4720
4721 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4722 if (auto KindCost = Entry->Cost[CostKind])
4723 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4724 }
4725
4726  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4727}
4728
4729InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4730                                               TTI::TargetCostKind CostKind,
4731                                               unsigned Index, Value *Op0,
4732                                               Value *Op1) {
4733 static const CostTblEntry SLMCostTbl[] = {
4734 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4735 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4736 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4737 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4738 };
4739
4740 assert(Val->isVectorTy() && "This must be a vector type");
4741 Type *ScalarType = Val->getScalarType();
4742 InstructionCost RegisterFileMoveCost = 0;
4743
4744 // Non-immediate extraction/insertion can be handled as a sequence of
4745 // aliased loads+stores via the stack.
4746 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4747 Opcode == Instruction::InsertElement)) {
4748 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4749 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4750
4751 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4752 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4753 Align VecAlign = DL.getPrefTypeAlign(Val);
4754 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4755
4756 // Extract - store vector to stack, load scalar.
4757 if (Opcode == Instruction::ExtractElement) {
4758 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4759 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4760 CostKind);
4761 }
4762 // Insert - store vector to stack, store scalar, load vector.
4763 if (Opcode == Instruction::InsertElement) {
4764 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4765 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4766 CostKind) +
4767 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4768 }
4769 }
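  // As an illustrative sketch (generic x86-64 lowering assumed, not taken
  // from this file): a variable-index extract from a <4 x i32> that is
  // costed above as vector-store + scalar-load corresponds to roughly:
  //   movaps %xmm0, -24(%rsp)         ; spill the vector to a stack slot
  //   andl   $3, %edi                 ; clamp the index
  //   movl   -24(%rsp,%rdi,4), %eax   ; reload the selected scalar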
4770
4771 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4772 Opcode == Instruction::InsertElement)) {
4773    // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4774 if (Opcode == Instruction::ExtractElement &&
4775 ScalarType->getScalarSizeInBits() == 1 &&
4776 cast<FixedVectorType>(Val)->getNumElements() > 1)
4777 return 1;
4778
4779 // Legalize the type.
4780 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4781
4782 // This type is legalized to a scalar type.
4783 if (!LT.second.isVector())
4784 return TTI::TCC_Free;
4785
4786 // The type may be split. Normalize the index to the new type.
4787 unsigned SizeInBits = LT.second.getSizeInBits();
4788 unsigned NumElts = LT.second.getVectorNumElements();
4789 unsigned SubNumElts = NumElts;
4790 Index = Index % NumElts;
4791
4792 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4793 // For inserts, we also need to insert the subvector back.
4794 if (SizeInBits > 128) {
4795 assert((SizeInBits % 128) == 0 && "Illegal vector");
4796 unsigned NumSubVecs = SizeInBits / 128;
4797 SubNumElts = NumElts / NumSubVecs;
4798 if (SubNumElts <= Index) {
4799 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4800 Index %= SubNumElts;
4801 }
4802 }
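    // Worked example: extracting element 11 from a type that legalizes to a
    // 512-bit vector of i32 gives NumSubVecs = 4 and SubNumElts = 4, so the
    // element lives in the third 128-bit subvector; one register-file move is
    // charged for the subvector extract and Index becomes 11 % 4 == 3.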
4803
4804 MVT MScalarTy = LT.second.getScalarType();
4805 auto IsCheapPInsrPExtrInsertPS = [&]() {
4806 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4807 // Inserting f32 into index0 is just movss.
4808 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4809 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4810 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4811 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4812 Opcode == Instruction::InsertElement) ||
4813 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4814 Opcode == Instruction::InsertElement);
4815 };
4816
4817 if (Index == 0) {
4818 // Floating point scalars are already located in index #0.
4819 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4820 // true for all.
4821 if (ScalarType->isFloatingPointTy() &&
4822 (Opcode != Instruction::InsertElement || !Op0 ||
4823 isa<UndefValue>(Op0)))
4824 return RegisterFileMoveCost;
4825
4826 if (Opcode == Instruction::InsertElement &&
4827 isa_and_nonnull<UndefValue>(Op0)) {
4828 // Consider the gather cost to be cheap.
4829 if (isa_and_nonnull<LoadInst>(Op1))
4830 return RegisterFileMoveCost;
4831 if (!IsCheapPInsrPExtrInsertPS()) {
4832 // mov constant-to-GPR + movd/movq GPR -> XMM.
4833 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4834 return 2 + RegisterFileMoveCost;
4835 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4836 return 1 + RegisterFileMoveCost;
4837 }
4838 }
4839
4840 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4841 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4842 return 1 + RegisterFileMoveCost;
4843 }
4844
4845 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4846 assert(ISD && "Unexpected vector opcode");
4847 if (ST->useSLMArithCosts())
4848 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4849 return Entry->Cost + RegisterFileMoveCost;
4850
4851 // Consider cheap cases.
4852 if (IsCheapPInsrPExtrInsertPS())
4853 return 1 + RegisterFileMoveCost;
4854
4855    // For extractions we just need to shuffle the element to index 0, which
4856    // should be very cheap (assume cost = 1). For insertions we need to shuffle
4857    // the element to its destination. In both cases we must handle the
4858    // subvector move(s).
4859 // If the vector type is already less than 128-bits then don't reduce it.
4860 // TODO: Under what circumstances should we shuffle using the full width?
4861 InstructionCost ShuffleCost = 1;
4862 if (Opcode == Instruction::InsertElement) {
4863 auto *SubTy = cast<VectorType>(Val);
4864 EVT VT = TLI->getValueType(DL, Val);
4865 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4866 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4867 ShuffleCost =
4868 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4869 }
4870 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4871 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4872 }
4873
4874 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4875 RegisterFileMoveCost;
4876}
4877
4878InstructionCost X86TTIImpl::getScalarizationOverhead(
4879    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4880    TTI::TargetCostKind CostKind) {
4881  assert(DemandedElts.getBitWidth() ==
4882 cast<FixedVectorType>(Ty)->getNumElements() &&
4883 "Vector size mismatch");
4884
4885 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4886 MVT MScalarTy = LT.second.getScalarType();
4887 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4888  InstructionCost Cost = 0;
4889
4890 constexpr unsigned LaneBitWidth = 128;
4891 assert((LegalVectorBitWidth < LaneBitWidth ||
4892 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4893 "Illegal vector");
4894
4895 const int NumLegalVectors = *LT.first.getValue();
4896 assert(NumLegalVectors >= 0 && "Negative cost!");
4897
4898  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
4899  // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4900 if (Insert) {
4901 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4902 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4903 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4904      // For types we can insert directly, insertion into 128-bit subvectors is
4905 // cheap, followed by a cheap chain of concatenations.
4906 if (LegalVectorBitWidth <= LaneBitWidth) {
4907 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4908 /*Extract*/ false, CostKind);
4909 } else {
4910        // In each 128-bit lane, if at least one index is demanded but not all
4911        // indices are demanded and this 128-bit lane is not the first 128-bit
4912        // lane of the legalized vector, then this lane needs an extracti128;
4913        // if a 128-bit lane has at least one demanded index, that lane also
4914        // needs an inserti128.
4915
4916        // The following cases will help you build a better understanding:
4917        // Assume we insert several elements into a v8i32 vector in avx2,
4918        // Case#1: inserting into 1st index needs vpinsrd + inserti128.
4919        // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4920        //         inserti128.
4921        // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4922 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4923 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4924 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4925 unsigned NumLegalElts =
4926 LT.second.getVectorNumElements() * NumLegalVectors;
4927 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4928 "Vector has been legalized to smaller element count");
4929 assert((NumLegalElts % NumLanesTotal) == 0 &&
4930 "Unexpected elts per lane");
4931 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4932
4933 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4934 auto *LaneTy =
4935 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4936
4937 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4938 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4939 NumEltsPerLane, NumEltsPerLane * I);
4940 if (LaneEltMask.isZero())
4941 continue;
4942 // FIXME: we don't need to extract if all non-demanded elements
4943 // are legalization-inserted padding.
4944 if (!LaneEltMask.isAllOnes())
4945            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4946                                   I * NumEltsPerLane, LaneTy);
4947 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4948 /*Extract*/ false, CostKind);
4949 }
4950
4951 APInt AffectedLanes =
4952 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4953 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4954 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4955 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4956 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4957 unsigned I = NumLegalLanes * LegalVec + Lane;
4958 // No need to insert unaffected lane; or lane 0 of each legal vector
4959 // iff ALL lanes of that vector were affected and will be inserted.
4960 if (!AffectedLanes[I] ||
4961 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4962 continue;
4963            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
4964                                   I * NumEltsPerLane, LaneTy);
4965 }
4966 }
4967 }
4968 } else if (LT.second.isVector()) {
4969 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4970 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4971 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4972 // considered cheap.
4973 if (Ty->isIntOrIntVectorTy())
4974 Cost += DemandedElts.popcount();
4975
4976 // Get the smaller of the legalized or original pow2-extended number of
4977 // vector elements, which represents the number of unpacks we'll end up
4978 // performing.
4979 unsigned NumElts = LT.second.getVectorNumElements();
4980 unsigned Pow2Elts =
4981 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4982 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4983 }
4984 }
4985
4986 if (Extract) {
4987 // vXi1 can be efficiently extracted with MOVMSK.
4988 // TODO: AVX512 predicate mask handling.
4989 // NOTE: This doesn't work well for roundtrip scalarization.
4990 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4991 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4992 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4993 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4994 return MOVMSKCost;
4995 }
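    // For example, an extract-only v64i1 mask on AVX2 (MaxElts = 32) is
    // costed as two MOVMSK ops, while plain SSE2 (MaxElts = 16) needs four.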
4996
4997 if (LT.second.isVector()) {
4998 unsigned NumLegalElts =
4999 LT.second.getVectorNumElements() * NumLegalVectors;
5000 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5001 "Vector has been legalized to smaller element count");
5002
5003 // If we're extracting elements from a 128-bit subvector lane,
5004 // we only need to extract each lane once, not for every element.
5005 if (LegalVectorBitWidth > LaneBitWidth) {
5006 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5007 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5008 assert((NumLegalElts % NumLanesTotal) == 0 &&
5009 "Unexpected elts per lane");
5010 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5011
5012 // Add cost for each demanded 128-bit subvector extraction.
5013 // Luckily this is a lot easier than for insertion.
5014 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5015 auto *LaneTy =
5016 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5017
5018 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5019 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5020 NumEltsPerLane, I * NumEltsPerLane);
5021 if (LaneEltMask.isZero())
5022 continue;
5023          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5024                                 I * NumEltsPerLane, LaneTy);
5025          Cost += BaseT::getScalarizationOverhead(
5026              LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5027 }
5028
5029 return Cost;
5030 }
5031 }
5032
5033 // Fallback to default extraction.
5034 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5035 Extract, CostKind);
5036 }
5037
5038 return Cost;
5039}
5040
5041InstructionCost
5042X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5043                                      int VF, const APInt &DemandedDstElts,
5044                                      TTI::TargetCostKind CostKind) {
5045  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5046 // We don't differentiate element types here, only element bit width.
5047 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5048
5049 auto bailout = [&]() {
5050 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5051 DemandedDstElts, CostKind);
5052 };
5053
5054 // For now, only deal with AVX512 cases.
5055 if (!ST->hasAVX512())
5056 return bailout();
5057
5058 // Do we have a native shuffle for this element type, or should we promote?
5059 unsigned PromEltTyBits = EltTyBits;
5060 switch (EltTyBits) {
5061 case 32:
5062 case 64:
5063 break; // AVX512F.
5064 case 16:
5065 if (!ST->hasBWI())
5066 PromEltTyBits = 32; // promote to i32, AVX512F.
5067 break; // AVX512BW
5068 case 8:
5069 if (!ST->hasVBMI())
5070 PromEltTyBits = 32; // promote to i32, AVX512F.
5071 break; // AVX512VBMI
5072 case 1:
5073 // There is no support for shuffling i1 elements. We *must* promote.
5074 if (ST->hasBWI()) {
5075 if (ST->hasVBMI())
5076 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5077 else
5078 PromEltTyBits = 16; // promote to i16, AVX512BW.
5079 break;
5080 }
5081 PromEltTyBits = 32; // promote to i32, AVX512F.
5082 break;
5083 default:
5084 return bailout();
5085 }
5086 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5087
5088 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5089 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5090
5091 int NumDstElements = VF * ReplicationFactor;
5092 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5093 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5094
5095 // Legalize the types.
5096 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5097 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5098 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5099 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5100 // They should have legalized into vector types.
5101 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5102 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5103 return bailout();
5104
5105 if (PromEltTyBits != EltTyBits) {
5106    // If we have to perform the shuffle with a wider element type than our
5107    // data type, then we will first need to anyext (we don't care about the
5108    // new bits) the source elements, and then truncate the Dst elements.
5109 InstructionCost PromotionCost;
5110    PromotionCost += getCastInstrCost(
5111        Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVTy,
5112        TTI::CastContextHint::None, CostKind);
5113    PromotionCost +=
5114        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5115                         /*Src=*/PromDstVecTy,
5116                         TTI::CastContextHint::None, CostKind);
5117 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5118 ReplicationFactor, VF,
5119 DemandedDstElts, CostKind);
5120 }
5121
5122 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5123 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5124 "We expect that the legalization doesn't affect the element width, "
5125         "nor coalesce/split elements.");
5126
5127 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5128 unsigned NumDstVectors =
5129 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5130
5131 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5132
5133 // Not all the produced Dst elements may be demanded. In our case,
5134 // given that a single Dst vector is formed by a single shuffle,
5135 // if all elements that will form a single Dst vector aren't demanded,
5136 // then we won't need to do that shuffle, so adjust the cost accordingly.
5137 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5138 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5139 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
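  // Worked example (assuming AVX512F): i32 elements with VF = 16 and
  // ReplicationFactor = 2 give 32 destination elements, legalized as two
  // v16i32 vectors; if only the first 16 destination elements are demanded,
  // DemandedDstVectors has a single bit set and only one shuffle is costed.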
5140
5141 InstructionCost SingleShuffleCost = getShuffleCost(
5142 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5143 /*Index=*/0, /*SubTp=*/nullptr);
5144 return NumDstVectorsDemanded * SingleShuffleCost;
5145}
5146
5147InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5148                                            MaybeAlign Alignment,
5149                                            unsigned AddressSpace,
5150                                            TTI::TargetCostKind CostKind,
5151                                            TTI::OperandValueInfo OpInfo,
5152                                            const Instruction *I) {
5153  // TODO: Handle other cost kinds.
5154  if (CostKind != TTI::TCK_RecipThroughput) {
5155    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5156      // A store instruction with index and scale addressing costs 2 uops.
5157      // Check the preceding GEP to identify non-constant indices.
5158 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5159 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5160 return TTI::TCC_Basic * 2;
5161 }
5162 }
5163 return TTI::TCC_Basic;
5164 }
5165
5166 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5167 "Invalid Opcode");
5168 // Type legalization can't handle structs
5169 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5170 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5171 CostKind, OpInfo, I);
5172
5173 // Legalize the type.
5174 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5175
5176 auto *VTy = dyn_cast<FixedVectorType>(Src);
5177
5178  InstructionCost Cost = 0;
5179
5180 // Add a cost for constant load to vector.
5181 if (Opcode == Instruction::Store && OpInfo.isConstant())
5182 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5183 /*AddressSpace=*/0, CostKind, OpInfo);
5184
5185 // Handle the simple case of non-vectors.
5186  // NOTE: this assumes that legalization never creates vectors from scalars!
5187 if (!VTy || !LT.second.isVector()) {
5188 // Each load/store unit costs 1.
5189 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5190 }
5191
5192 bool IsLoad = Opcode == Instruction::Load;
5193
5194 Type *EltTy = VTy->getElementType();
5195
5196 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5197
5198 // Source of truth: how many elements were there in the original IR vector?
5199 const unsigned SrcNumElt = VTy->getNumElements();
5200
5201 // How far have we gotten?
5202 int NumEltRemaining = SrcNumElt;
5203  // Note that we intentionally capture by reference, as NumEltRemaining changes.
5204 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5205
5206 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5207
5208 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5209 const unsigned XMMBits = 128;
5210 if (XMMBits % EltTyBits != 0)
5211 // Vector size must be a multiple of the element size. I.e. no padding.
5212 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5213 CostKind, OpInfo, I);
5214 const int NumEltPerXMM = XMMBits / EltTyBits;
5215
5216 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5217
5218 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5219 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5220 // How many elements would a single op deal with at once?
5221 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5222 // Vector size must be a multiple of the element size. I.e. no padding.
5223 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5224 CostKind, OpInfo, I);
5225 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5226
5227 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5228 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5229 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5230 "Unless we haven't halved the op size yet, "
5231 "we have less than two op's sized units of work left.");
5232
5233 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5234 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5235 : XMMVecTy;
5236
5237 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5238 "After halving sizes, the vector elt count is no longer a multiple "
5239 "of number of elements per operation?");
5240 auto *CoalescedVecTy =
5241 CurrNumEltPerOp == 1
5242 ? CurrVecTy
5244 IntegerType::get(Src->getContext(),
5245 EltTyBits * CurrNumEltPerOp),
5246 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5247 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5248 DL.getTypeSizeInBits(CurrVecTy) &&
5249           "Coalescing elements doesn't change vector width.");
5250
5251 while (NumEltRemaining > 0) {
5252      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5253
5254 // Can we use this vector size, as per the remaining element count?
5255 // Iff the vector is naturally aligned, we can do a wide load regardless.
5256 if (NumEltRemaining < CurrNumEltPerOp &&
5257 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5258 CurrOpSizeBytes != 1)
5259        break; // Try smaller vector size.
5260
5261 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5262 // as a proxy for a double-pumped AVX memory interface such as on
5263 // Sandybridge.
5264      // Sub-32-bit loads/stores will be slower, either via PINSR*/PEXTR* or
5265      // via scalarization.
5266 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5267 Cost += 2;
5268 else if (CurrOpSizeBytes < 4)
5269 Cost += 2;
5270 else
5271 Cost += 1;
5272
5273 // If we're loading a uniform value, then we don't need to split the load,
5274 // loading just a single (widest) vector can be reused by all splits.
5275 if (IsLoad && OpInfo.isUniform())
5276 return Cost;
5277
5278 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5279
5280 // If we have fully processed the previous reg, we need to replenish it.
5281 if (SubVecEltsLeft == 0) {
5282 SubVecEltsLeft += CurrVecTy->getNumElements();
5283 // And that's free only for the 0'th subvector of a legalized vector.
5284 if (!Is0thSubVec)
5285          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5286                                        : TTI::ShuffleKind::SK_ExtractSubvector,
5287                                 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5288 }
5289
5290 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5291 // for smaller widths (32/16/8) we have to insert/extract them separately.
5292 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5293 // but let's pretend that it is also true for 16/8 bit wide ops...)
5294 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5295 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5296 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5297 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5298 APInt DemandedElts =
5299 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5300 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5301 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5302 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5303 !IsLoad, CostKind);
5304 }
5305
5306 SubVecEltsLeft -= CurrNumEltPerOp;
5307 NumEltRemaining -= CurrNumEltPerOp;
5308 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5309 }
5310 }
5311
5312 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5313
5314 return Cost;
5315}
5316
5317InstructionCost
5318X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5319                                  unsigned AddressSpace,
5320                                  TTI::TargetCostKind CostKind) {
5321  bool IsLoad = (Instruction::Load == Opcode);
5322 bool IsStore = (Instruction::Store == Opcode);
5323
5324 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5325 if (!SrcVTy)
5326    // For a scalar, take the regular (unmasked) cost.
5327 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5328
5329 unsigned NumElem = SrcVTy->getNumElements();
5330 auto *MaskTy =
5331 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5332 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5333 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5334 // Scalarization
5335    APInt DemandedElts = APInt::getAllOnes(NumElem);
5336    InstructionCost MaskSplitCost = getScalarizationOverhead(
5337        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5338    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5339        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5340        CmpInst::BAD_ICMP_PREDICATE, CostKind);
5341    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5342    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5343    InstructionCost ValueSplitCost = getScalarizationOverhead(
5344        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5345 InstructionCost MemopCost =
5346 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5347 Alignment, AddressSpace, CostKind);
5348 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5349 }
5350
5351 // Legalize the type.
5352 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5353 auto VT = TLI->getValueType(DL, SrcVTy);
5354  InstructionCost Cost = 0;
5355  MVT Ty = LT.second;
5356 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5357 // APX masked load/store for scalar is cheap.
5358 return Cost + LT.first;
5359
5360 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5361 LT.second.getVectorNumElements() == NumElem)
5362 // Promotion requires extend/truncate for data and a shuffle for mask.
5363    Cost +=
5364        getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5365                       nullptr) +
5366        getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5367
5368 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5369    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5370                                           (unsigned)LT.first.getValue() * Ty.getVectorNumElements());
5371    // Expanding requires filling the mask with zeroes.
5372    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5373                           MaskTy);
5374 }
5375
5376  // Pre-AVX512: each maskmov load costs ~2 and each maskmov store costs ~8.
5377 if (!ST->hasAVX512())
5378 return Cost + LT.first * (IsLoad ? 2 : 8);
5379
5380 // AVX-512 masked load/store is cheaper
5381 return Cost + LT.first;
5382}
5383
5384InstructionCost
5385X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5386                                 const Value *Base,
5387 const TTI::PointersChainInfo &Info,
5388 Type *AccessTy, TTI::TargetCostKind CostKind) {
5389 if (Info.isSameBase() && Info.isKnownStride()) {
5390    // If all the pointers have a known stride, all the differences are
5391    // translated into constants. X86 memory addressing allows encoding them
5392    // as displacements, so we just need to take the base GEP cost.
5393 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5394 SmallVector<const Value *> Indices(BaseGEP->indices());
5395 return getGEPCost(BaseGEP->getSourceElementType(),
5396 BaseGEP->getPointerOperand(), Indices, nullptr,
5397 CostKind);
5398 }
5399 return TTI::TCC_Free;
5400 }
5401 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5402}
5403
5404InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5405                                                      ScalarEvolution *SE,
5406 const SCEV *Ptr) {
5407 // Address computations in vectorized code with non-consecutive addresses will
5408 // likely result in more instructions compared to scalar code where the
5409 // computation can more often be merged into the index mode. The resulting
5410 // extra micro-ops can significantly decrease throughput.
5411 const unsigned NumVectorInstToHideOverhead = 10;
5412
5413  // Cost modeling of Strided Access Computation is hidden by the indexing
5414  // modes of X86 regardless of the stride value. We don't believe that there
5415  // is a difference between constant strided access in general and a constant
5416  // stride value which is less than or equal to 64.
5417 // Even in the case of (loop invariant) stride whose value is not known at
5418 // compile time, the address computation will not incur more than one extra
5419 // ADD instruction.
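  // For example, a vectorized, non-strided (gather-like) address computed in
  // a loop on a pre-AVX2 target is charged NumVectorInstToHideOverhead,
  // while a strided access without a known constant step is charged 1.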
5420 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5421 // TODO: AVX2 is the current cut-off because we don't have correct
5422 // interleaving costs for prior ISA's.
5423    if (!BaseT::isStridedAccess(Ptr))
5424      return NumVectorInstToHideOverhead;
5425    if (!BaseT::getConstantStrideStep(SE, Ptr))
5426      return 1;
5427 }
5428
5429 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5430}
5431
5432InstructionCost
5433X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5434                                       std::optional<FastMathFlags> FMF,
5435                                       TTI::TargetCostKind CostKind) {
5436  if (TTI::requiresOrderedReduction(FMF))
5437    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5438
5439  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5440  // throughput and use it as the cost.
5441
5442 static const CostTblEntry SLMCostTbl[] = {
5443 { ISD::FADD, MVT::v2f64, 3 },
5444 { ISD::ADD, MVT::v2i64, 5 },
5445 };
5446
5447 static const CostTblEntry SSE2CostTbl[] = {
5448 { ISD::FADD, MVT::v2f64, 2 },
5449 { ISD::FADD, MVT::v2f32, 2 },
5450 { ISD::FADD, MVT::v4f32, 4 },
5451 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5452 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5453 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5454 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5455 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5456 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5457 { ISD::ADD, MVT::v2i8, 2 },
5458 { ISD::ADD, MVT::v4i8, 2 },
5459 { ISD::ADD, MVT::v8i8, 2 },
5460 { ISD::ADD, MVT::v16i8, 3 },
5461 };
5462
5463 static const CostTblEntry AVX1CostTbl[] = {
5464 { ISD::FADD, MVT::v4f64, 3 },
5465 { ISD::FADD, MVT::v4f32, 3 },
5466 { ISD::FADD, MVT::v8f32, 4 },
5467 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5468 { ISD::ADD, MVT::v4i64, 3 },
5469 { ISD::ADD, MVT::v8i32, 5 },
5470 { ISD::ADD, MVT::v16i16, 5 },
5471 { ISD::ADD, MVT::v32i8, 4 },
5472 };
5473
5474 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5475 assert(ISD && "Invalid opcode");
5476
5477 // Before legalizing the type, give a chance to look up illegal narrow types
5478 // in the table.
5479 // FIXME: Is there a better way to do this?
5480 EVT VT = TLI->getValueType(DL, ValTy);
5481 if (VT.isSimple()) {
5482 MVT MTy = VT.getSimpleVT();
5483 if (ST->useSLMArithCosts())
5484 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5485 return Entry->Cost;
5486
5487 if (ST->hasAVX())
5488 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5489 return Entry->Cost;
5490
5491 if (ST->hasSSE2())
5492 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5493 return Entry->Cost;
5494 }
5495
5496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5497
5498 MVT MTy = LT.second;
5499
5500 auto *ValVTy = cast<FixedVectorType>(ValTy);
5501
5502 // Special case: vXi8 mul reductions are performed as vXi16.
5503 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5504 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5505 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5506 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5507                            TTI::CastContextHint::None,
5508                            CostKind) +
5509 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5510 }
5511
5512 InstructionCost ArithmeticCost = 0;
5513 if (LT.first != 1 && MTy.isVector() &&
5514 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5515 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5516 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5517 MTy.getVectorNumElements());
5518 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5519 ArithmeticCost *= LT.first - 1;
5520 }
5521
5522 if (ST->useSLMArithCosts())
5523 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5524 return ArithmeticCost + Entry->Cost;
5525
5526 if (ST->hasAVX())
5527 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5528 return ArithmeticCost + Entry->Cost;
5529
5530 if (ST->hasSSE2())
5531 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5532 return ArithmeticCost + Entry->Cost;
5533
5534 // FIXME: These assume a naive kshift+binop lowering, which is probably
5535 // conservative in most cases.
5536 static const CostTblEntry AVX512BoolReduction[] = {
5537 { ISD::AND, MVT::v2i1, 3 },
5538 { ISD::AND, MVT::v4i1, 5 },
5539 { ISD::AND, MVT::v8i1, 7 },
5540 { ISD::AND, MVT::v16i1, 9 },
5541 { ISD::AND, MVT::v32i1, 11 },
5542 { ISD::AND, MVT::v64i1, 13 },
5543 { ISD::OR, MVT::v2i1, 3 },
5544 { ISD::OR, MVT::v4i1, 5 },
5545 { ISD::OR, MVT::v8i1, 7 },
5546 { ISD::OR, MVT::v16i1, 9 },
5547 { ISD::OR, MVT::v32i1, 11 },
5548 { ISD::OR, MVT::v64i1, 13 },
5549 };
5550
5551 static const CostTblEntry AVX2BoolReduction[] = {
5552 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5553 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5554 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5555 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5556 };
5557
5558 static const CostTblEntry AVX1BoolReduction[] = {
5559 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5560 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5561 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5562 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5563 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5564 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5565 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5566 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5567 };
5568
5569 static const CostTblEntry SSE2BoolReduction[] = {
5570 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5571 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5572 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5573 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5574 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5575 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5576 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5577 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5578 };
5579
5580 // Handle bool allof/anyof patterns.
5581 if (ValVTy->getElementType()->isIntegerTy(1)) {
5582 InstructionCost ArithmeticCost = 0;
5583 if (LT.first != 1 && MTy.isVector() &&
5584 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5585 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5586 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5587 MTy.getVectorNumElements());
5588 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5589 ArithmeticCost *= LT.first - 1;
5590 }
5591
5592 if (ST->hasAVX512())
5593 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5594 return ArithmeticCost + Entry->Cost;
5595 if (ST->hasAVX2())
5596 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5597 return ArithmeticCost + Entry->Cost;
5598 if (ST->hasAVX())
5599 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5600 return ArithmeticCost + Entry->Cost;
5601 if (ST->hasSSE2())
5602 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5603 return ArithmeticCost + Entry->Cost;
5604
5605 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5606 }
5607
5608 unsigned NumVecElts = ValVTy->getNumElements();
5609 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5610
5611 // Special case power of 2 reductions where the scalar type isn't changed
5612 // by type legalization.
5613 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5614 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5615
5616 InstructionCost ReductionCost = 0;
5617
5618 auto *Ty = ValVTy;
5619 if (LT.first != 1 && MTy.isVector() &&
5620 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5621 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5622 Ty = FixedVectorType::get(ValVTy->getElementType(),
5623 MTy.getVectorNumElements());
5624 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5625 ReductionCost *= LT.first - 1;
5626 NumVecElts = MTy.getVectorNumElements();
5627 }
5628
5629 // Now handle reduction with the legal type, taking into account size changes
5630 // at each level.
5631 while (NumVecElts > 1) {
5632 // Determine the size of the remaining vector we need to reduce.
5633 unsigned Size = NumVecElts * ScalarSize;
5634 NumVecElts /= 2;
5635 // If we're reducing from 256/512 bits, use an extract_subvector.
5636 if (Size > 128) {
5637 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5638 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5639 CostKind, NumVecElts, SubTy);
5640 Ty = SubTy;
5641 } else if (Size == 128) {
5642 // Reducing from 128 bits is a permute of v2f64/v2i64.
5643 FixedVectorType *ShufTy;
5644 if (ValVTy->isFloatingPointTy())
5645 ShufTy =
5646 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5647 else
5648 ShufTy =
5649 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5650 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5651 CostKind, 0, nullptr);
5652 } else if (Size == 64) {
5653 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5654 FixedVectorType *ShufTy;
5655 if (ValVTy->isFloatingPointTy())
5656 ShufTy =
5657 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5658 else
5659 ShufTy =
5660 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5661 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5662 CostKind, 0, nullptr);
5663 } else {
5664 // Reducing from smaller size is a shift by immediate.
5665 auto *ShiftTy = FixedVectorType::get(
5666 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5667 ReductionCost += getArithmeticInstrCost(
5668 Instruction::LShr, ShiftTy, CostKind,
5669          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5670          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5671    }
5672
5673 // Add the arithmetic op for this level.
5674 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5675 }
5676
5677 // Add the final extract element to the cost.
5678 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5679 CostKind, 0, nullptr, nullptr);
5680}
5681
5682InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5683                                          TTI::TargetCostKind CostKind,
5684                                          FastMathFlags FMF) {
5685 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5686 return getIntrinsicInstrCost(ICA, CostKind);
5687}
5688
5689InstructionCost
5690X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5691                                   FastMathFlags FMF,
5692                                   TTI::TargetCostKind CostKind) {
5693  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5694
5695 MVT MTy = LT.second;
5696
5697 int ISD;
5698 if (ValTy->isIntOrIntVectorTy()) {
5699 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5700 : ISD::SMIN;
5701 } else {
5702 assert(ValTy->isFPOrFPVectorTy() &&
5703           "Expected floating point or integer vector type.");
5704 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5705 ? ISD::FMINNUM
5706 : ISD::FMINIMUM;
5707 }
5708
5709  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5710  // throughput and use it as the cost.
5711
5712 static const CostTblEntry SSE2CostTbl[] = {
5713 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5714 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5715 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5716 };
5717
5718 static const CostTblEntry SSE41CostTbl[] = {
5719 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5720 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5721 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5722 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5723 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5724 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5725 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5726 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5727 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5728 {ISD::SMIN, MVT::v16i8, 6},
5729 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5730 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5731 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5732 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5733 };
5734
5735 static const CostTblEntry AVX1CostTbl[] = {
5736 {ISD::SMIN, MVT::v16i16, 6},
5737 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5738 {ISD::SMIN, MVT::v32i8, 8},
5739 {ISD::UMIN, MVT::v32i8, 8},
5740 };
5741
5742 static const CostTblEntry AVX512BWCostTbl[] = {
5743 {ISD::SMIN, MVT::v32i16, 8},
5744 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5745 {ISD::SMIN, MVT::v64i8, 10},
5746 {ISD::UMIN, MVT::v64i8, 10},
5747 };
5748
5749 // Before legalizing the type, give a chance to look up illegal narrow types
5750 // in the table.
5751 // FIXME: Is there a better way to do this?
5752 EVT VT = TLI->getValueType(DL, ValTy);
5753 if (VT.isSimple()) {
5754 MVT MTy = VT.getSimpleVT();
5755 if (ST->hasBWI())
5756 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5757 return Entry->Cost;
5758
5759 if (ST->hasAVX())
5760 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5761 return Entry->Cost;
5762
5763 if (ST->hasSSE41())
5764 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5765 return Entry->Cost;
5766
5767 if (ST->hasSSE2())
5768 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5769 return Entry->Cost;
5770 }
5771
5772 auto *ValVTy = cast<FixedVectorType>(ValTy);
5773 unsigned NumVecElts = ValVTy->getNumElements();
5774
5775 auto *Ty = ValVTy;
5776 InstructionCost MinMaxCost = 0;
5777 if (LT.first != 1 && MTy.isVector() &&
5778 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5779    // Type needs to be split. We need LT.first - 1 operations.
5780 Ty = FixedVectorType::get(ValVTy->getElementType(),
5781 MTy.getVectorNumElements());
5782 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5783 MinMaxCost *= LT.first - 1;
5784 NumVecElts = MTy.getVectorNumElements();
5785 }
5786
5787 if (ST->hasBWI())
5788 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5789 return MinMaxCost + Entry->Cost;
5790
5791 if (ST->hasAVX())
5792 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5793 return MinMaxCost + Entry->Cost;
5794
5795 if (ST->hasSSE41())
5796 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5797 return MinMaxCost + Entry->Cost;
5798
5799 if (ST->hasSSE2())
5800 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5801 return MinMaxCost + Entry->Cost;
5802
5803 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5804
5805 // Special case power of 2 reductions where the scalar type isn't changed
5806 // by type legalization.
5807 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5808 ScalarSize != MTy.getScalarSizeInBits())
5809 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5810
5811 // Now handle reduction with the legal type, taking into account size changes
5812 // at each level.
5813 while (NumVecElts > 1) {
5814 // Determine the size of the remaining vector we need to reduce.
5815 unsigned Size = NumVecElts * ScalarSize;
5816 NumVecElts /= 2;
5817 // If we're reducing from 256/512 bits, use an extract_subvector.
5818 if (Size > 128) {
5819 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5820 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5821 NumVecElts, SubTy);
5822 Ty = SubTy;
5823 } else if (Size == 128) {
5824 // Reducing from 128 bits is a permute of v2f64/v2i64.
5825 VectorType *ShufTy;
5826 if (ValTy->isFloatingPointTy())
5827      ShufTy =
5828          FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5829    else
5830 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5831 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5832 CostKind, 0, nullptr);
5833 } else if (Size == 64) {
5834 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5835 FixedVectorType *ShufTy;
5836 if (ValTy->isFloatingPointTy())
5837 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5838 else
5839 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5840 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5841 CostKind, 0, nullptr);
5842 } else {
5843 // Reducing from smaller size is a shift by immediate.
5844 auto *ShiftTy = FixedVectorType::get(
5845 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5846 MinMaxCost += getArithmeticInstrCost(
5847 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5848          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5849          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5850    }
5851
5852 // Add the arithmetic op for this level.
5853 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5854 }
5855
5856 // Add the final extract element to the cost.
5857 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5858 CostKind, 0, nullptr, nullptr);
5859}
5860
5861/// Calculate the cost of materializing a 64-bit value. This helper
5862/// method might only calculate a fraction of a larger immediate. Therefore it
5863/// is valid to return a cost of ZERO.
5864InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5865  if (Val == 0)
5866 return TTI::TCC_Free;
5867
5868 if (isInt<32>(Val))
5869 return TTI::TCC_Basic;
5870
5871 return 2 * TTI::TCC_Basic;
5872}
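// For example: materializing 0 is free, 42 or -1 fit a sign-extended imm32
// and cost TCC_Basic, while 0x1234567890 needs a 64-bit movabs-style
// immediate and costs 2 * TCC_Basic.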
5873
5874InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5875                                          TTI::TargetCostKind CostKind) {
5876  assert(Ty->isIntegerTy());
5877
5878 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5879 if (BitSize == 0)
5880 return ~0U;
5881
5882  // Never hoist constants larger than 128 bits, because this might lead to
5883  // incorrect code generation or assertions in codegen.
5884  // FIXME: Create a cost model for types larger than i128 once the codegen
5885  // issues have been fixed.
5886 if (BitSize > 128)
5887 return TTI::TCC_Free;
5888
5889 if (Imm == 0)
5890 return TTI::TCC_Free;
5891
5892 // Sign-extend all constants to a multiple of 64-bit.
5893 APInt ImmVal = Imm;
5894 if (BitSize % 64 != 0)
5895 ImmVal = Imm.sext(alignTo(BitSize, 64));
5896
5897 // Split the constant into 64-bit chunks and calculate the cost for each
5898 // chunk.
5899  InstructionCost Cost = 0;
5900  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5901 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5902 int64_t Val = Tmp.getSExtValue();
5903 Cost += getIntImmCost(Val);
5904 }
5905 // We need at least one instruction to materialize the constant.
5906 return std::max<InstructionCost>(1, Cost);
5907}
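// Worked example: an i128 constant of 1 splits into a low chunk of 1
// (TCC_Basic) and a high chunk of 0 (TCC_Free), for a total of one
// instruction; an i128 -1 sign-extends to two all-ones chunks and costs two.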
5908
5909InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5910                                              const APInt &Imm, Type *Ty,
5911                                              TTI::TargetCostKind CostKind,
5912                                              Instruction *Inst) {
5913 assert(Ty->isIntegerTy());
5914
5915 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5916 unsigned ImmBitWidth = Imm.getBitWidth();
5917
5918 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5919 // here, so that constant hoisting will ignore this constant.
5920 if (BitSize == 0)
5921 return TTI::TCC_Free;
5922
5923 unsigned ImmIdx = ~0U;
5924 switch (Opcode) {
5925 default:
5926 return TTI::TCC_Free;
5927 case Instruction::GetElementPtr:
5928 // Always hoist the base address of a GetElementPtr. This prevents the
5929 // creation of new constants for every base constant that gets constant
5930 // folded with the offset.
5931 if (Idx == 0)
5932 return 2 * TTI::TCC_Basic;
5933 return TTI::TCC_Free;
5934 case Instruction::Store:
5935 ImmIdx = 0;
5936 break;
5937 case Instruction::ICmp:
5938    // This is an imperfect hack to prevent constant hoisting of
5939    // compares that might be trying to check if a 64-bit value fits in
5940    // 32 bits. The backend can optimize these cases using a right shift by 32.
5941    // Ideally we would check the compare predicate here. There are also other
5942    // similar immediates the backend can use shifts for.
5943 if (Idx == 1 && ImmBitWidth == 64) {
5944 uint64_t ImmVal = Imm.getZExtValue();
5945 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5946 return TTI::TCC_Free;
5947 }
5948 ImmIdx = 1;
5949 break;
5950 case Instruction::And:
5951    // We support 64-bit ANDs with immediates having 32 bits of leading zeroes
5952 // by using a 32-bit operation with implicit zero extension. Detect such
5953 // immediates here as the normal path expects bit 31 to be sign extended.
5954 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5955 return TTI::TCC_Free;
5956 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5957 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5958 Imm.isMask())
5959 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5960 ImmIdx = 1;
5961 break;
5962 case Instruction::Add:
5963 case Instruction::Sub:
5964 // For add/sub, we can use the opposite instruction for INT32_MIN.
5965 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
5966 return TTI::TCC_Free;
5967 ImmIdx = 1;
5968 break;
5969 case Instruction::UDiv:
5970 case Instruction::SDiv:
5971 case Instruction::URem:
5972 case Instruction::SRem:
5973 // Division by constant is typically expanded later into a different
5974 // instruction sequence. This completely changes the constants.
5975 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5976 return TTI::TCC_Free;
5977 case Instruction::Mul:
5978 case Instruction::Or:
5979 case Instruction::Xor:
5980 ImmIdx = 1;
5981 break;
5982 // Always return TCC_Free for the shift value of a shift instruction.
5983 case Instruction::Shl:
5984 case Instruction::LShr:
5985 case Instruction::AShr:
5986 if (Idx == 1)
5987 return TTI::TCC_Free;
5988 break;
5989 case Instruction::Trunc:
5990 case Instruction::ZExt:
5991 case Instruction::SExt:
5992 case Instruction::IntToPtr:
5993 case Instruction::PtrToInt:
5994 case Instruction::BitCast:
5995 case Instruction::PHI:
5996 case Instruction::Call:
5997 case Instruction::Select:
5998 case Instruction::Ret:
5999 case Instruction::Load:
6000 break;
6001 }
6002
6003 if (Idx == ImmIdx) {
6004 uint64_t NumConstants = divideCeil(BitSize, 64);
6005    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6006    return (Cost <= NumConstants * TTI::TCC_Basic)
6007 ? static_cast<int>(TTI::TCC_Free)
6008 : Cost;
6009 }
6010
6011 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6012}
6013
6014InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6015                                                const APInt &Imm, Type *Ty,
6016                                                TTI::TargetCostKind CostKind) {
6017  assert(Ty->isIntegerTy());
6018
6019 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6020 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6021 // here, so that constant hoisting will ignore this constant.
6022 if (BitSize == 0)
6023 return TTI::TCC_Free;
6024
6025 switch (IID) {
6026 default:
6027 return TTI::TCC_Free;
6028 case Intrinsic::sadd_with_overflow:
6029 case Intrinsic::uadd_with_overflow:
6030 case Intrinsic::ssub_with_overflow:
6031 case Intrinsic::usub_with_overflow:
6032 case Intrinsic::smul_with_overflow:
6033 case Intrinsic::umul_with_overflow:
6034 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6035 return TTI::TCC_Free;
6036 break;
6037 case Intrinsic::experimental_stackmap:
6038 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6039 return TTI::TCC_Free;
6040 break;
6041 case Intrinsic::experimental_patchpoint_void:
6042 case Intrinsic::experimental_patchpoint:
6043 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6044 return TTI::TCC_Free;
6045 break;
6046 }
6047 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6048}
6049
6050InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6051                                           TTI::TargetCostKind CostKind,
6052                                           const Instruction *I) {
6053  if (CostKind != TTI::TCK_RecipThroughput)
6054    return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6055 // Branches are assumed to be predicted.
6056 return TTI::TCC_Free;
6057}
6058
6059int X86TTIImpl::getGatherOverhead() const {
6060 // Some CPUs have more overhead for gather. The specified overhead is relative
6061 // to the Load operation. "2" is the number provided by Intel architects. This
6062 // parameter is used for cost estimation of Gather Op and comparison with
6063 // other alternatives.
6064  // TODO: Remove the explicit hasAVX512()? That would mean we would only
6065  // enable gather with a -march.
6066 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6067 return 2;
6068
6069 return 1024;
6070}
6071
6072int X86TTIImpl::getScatterOverhead() const {
6073 if (ST->hasAVX512())
6074 return 2;
6075
6076 return 1024;
6077}
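// With these numbers, the gather cost formula below (overhead + VF scalar
// loads) yields e.g. 2 + 8 * load-cost for a v8f32 gather on AVX512, while
// the 1024 overhead on other targets effectively makes scalarization win.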
6078
6079// Return an average cost of a Gather / Scatter instruction; may be improved later.
6080InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6081                                            TTI::TargetCostKind CostKind,
6082                                            Type *SrcVTy, const Value *Ptr,
6083 Align Alignment,
6084 unsigned AddressSpace) {
6085
6086 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6087 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6088
6089  // Try to reduce the index size from 64 bits (the default for GEP) to 32
6090  // bits. This is essential for VF 16. If the index can't be reduced to 32
6091  // bits, the operation will use 16 x 64-bit indices, which do not fit in a
6092  // zmm register and need to be split. Also check that the base pointer is
6093  // the same for all lanes, and that there's at most one variable index.
6094 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6095 unsigned IndexSize = DL.getPointerSizeInBits();
6096 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6097 if (IndexSize < 64 || !GEP)
6098 return IndexSize;
6099
6100 unsigned NumOfVarIndices = 0;
6101 const Value *Ptrs = GEP->getPointerOperand();
6102 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6103 return IndexSize;
6104 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6105 if (isa<Constant>(GEP->getOperand(I)))
6106 continue;
6107 Type *IndxTy = GEP->getOperand(I)->getType();
6108 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6109 IndxTy = IndexVTy->getElementType();
6110 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6111 !isa<SExtInst>(GEP->getOperand(I))) ||
6112 ++NumOfVarIndices > 1)
6113 return IndexSize; // 64
6114 }
6115 return (unsigned)32;
6116 };
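  // E.g. for a VF = 16 gather on AVX512, a GEP whose indices are 32-bit (or
  // 64-bit values sign-extended from 32-bit) lets all 16 indices fit in one
  // zmm register; otherwise the 16 x i64 indices force a split into two ops.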
6117
6118  // Try to reduce IndexSize to 32 bits for VF 16.
6119  // By default, IndexSize is equal to the pointer size.
6120 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6121 ? getIndexSizeInBits(Ptr, DL)
6122                               : DL.getPointerSizeInBits();
6123
6124 auto *IndexVTy = FixedVectorType::get(
6125 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6126 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6127 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6128 InstructionCost::CostType SplitFactor =
6129 *std::max(IdxsLT.first, SrcLT.first).getValue();
6130 if (SplitFactor > 1) {
6131 // Handle splitting of vector of pointers
6132 auto *SplitSrcTy =
6133 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6134 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6135 Alignment, AddressSpace);
6136 }
6137
6138 // If we didn't split, this will be a single gather/scatter instruction.
6139  if (CostKind == TTI::TCK_CodeSize)
6140    return 1;
6141
6142  // The gather / scatter cost is given by Intel architects. It is a rough
6143  // number since we are looking at one instruction at a time.
6144 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6145 : getScatterOverhead();
6146 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6147 MaybeAlign(Alignment), AddressSpace,
6148 CostKind);
6149}
6150
6151/// Calculate the cost of Gather / Scatter operation
6152InstructionCost X86TTIImpl::getGatherScatterOpCost(
6153    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6154    Align Alignment, TTI::TargetCostKind CostKind,
6155    const Instruction *I = nullptr) {
6156 if ((Opcode == Instruction::Load &&
6157 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6158 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6159 Align(Alignment)))) ||
6160 (Opcode == Instruction::Store &&
6161 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6162 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6163 Align(Alignment)))))
6164 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6165 Alignment, CostKind, I);
6166
6167 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6168 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6169 if (!PtrTy && Ptr->getType()->isVectorTy())
6170 PtrTy = dyn_cast<PointerType>(
6171 cast<VectorType>(Ptr->getType())->getElementType());
6172 assert(PtrTy && "Unexpected type for Ptr argument");
6173 unsigned AddressSpace = PtrTy->getAddressSpace();
6174 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6175 AddressSpace);
6176}
6177
6178bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6179                               const TargetTransformInfo::LSRCost &C2) {
6180   // X86-specific here: the number of instructions has first priority.
6181 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6182 C1.NumIVMuls, C1.NumBaseAdds,
6183 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6184 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6185 C2.NumIVMuls, C2.NumBaseAdds,
6186 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6187}
6188
6189 bool X86TTIImpl::canMacroFuseCmp() {
6190 return ST->hasMacroFusion() || ST->hasBranchFusion();
6191}
6192
6193bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6194 Type *ScalarTy = DataTy->getScalarType();
6195
6196 // The backend can't handle a single element vector w/o CFCMOV.
6197 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6198 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6199
6200 if (!ST->hasAVX())
6201 return false;
6202
6203 if (ScalarTy->isPointerTy())
6204 return true;
6205
6206 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6207 return true;
6208
6209 if (ScalarTy->isHalfTy() && ST->hasBWI())
6210 return true;
6211
6212 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6213 return true;
6214
6215 if (!ScalarTy->isIntegerTy())
6216 return false;
6217
6218 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6219 return IntWidth == 32 || IntWidth == 64 ||
6220 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6221}
6222
6223bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6224 return isLegalMaskedLoad(DataType, Alignment);
6225}
6226
6227bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6228 unsigned DataSize = DL.getTypeStoreSize(DataType);
6229 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6230 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6231 // (the equivalent stores only require AVX).
6232 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6233 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6234
6235 return false;
6236}
6237
6238bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6239 unsigned DataSize = DL.getTypeStoreSize(DataType);
6240
6241 // SSE4A supports nontemporal stores of float and double at arbitrary
6242 // alignment.
6243 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6244 return true;
6245
6246 // Besides the SSE4A subtarget exception above, only aligned stores are
6247   // available nontemporally on any other subtarget, and only stores with a size
6248 // of 4..32 bytes (powers of 2, only) are permitted.
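  // For example, an aligned 4- or 8-byte scalar store can use MOVNTI, and an
  // aligned 16-byte vector store can use MOVNTPS/MOVNTDQ.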
6249 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6250 !isPowerOf2_32(DataSize))
6251 return false;
6252
6253 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6254 // loads require AVX2).
6255 if (DataSize == 32)
6256 return ST->hasAVX();
6257 if (DataSize == 16)
6258 return ST->hasSSE1();
6259 return true;
6260}
6261
6262 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6263 ElementCount NumElements) const {
6264 // movddup
6265 return ST->hasSSE3() && !NumElements.isScalable() &&
6266 NumElements.getFixedValue() == 2 &&
6267 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6268}
6269
6270 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6271 if (!isa<VectorType>(DataTy))
6272 return false;
6273
6274 if (!ST->hasAVX512())
6275 return false;
6276
6277 // The backend can't handle a single element vector.
6278 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6279 return false;
6280
6281 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6282
6283 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6284 return true;
6285
6286 if (!ScalarTy->isIntegerTy())
6287 return false;
6288
6289 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6290 return IntWidth == 32 || IntWidth == 64 ||
6291 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6292}
6293
6294 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6295 return isLegalMaskedExpandLoad(DataTy, Alignment);
6296}
6297
6298bool X86TTIImpl::supportsGather() const {
6299 // Some CPUs have better gather performance than others.
6300   // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6301 // enable gather with a -march.
6302 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6303}
6304
6305 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6306 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6307   // A vector-4 gather/scatter instruction does not exist on KNL. We can extend
6308 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6309 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6310 // Check, maybe the gather/scatter instruction is better in the VariableMask
6311 // case.
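  // For example, a 2-element gather on any AVX-512 target, or a 4-element
  // gather on KNL (AVX-512 without VLX), is forced to scalarize here.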
6312 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6313 return NumElts == 1 ||
6314 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6315}
6316
6317 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6318 Type *ScalarTy = DataTy->getScalarType();
6319 if (ScalarTy->isPointerTy())
6320 return true;
6321
6322 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6323 return true;
6324
6325 if (!ScalarTy->isIntegerTy())
6326 return false;
6327
6328 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6329 return IntWidth == 32 || IntWidth == 64;
6330}
6331
6332 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6333 if (!supportsGather() || !ST->preferGather())
6334 return false;
6335 return isLegalMaskedGatherScatter(DataTy, Alignment);
6336}
6337
6338bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6339 unsigned Opcode1,
6340 const SmallBitVector &OpcodeMask) const {
6341 // ADDSUBPS 4xf32 SSE3
6342 // VADDSUBPS 4xf32 AVX
6343 // VADDSUBPS 8xf32 AVX2
6344 // ADDSUBPD 2xf64 SSE3
6345 // VADDSUBPD 2xf64 AVX
6346 // VADDSUBPD 4xf64 AVX2
6347
6348 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6349 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6350 if (!isPowerOf2_32(NumElements))
6351 return false;
6352   // Check the opcode pattern. We apply the mask to the opcode arguments and
6353 // then check if it is what we expect.
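  // For example, Opcode0 = FSub and Opcode1 = FAdd with an OpcodeMask that
  // selects Opcode1 for the odd lanes describes the <4 x float> ADDSUBPS
  // pattern.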
6354 for (int Lane : seq<int>(0, NumElements)) {
6355 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6356 // We expect FSub for even lanes and FAdd for odd lanes.
6357 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6358 return false;
6359 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6360 return false;
6361 }
6362 // Now check that the pattern is supported by the target ISA.
6363 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6364 if (ElemTy->isFloatTy())
6365 return ST->hasSSE3() && NumElements % 4 == 0;
6366 if (ElemTy->isDoubleTy())
6367 return ST->hasSSE3() && NumElements % 2 == 0;
6368 return false;
6369}
6370
6371bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6372 // AVX2 doesn't support scatter
6373 if (!ST->hasAVX512() || !ST->preferScatter())
6374 return false;
6375 return isLegalMaskedGatherScatter(DataType, Alignment);
6376}
6377
6378bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6379 EVT VT = TLI->getValueType(DL, DataType);
6380 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6381}
6382
6383 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6384 // FDIV is always expensive, even if it has a very low uop count.
6385 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6386 if (I->getOpcode() == Instruction::FDiv)
6387 return true;
6388
6390}
6391
6392 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6393 return false;
6394}
6395
6396 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6397 const Function *Callee) const {
6398 const TargetMachine &TM = getTLI()->getTargetMachine();
6399
6400 // Work this as a subsetting of subtarget features.
6401 const FeatureBitset &CallerBits =
6402 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6403 const FeatureBitset &CalleeBits =
6404 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6405
6406 // Check whether features are the same (apart from the ignore list).
6407 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6408 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6409 if (RealCallerBits == RealCalleeBits)
6410 return true;
6411
6412 // If the features are a subset, we need to additionally check for calls
6413 // that may become ABI-incompatible as a result of inlining.
6414 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6415 return false;
6416
6417 for (const Instruction &I : instructions(Callee)) {
6418 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6419 // Having more target features is fine for inline ASM.
6420 if (CB->isInlineAsm())
6421 continue;
6422
6423       SmallVector<Type *, 8> Types;
6424 for (Value *Arg : CB->args())
6425 Types.push_back(Arg->getType());
6426 if (!CB->getType()->isVoidTy())
6427 Types.push_back(CB->getType());
6428
6429 // Simple types are always ABI compatible.
6430 auto IsSimpleTy = [](Type *Ty) {
6431 return !Ty->isVectorTy() && !Ty->isAggregateType();
6432 };
6433 if (all_of(Types, IsSimpleTy))
6434 continue;
6435
6436 if (Function *NestedCallee = CB->getCalledFunction()) {
6437 // Assume that intrinsics are always ABI compatible.
6438 if (NestedCallee->isIntrinsic())
6439 continue;
6440
6441 // Do a precise compatibility check.
6442 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6443 return false;
6444 } else {
6445         // We don't know the target features of the callee, so
6446         // assume it is incompatible.
6447 return false;
6448 }
6449 }
6450 }
6451 return true;
6452}
6453
6454 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6455 const Function *Callee,
6456 const ArrayRef<Type *> &Types) const {
6457 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6458 return false;
6459
6460 // If we get here, we know the target features match. If one function
6461 // considers 512-bit vectors legal and the other does not, consider them
6462 // incompatible.
6463 const TargetMachine &TM = getTLI()->getTargetMachine();
6464
6465 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6466 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6467 return true;
6468
6469 // Consider the arguments compatible if they aren't vectors or aggregates.
6470 // FIXME: Look at the size of vectors.
6471 // FIXME: Look at the element types of aggregates to see if there are vectors.
6472 return llvm::none_of(Types,
6473 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6474}
6475
6476 X86TTIImpl::TTI::MemCmpExpansionOptions
6477 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6478   TTI::MemCmpExpansionOptions Options;
6479 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6480 Options.NumLoadsPerBlock = 2;
6481 // All GPR and vector loads can be unaligned.
6482 Options.AllowOverlappingLoads = true;
6483 if (IsZeroCmp) {
6484 // Only enable vector loads for equality comparison. Right now the vector
6485   // version is not as fast for three-way compare (see #33329).
6486 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6487 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6488 Options.LoadSizes.push_back(64);
6489 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6490 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6491 }
6492 if (ST->is64Bit()) {
6493 Options.LoadSizes.push_back(8);
6494 }
6495 Options.LoadSizes.push_back(4);
6496 Options.LoadSizes.push_back(2);
6497 Options.LoadSizes.push_back(1);
6498 return Options;
6499}
6500
6501 bool X86TTIImpl::prefersVectorizedAddressing() const {
6502 return supportsGather();
6503}
6504
6505 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6506 return false;
6507}
6508
6509 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6510 // TODO: We expect this to be beneficial regardless of arch,
6511 // but there are currently some unexplained performance artifacts on Atom.
6512 // As a temporary solution, disable on Atom.
6513 return !(ST->isAtom());
6514}
6515
6516 // Get an estimate for interleaved load/store operations and strided loads.
6517 // \p Indices contains the indices for a strided load.
6518 // \p Factor is the interleaving factor.
6519 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6520 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6521 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6522 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6523 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6524 // VecTy for interleave memop is <VF*Factor x Elt>.
6525 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6526 // VecTy = <12 x i32>.
6527
6528 // Calculate the number of memory operations (NumOfMemOps), required
6529 // for load/store the VecTy.
6530 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6531 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6532 unsigned LegalVTSize = LegalVT.getStoreSize();
6533 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
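  // For example, assuming v8i32 is the widest legal type, a <12 x i32>
  // interleave group (48 bytes) needs ceil(48 / 32) = 2 memory operations.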
6534
6535 // Get the cost of one memory operation.
6536 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6537 LegalVT.getVectorNumElements());
6538 InstructionCost MemOpCost;
6539 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6540 if (UseMaskedMemOp)
6541     MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6542                                       AddressSpace, CostKind);
6543   else
6544     MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6545                                 AddressSpace, CostKind);
6546
6547 unsigned VF = VecTy->getNumElements() / Factor;
6548   MVT VT =
6549       MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6550
6551 InstructionCost MaskCost;
6552 if (UseMaskedMemOp) {
6553 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6554 for (unsigned Index : Indices) {
6555 assert(Index < Factor && "Invalid index for interleaved memory op");
6556 for (unsigned Elm = 0; Elm < VF; Elm++)
6557 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6558 }
6559
6560 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6561
6562 MaskCost = getReplicationShuffleCost(
6563 I1Type, Factor, VF,
6564 UseMaskForGaps ? DemandedLoadStoreElts
6565                        : APInt::getAllOnes(VecTy->getNumElements()),
6566         CostKind);
6567
6568 // The Gaps mask is invariant and created outside the loop, therefore the
6569 // cost of creating it is not accounted for here. However if we have both
6570 // a MaskForGaps and some other mask that guards the execution of the
6571 // memory access, we need to account for the cost of And-ing the two masks
6572 // inside the loop.
6573 if (UseMaskForGaps) {
6574 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6575 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6576 }
6577 }
6578
6579 if (Opcode == Instruction::Load) {
6580 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6581 // contain the cost of the optimized shuffle sequence that the
6582 // X86InterleavedAccess pass will generate.
6583 // The cost of loads and stores are computed separately from the table.
6584
6585     // X86InterleavedAccess supports only the following interleaved-access groups.
6586 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6587 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6588 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6589 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6590 };
6591
6592 if (const auto *Entry =
6593 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6594 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6595     // If an entry does not exist, fall back to the default implementation.
6596
6597     // The kind of shuffle depends on the number of loaded values.
6598 // If we load the entire data in one register, we can use a 1-src shuffle.
6599 // Otherwise, we'll merge 2 sources in each operation.
6600 TTI::ShuffleKind ShuffleKind =
6601 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6602
6603 InstructionCost ShuffleCost =
6604 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6605
6606 unsigned NumOfLoadsInInterleaveGrp =
6607 Indices.size() ? Indices.size() : Factor;
6608 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6609 VecTy->getNumElements() / Factor);
6610 InstructionCost NumOfResults =
6611 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6612
6613     // About half of the loads may be folded into shuffles when we have only
6614 // one result. If we have more than one result, or the loads are masked,
6615 // we do not fold loads at all.
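    // For example, with NumOfMemOps = 4 and a single unmasked result, only
    // 4 / 2 = 2 of the loads are costed here; the rest are assumed to fold.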
6616 unsigned NumOfUnfoldedLoads =
6617 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6618
6619     // Get the number of shuffle operations per result.
6620 unsigned NumOfShufflesPerResult =
6621 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6622
6623     // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6624 // When we have more than one destination, we need additional instructions
6625 // to keep sources.
6626 InstructionCost NumOfMoves = 0;
6627 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6628 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6629
6630 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6631 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6632 NumOfMoves;
6633
6634 return Cost;
6635 }
6636
6637 // Store.
6638 assert(Opcode == Instruction::Store &&
6639 "Expected Store Instruction at this point");
6640   // X86InterleavedAccess supports only the following interleaved-access groups.
6641 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6642 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6643 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6644 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6645
6646 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6647 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6648 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6649 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6650 };
6651
6652 if (const auto *Entry =
6653 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6654 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6655   // If an entry does not exist, fall back to the default implementation.
6656
6657   // There are no strided stores at the moment, and a store can't be folded
6658   // into a shuffle.
6659 unsigned NumOfSources = Factor; // The number of values to be merged.
6660 InstructionCost ShuffleCost = getShuffleCost(
6661 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6662 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6663
6664   // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
6665 // We need additional instructions to keep sources.
6666 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6667   InstructionCost Cost =
6668       MaskCost +
6669 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6670 NumOfMoves;
6671 return Cost;
6672}
6673
6674 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6675 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6676 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6677 bool UseMaskForCond, bool UseMaskForGaps) {
6678 auto *VecTy = cast<FixedVectorType>(BaseTy);
6679
6680 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6681 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6682 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6683 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6684 return true;
6685 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6686 return ST->hasBWI();
6687 if (EltTy->isBFloatTy())
6688 return ST->hasBF16();
6689 return false;
6690 };
6691 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6692     return getInterleavedMemoryOpCostAVX512(
6693         Opcode, VecTy, Factor, Indices, Alignment,
6694 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6695
6696 if (UseMaskForCond || UseMaskForGaps)
6697 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6698 Alignment, AddressSpace, CostKind,
6699 UseMaskForCond, UseMaskForGaps);
6700
6701   // Get an estimate for interleaved load/store operations for SSE-AVX2.
6702   // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles that
6703   // allow computing the cost using a generic formula as a function of generic
6704   // shuffles. We therefore use a lookup table instead, filled according to
6705   // the instruction sequences that codegen currently generates.
6706
6707 // VecTy for interleave memop is <VF*Factor x Elt>.
6708 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6709 // VecTy = <12 x i32>.
6710 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6711
6712 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6713   // VF = 2, while v2i128 is an unsupported MVT vector type
6714 // (see MachineValueType.h::getVectorVT()).
6715 if (!LegalVT.isVector())
6716 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6717 Alignment, AddressSpace, CostKind);
6718
6719 unsigned VF = VecTy->getNumElements() / Factor;
6720 Type *ScalarTy = VecTy->getElementType();
6721 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
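  // For example, <8 x float> is costed as <8 x i32>, and a vector of 64-bit
  // pointers as a vector of i64.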
6722 if (!ScalarTy->isIntegerTy())
6723 ScalarTy =
6724 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6725
6726 // Get the cost of all the memory operations.
6727 // FIXME: discount dead loads.
6728 InstructionCost MemOpCosts = getMemoryOpCost(
6729 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6730
6731 auto *VT = FixedVectorType::get(ScalarTy, VF);
6732 EVT ETy = TLI->getValueType(DL, VT);
6733 if (!ETy.isSimple())
6734 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6735 Alignment, AddressSpace, CostKind);
6736
6737   // TODO: Complete for other data-types and strides.
6738   // Each combination of Stride, element bit width and VF results in a different
6739   // sequence; the cost tables are therefore accessed with:
6740   // Factor (stride) and VectorType=VFxiN.
6741   // The Cost accounts only for the shuffle sequence;
6742   // the cost of the loads/stores is accounted for separately.
6743 //
6744 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6745 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6746 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6747 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6748 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6749 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6750
6751 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6752 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6753 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6754
6755 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6756 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6757 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6758
6759 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6760 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6761 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6762 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6763
6764 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6765 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6766 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6767 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6768 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6769
6770 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6771 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6772 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6773 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6774 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6775
6776 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6777 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6778 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6779 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6780 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6781
6782 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6783 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6784 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6785 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6786
6787 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6788 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6789 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6790 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6791 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6792
6793 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6794 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6795 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6796 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6797 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6798
6799 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6800 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6801 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6802 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6803 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6804
6805 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6806 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6807 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6808 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6809
6810 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6811 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6812 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6813 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6814 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6815
6816 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6817 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6818 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6819 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6820 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6821
6822 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6823 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6824 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6825 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6826
6827 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6828 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6829 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6830
6831 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6832 };
6833
6834 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6835 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6836 };
6837
6838 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6839 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6840 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6841
6842 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6843 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6844
6845 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6846 };
6847
6848 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6849 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6850 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6851
6852 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6853 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6854 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6855
6856 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6857 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6858 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6859 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6860
6861 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6862 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6863 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6864 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6865 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6866
6867 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6868 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6869 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6870 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6871 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6872
6873 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6874 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6875 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6876 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6877 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6878
6879 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6880 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6881 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6882 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6883 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6884
6885 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6886 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6887 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6888 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6889
6890 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6891 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6892 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6893 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6894 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6895
6896 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6897 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6898 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6899 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6900 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6901
6902 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6903 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6904 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6905 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6906 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6907
6908 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6909 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6910 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6911 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6912
6913 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6914 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6915 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6916 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6917 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6918
6919 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6920 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6921 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6922 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6923 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6924
6925 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6926 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6927 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6928 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6929
6930 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6931 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6932 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6933 };
6934
6935 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6936 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6937 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6938 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6939
6940 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6941 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6942
6943 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6944 };
6945
6946 if (Opcode == Instruction::Load) {
6947 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6948 MemOpCosts](const CostTblEntry *Entry) {
6949 // NOTE: this is just an approximation!
6950       // It can over- or under-estimate the cost!
6951 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6952 };
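    // For example, with Factor = 3, the AVX2 v8i32 entry cost of 7 and only
    // 2 of the 3 members used, this yields MemOpCosts + ceil(2 * 7 / 3) =
    // MemOpCosts + 5.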
6953
6954 if (ST->hasAVX2())
6955 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6956 ETy.getSimpleVT()))
6957 return GetDiscountedCost(Entry);
6958
6959 if (ST->hasSSSE3())
6960 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6961 ETy.getSimpleVT()))
6962 return GetDiscountedCost(Entry);
6963
6964 if (ST->hasSSE2())
6965 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6966 ETy.getSimpleVT()))
6967 return GetDiscountedCost(Entry);
6968 } else {
6969 assert(Opcode == Instruction::Store &&
6970 "Expected Store Instruction at this point");
6971 assert((!Indices.size() || Indices.size() == Factor) &&
6972 "Interleaved store only supports fully-interleaved groups.");
6973 if (ST->hasAVX2())
6974 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6975 ETy.getSimpleVT()))
6976 return MemOpCosts + Entry->Cost;
6977
6978 if (ST->hasSSE2())
6979 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6980 ETy.getSimpleVT()))
6981 return MemOpCosts + Entry->Cost;
6982 }
6983
6984 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6985 Alignment, AddressSpace, CostKind,
6986 UseMaskForCond, UseMaskForGaps);
6987}
6988
6989 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6990 StackOffset BaseOffset,
6991 bool HasBaseReg, int64_t Scale,
6992 unsigned AddrSpace) const {
6993 // Scaling factors are not free at all.
6994 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6995 // will take 2 allocations in the out-of-order engine instead of 1
6996 // for plain addressing mode, i.e. inst (reg1).
6997 // E.g.,
6998 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6999 // Requires two allocations (one for the load, one for the computation)
7000 // whereas:
7001 // vaddps (%rsi), %ymm0, %ymm1
7002 // Requires just 1 allocation, i.e., freeing allocations for other operations
7003 // and leaving fewer micro operations to execute.
7004 //
7005 // For some X86 architectures, this is even worse because, for instance, for
7006 // stores, the complex addressing mode forces the instruction to use the
7007 // "load" ports instead of the dedicated "store" port.
7008 // E.g., on Haswell:
7009 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7010 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
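// Consequently, a legal mode that uses a scaled index register is costed
// at 1 below, a plain base-register mode at 0, and an illegal mode at -1.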
7011   TargetLoweringBase::AddrMode AM;
7012   AM.BaseGV = BaseGV;
7013 AM.BaseOffs = BaseOffset.getFixed();
7014 AM.HasBaseReg = HasBaseReg;
7015 AM.Scale = Scale;
7016 AM.ScalableOffset = BaseOffset.getScalable();
7017 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7018 // Scale represents reg2 * scale, thus account for 1
7019 // as soon as we use a second register.
7020 return AM.Scale != 0;
7021 return -1;
7022}
7023
7024 unsigned X86TTIImpl::getBranchMispredictPenalty() const {
7025 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7026 return 14;
7027}
7028
7029 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7030 unsigned Bits = Ty->getScalarSizeInBits();
7031
7032 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7033 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7034 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7035 return false;
7036
7037 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7038 // shifts just as cheap as scalar ones.
7039 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7040 return false;
7041
7042 // AVX512BW has shifts such as vpsllvw.
7043 if (ST->hasBWI() && Bits == 16)
7044 return false;
7045
7046 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7047 // fully general vector.
7048 return true;
7049}
7050
7051unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7052 Type *ScalarValTy) const {
7053 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7054 return 4;
7055 }
7056 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7057}
7058
7059 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7060 SmallVectorImpl<Use *> &Ops) const {
7061 using namespace llvm::PatternMatch;
7062
7063 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7064 if (!VTy)
7065 return false;
7066
7067 if (I->getOpcode() == Instruction::Mul &&
7068 VTy->getElementType()->isIntegerTy(64)) {
7069 for (auto &Op : I->operands()) {
7070 // Make sure we are not already sinking this operand
7071 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7072 continue;
7073
7074 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7075 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
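      // For example, (ashr (shl X, 32), 32) is the sext_inreg form and
      // (and X, 0xffffffff) the zext_inreg form matched below.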
7076 if (ST->hasSSE41() &&
7077 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7078 m_SpecificInt(32)))) {
7079 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7080 Ops.push_back(&Op);
7081 } else if (ST->hasSSE2() &&
7082 match(Op.get(),
7083 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7084 Ops.push_back(&Op);
7085 }
7086 }
7087
7088 return !Ops.empty();
7089 }
7090
7091 // A uniform shift amount in a vector shift or funnel shift may be much
7092 // cheaper than a generic variable vector shift, so make that pattern visible
7093 // to SDAG by sinking the shuffle instruction next to the shift.
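  // For example, sinking a splat shuffle of the shift amount next to a
  // vector shl lets SDAG select a single shift-by-scalar (e.g. VPSLLD with
  // the count in an XMM register) instead of a general variable shift.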
7094 int ShiftAmountOpNum = -1;
7095 if (I->isShift())
7096 ShiftAmountOpNum = 1;
7097 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7098 if (II->getIntrinsicID() == Intrinsic::fshl ||
7099 II->getIntrinsicID() == Intrinsic::fshr)
7100 ShiftAmountOpNum = 2;
7101 }
7102
7103 if (ShiftAmountOpNum == -1)
7104 return false;
7105
7106 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7107 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7108 isVectorShiftByScalarCheap(I->getType())) {
7109 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7110 return true;
7111 }
7112
7113 return false;
7114}
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:479
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:397
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:670
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:703
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
Definition: DerivedTypes.h:487
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Definition: DerivedTypes.h:541
Type * getElementType() const
Definition: DerivedTypes.h:460
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
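These subtarget predicates gate every cost table in this file. Because each feature level implies its predecessors (AVX2 implies AVX, AVX implies SSE4.2, and so on), the checks are ordered newest-first. A hedged sketch of that convention, not a function from this file:

// Illustrative only: pick the widest vector register width by probing the
// subtarget from the newest ISA extension down. X86Subtarget.h is a
// target-private header, visible inside lib/Target/X86.
static unsigned vectorRegWidth(const X86Subtarget *ST) {
  if (ST->useAVX512Regs())
    return 512;
  if (ST->hasAVX())
    return 256;
  if (ST->hasSSE1())
    return 128;
  return 64;
}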
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of a Gather / Scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isVectorShiftByScalarCheap(Type *Ty) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
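Passes do not call X86TTIImpl directly; they query the generic TargetTransformInfo analysis, which dispatches to the overrides listed above on X86. A minimal sketch, with the wrapper function invented for illustration:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static InstructionCost addCost(const TargetTransformInfo &TTI, Type *Ty) {
  // On an X86 target this lands in X86TTIImpl::getArithmeticInstrCost.
  return TTI.getArithmeticInstrCost(Instruction::Add, Ty,
                                    TargetTransformInfo::TCK_RecipThroughput);
}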
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
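A sketch of the fixed/scalable split exposed by the accessors above (values illustrative):

static void typeSizeExamples() {
  TypeSize FixedTS = TypeSize::getFixed(128);   // exactly 128 bits
  TypeSize ScalTS = TypeSize::getScalable(128); // 128 * vscale bits
  assert(!FixedTS.isScalable() && FixedTS.getFixedValue() == 128);
  assert(ScalTS.isScalable() && ScalTS.getKnownMinValue() == 128);
}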
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
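The matchers above compose into declarative pattern trees. A sketch (function and capture names are illustrative) that matches a single-use shl of a load by a constant splat amount:

#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool isOneUseShlOfLoad(Value *V) {
  Value *Ptr;          // captures the load's pointer operand
  const APInt *ShAmt;  // captures the constant shift amount
  return match(V, m_OneUse(m_Shl(m_Load(m_Value(Ptr)),
                                 m_APIntAllowPoison(ShAmt))));
}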
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
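CostTableLookup is the workhorse of this file: static tables of (ISD opcode, MVT, cost) rows are probed per feature level. A hedged sketch with an invented entry, not this file's data:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

static InstructionCost gatedAddCost(const X86Subtarget *ST, MVT Ty) {
  static const CostTblEntry AVX2CostTbl[] = {
      {ISD::ADD, MVT::v8i32, 1}, // e.g. one ymm vpaddd
  };
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD::ADD, Ty))
      return Entry->Cost;
  return InstructionCost::getInvalid(); // caller falls back to the base impl
}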
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index of the item in the sequence and B, C, ... are the values from the input ranges.
Definition: STLExtras.h:2448
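enumerate() removes the manual index counter from range loops, and its results support structured bindings. A sketch checking an illustrative shuffle mask for the identity pattern (PoisonMaskElem, documented below, is the -1 poison sentinel):

#include "llvm/ADT/STLExtras.h"   // enumerate()
#include "llvm/IR/Instructions.h" // PoisonMaskElem

using namespace llvm;

static bool isIdentityMask(ArrayRef<int> Mask) {
  for (auto [Lane, Elt] : enumerate(Mask))
    if (Elt != PoisonMaskElem && (size_t)Elt != Lane)
      return false; // a defined lane that doesn't pass through
  return true;
}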
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
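The all_of/any_of/none_of wrappers above pair naturally with PoisonMaskElem when scanning shuffle masks. A small sketch (function name illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h" // declares PoisonMaskElem

using namespace llvm;

static bool usesOnlyFirstSource(ArrayRef<int> Mask) {
  // Poison lanes don't pin a source; every defined lane must index the
  // first input vector, i.e. be < Mask.size().
  return all_of(Mask, [&](int M) {
    return M == PoisonMaskElem || M < (int)Mask.size();
  });
}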
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
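Worked values for the arithmetic helpers above; the numbers are illustrative:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

static void mathHelperExamples() {
  static_assert(isPowerOf2_32(64), "64 is a power of two");
  unsigned Regs = divideCeil(300, 128);   // 3: 128-bit regs to hold 300 bits
  uint64_t Up = PowerOf2Ceil(300);        // 512: next power of two up
  unsigned Down = alignDown(300, 128);    // 256: largest multiple <= 300
  uint64_t Bytes = alignTo(10, Align(8)); // 16: size padded to alignment
  (void)Regs; (void)Up; (void)Down; (void)Bytes;
}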
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
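ConvertCostTableLookup is the two-type sibling of CostTableLookup, keyed on (ISD, Dst, Src). A sketch with an invented entry, not this file's data:

static InstructionCost sitofpCost(MVT Dst, MVT Src) {
  static const TypeConversionCostTblEntry ConvTbl[] = {
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, // e.g. cvtdq2ps
  };
  if (const auto *Entry =
          ConvertCostTableLookup(ConvTbl, ISD::SINT_TO_FP, Dst, Src))
    return Entry->Cost;
  return InstructionCost::getInvalid();
}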
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
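A sketch of getSplatIndex on a broadcast-style mask; the mask contents and function name are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h" // declares getSplatIndex

static int splatIndexExample() {
  llvm::SmallVector<int, 4> Mask = {2, -1, 2, 2}; // -1 lanes are poison
  return llvm::getSplatIndex(Mask); // 2; a non-splat mask yields -1
}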
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
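A sketch tying together the alignment helpers above (values illustrative):

static void alignmentExamples() {
  MaybeAlign MA;             // undefined (unset) alignment
  Align A = MA.valueOrOne(); // decays to Align(1)
  // The alignment guaranteed at base+8 when the base is 16-byte aligned:
  Align B = commonAlignment(Align(16), /*Offset=*/8); // Align(8)
  (void)A; (void)B;
}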
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55