//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus has most likely the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                    divss    sqrtss    rsqrtss
///   AMD K7           11-16    19        3
///   Piledriver       9-24     13-15     5
///   Jaguar           14       16        2
///   Pentium II,III   18       30        2
///   Nehalem          7-14     7-18      3
///   Haswell          10-13    11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//
// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
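
// Illustrative reading of a cost table entry (a sketch only; the values are
// arbitrary and mirror the lookup pattern used throughout this file):
//
//   CostKindCosts Costs = {/*RecipThroughput*/ 1, /*Latency*/ 7,
//                          /*CodeSize*/ 2, /*SizeAndLatency*/ 3};
//   if (std::optional<unsigned> KindCost =
//           Costs[TargetTransformInfo::TCK_Latency])
//     ...; // *KindCost == 7; a kind left at ~0U yields std::nullopt.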
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //       instructions is inefficient. Once the problem is fixed, we should
  //       call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}
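
// For example: in 64-bit mode with AVX512 the query for the vector class
// (ClassID == 1) reports 32 registers (zmm0-zmm31); in 32-bit mode both
// classes report 8.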

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}
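
// Sketch of the resulting behaviour for a few hypothetical queries:
//   hasConditionalLoadStoreForType(nullptr)   -> true if the CF feature is set
//   hasConditionalLoadStoreForType(i32)       -> true  (32-bit CFCMOV)
//   hasConditionalLoadStoreForType(i8)        -> false (no 8-bit form)
//   hasConditionalLoadStoreForType(<1 x i64>) -> true  (single-element vector)
//   hasConditionalLoadStoreForType(float)     -> false (see TODO above)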

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
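
// For example: with AVX512+EVEX512 but -mprefer-vector-width=256, the
// RGK_FixedWidthVector query answers 256 rather than 512, since
// PreferVectorWidth caps each feature tier above.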

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
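
  // E.g. the cost of a <8 x i8> multiply is composed as (a sketch):
  //   cost(zext <8 x i8> to <8 x i16>) + cost(mul <8 x i16>)
  //     + cost(trunc <8 x i16> to <8 x i8>)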

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
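
  // E.g. a vector multiply by 16 is costed as one shl (X << 4), and a
  // multiply by -16 as a shl plus a sub (0 - (X << 4)).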

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }
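
  // E.g. for i32 elements, (X sdiv 8) is costed as the usual expansion
  // (a sketch; the bias makes the shift round towards zero):
  //   %sign = ashr %X, 31      ; splat the sign bit
  //   %bias = lshr %sign, 29   ; 2^3 - 1 for negative X, else 0
  //   %tmp  = add %X, %bias
  //   %res  = ashr %tmp, 3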

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
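
  // E.g. (X udiv 8) is costed as (X lshr 3), and (X urem 8) as (X and 7).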

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
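
  // GFNI can emulate a uniform-constant vXi8 shift with a single
  // gf2p8affineqb by encoding the shift as a constant 8x8 bit-matrix
  // multiply, hence the flat single-instruction costs above.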

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them
    // custom in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
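
  // E.g. shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> can be costed as
  // mul <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16> (a single pmulld).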

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480 // It is not a good idea to vectorize division. We have to scalarize it and
1481 // in the process we will often end up having to spill regular
1482 // registers. The overhead of division is going to dominate most kernels
1483 // anyway, so try hard to prevent vectorization of division - it is
1484 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485 // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489 InstructionCost ScalarCost =
1490 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491 Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
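// Worked example of the heuristic above (illustrative, not from the cost
// tables): a <4 x i32> sdiv queried at TCK_RecipThroughput on a target where
// v4i32 is legal has LT.first == 1 and 4 lanes, so if the scalarized i32
// sdiv costs S the query returns 20 * 1 * 4 * S - large enough that the
// vectorizers will almost always keep division scalar.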
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506 return LT.first;
1507 break;
1508 }
1509 }
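// Illustrative fall-through for the switch above: an operation whose type
// appears in none of the tables (say an fadd on x86_fp80, assuming no table
// above lists MVT::f80) is costed as LT.first at TCK_CodeSize, i.e. one
// instruction per legalized value.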
1510
1511 // Fallback to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
1515
1516 InstructionCost
1517 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519 TTI::TargetCostKind CostKind) const {
1520 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521 return TTI::TCC_Basic;
1522 return InstructionCost::getInvalid();
1523 }
1524
1525 InstructionCost X86TTIImpl::getShuffleCost(
1526 TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535 // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544 return getShuffleCost(TTI::SK_InsertSubvector,
1545 VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546 CostKind, Mask.size() / 2, BaseTp);
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553 // For broadcasts we are splatting the first element from the first
1554 // input register, so we only need to reference that input; all the
1555 // output registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
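// Illustrative case for the broadcast-from-load check above: splatting a
// single-use loaded scalar, e.g.
//   %x = load float, ptr %p
//   ...splat %x into <8 x float>...
// folds into a single vbroadcastss from memory on AVX (32-bit or wider
// elements) or on AVX2 (any element width), so the shuffle is TCC_Free.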
1565
1566 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1567 // permutation.
1568 bool IsInLaneShuffle = false;
1569 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1570 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1571 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1572 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1573 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1574 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1575 if ((Mask.size() % NumLanes) == 0)
1576 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1577 return P.value() == PoisonMaskElem ||
1578 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1579 (P.index() / NumEltsPerLane);
1580 });
1581 }
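// A minimal standalone sketch of the in-lane test above (plain C++, no LLVM
// dependencies; isInLaneMask is a hypothetical name). Poison entries are
// modelled as negative values, and NumEltsPerLane would be 4 for v8f32:
//
//   #include <vector>
//   bool isInLaneMask(const std::vector<int> &Mask, unsigned NumEltsPerLane) {
//     for (unsigned I = 0, E = Mask.size(); I != E; ++I)
//       if (Mask[I] >= 0 &&
//           (unsigned(Mask[I]) % Mask.size()) / NumEltsPerLane !=
//               I / NumEltsPerLane)
//         return false;
//     return true;
//   }
//
//   isInLaneMask({1,0,3,2,5,4,7,6}, 4); // true: swaps stay inside each lane
//   isInLaneMask({4,5,6,7,0,1,2,3}, 4); // false: halves cross the 128-bit lanes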
1582
1583 // Treat <X x bfloat> shuffles as <X x half>.
1584 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1585 LT.second = LT.second.changeVectorElementType(MVT::f16);
1586
1587 // Subvector extractions are free if they start at the beginning of a
1588 // vector and cheap if the subvectors are aligned.
1589 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1590 int NumElts = LT.second.getVectorNumElements();
1591 if ((Index % NumElts) == 0)
1592 return TTI::TCC_Free;
1593 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1594 if (SubLT.second.isVector()) {
1595 int NumSubElts = SubLT.second.getVectorNumElements();
1596 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1597 return SubLT.first;
1598 // Handle some cases for widening legalization. For now we only handle
1599 // cases where the original subvector was naturally aligned and evenly
1600 // fit in its legalized subvector type.
1601 // FIXME: Remove some of the alignment restrictions.
1602 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1603 // vectors.
1604 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1605 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1606 (NumSubElts % OrigSubElts) == 0 &&
1607 LT.second.getVectorElementType() ==
1608 SubLT.second.getVectorElementType() &&
1609 LT.second.getVectorElementType().getSizeInBits() ==
1610 BaseTp->getElementType()->getPrimitiveSizeInBits()) {
1611 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1612 "Unexpected number of elements!");
1613 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1614 LT.second.getVectorNumElements());
1615 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1616 SubLT.second.getVectorNumElements());
1617 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1618 InstructionCost ExtractCost = getShuffleCost(
1619 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1620
1621 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1622 // if we have SSSE3 we can use pshufb.
1623 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1624 return ExtractCost + 1; // pshufd or pshufb
1625
1626 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1627 "Unexpected vector size");
1628
1629 return ExtractCost + 2; // worst case pshufhw + pshufd
1630 }
1631 }
1632 // If the extract subvector is not optimal, treat it as single op shuffle.
1633 Kind = TTI::SK_PermuteSingleSrc;
1634 }
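// Worked examples for the extraction rules above (illustrative, AVX where
// v8f32 is legal): extracting <4 x float> at index 0 from <8 x float> is
// TCC_Free (Index % NumElts == 0); extracting at index 4 costs SubLT.first
// (one vextractf128) since Index and NumElts are both multiples of
// NumSubElts == 4; an extract at index 2 is misaligned and is re-costed as
// a single-source permute.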
1635
1636 // Subvector insertions are cheap if the subvectors are aligned.
1637 // Note that in general, the insertion starting at the beginning of a vector
1638 // isn't free, because we need to preserve the rest of the wide vector,
1639 // but if the destination vector legalizes to the same width as the subvector
1640 // then the insertion will simplify to a (free) register copy.
1641 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1642 int NumElts = LT.second.getVectorNumElements();
1643 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1644 if (SubLT.second.isVector()) {
1645 int NumSubElts = SubLT.second.getVectorNumElements();
1646 bool MatchingTypes =
1647 NumElts == NumSubElts &&
1648 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1649 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1650 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1651 }
1652
1653 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1654 Kind = TTI::SK_PermuteTwoSrc;
1655 }
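// Worked example for the insertion rules above (illustrative): inserting
// <2 x double> into <4 x double> at index 2 on AVX has NumElts == 4 and
// NumSubElts == 2, so the aligned insertion costs SubLT.first (one
// vinsertf128); index 0 is not automatically free, since the upper half of
// the destination must still be preserved.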
1656
1657 // Handle some common (illegal) sub-vector types as they are often very cheap
1658 // to shuffle even on targets without PSHUFB.
1659 EVT VT = TLI->getValueType(DL, BaseTp);
1660 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1661 !ST->hasSSSE3()) {
1662 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1663 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1664 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1665 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1666 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1667 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1668
1669 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1670 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1671 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1672 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1673
1674 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1675 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1676 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1677 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1678
1679 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1680 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1681 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1682 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1683 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1684
1685 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1686 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1687 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1688 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1689 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1690 };
1691
1692 if (ST->hasSSE2())
1693 if (const auto *Entry =
1694 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1695 return Entry->Cost;
1696 }
1697
1698 // We are going to permute multiple sources and the result will be in multiple
1699 // destinations. We provide an accurate cost only for splits where the
1700 // element type remains the same.
1701 if (LT.first != 1) {
1702 MVT LegalVT = LT.second;
1703 if (LegalVT.isVector() &&
1704 LegalVT.getVectorElementType().getSizeInBits() ==
1705 BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1706 LegalVT.getVectorNumElements() <
1707 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1708 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1709 unsigned LegalVTSize = LegalVT.getStoreSize();
1710 // Number of source vectors after legalization:
1711 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1712 // Number of destination vectors after legalization:
1713 InstructionCost NumOfDests = LT.first;
1714
1715 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1716 LegalVT.getVectorNumElements());
1717
1718 if (!Mask.empty() && NumOfDests.isValid()) {
1719 // Try to perform better estimation of the permutation.
1720 // 1. Split the source/destination vectors into real registers.
1721 // 2. Do the mask analysis to identify which real registers are
1722 // permuted. If more than one source register is used to build a
1723 // destination register, the cost for this destination register is
1724 // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1725 // source register is used, build the mask and calculate the cost as a
1726 // cost of PermuteSingleSrc.
1727 // Also, for the single register permute we try to identify if the
1728 // destination register is just a copy of the source register or a
1729 // copy of the previous destination register (the cost is
1730 // TTI::TCC_Basic). If the source register is just reused, the cost for
1731 // this operation is TTI::TCC_Free.
1732 NumOfDests =
1733 getTypeLegalizationCost(
1734 FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1735 .first;
1736 unsigned E = *NumOfDests.getValue();
1737 unsigned NormalizedVF =
1738 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1739 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1740 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1741 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1742 copy(Mask, NormalizedMask.begin());
1743 unsigned PrevSrcReg = 0;
1744 ArrayRef<int> PrevRegMask;
1745 InstructionCost Cost = 0;
1746 processShuffleMasks(
1747 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1748 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1749 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1750 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1751 // Check if the previous register can be just copied to the next
1752 // one.
1753 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1754 PrevRegMask != RegMask)
1755 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1756 RegMask, CostKind, 0, nullptr);
1757 else
1758 // Just a copy of previous destination register.
1759 Cost += TTI::TCC_Basic;
1760 return;
1761 }
1762 if (SrcReg != DestReg &&
1763 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1764 // Just a copy of the source register.
1765 Cost += TTI::TCC_Basic;
1766 }
1767 PrevSrcReg = SrcReg;
1768 PrevRegMask = RegMask;
1769 },
1770 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1771 unsigned /*Unused*/,
1772 unsigned /*Unused*/) {
1773 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1774 CostKind, 0, nullptr);
1775 });
1776 return Cost;
1777 }
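// Rough worked example of the register-wise estimate above (illustrative):
// shuffling two <8 x i32> sources on SSE2 splits each value into two v4i32
// registers, and for a mask like {0,1,2,3,12,13,14,15} every destination
// register is an identity of a single source register, so the estimate
// degenerates to register copies (TTI::TCC_Basic) or reuse (TTI::TCC_Free)
// instead of the generic two-source permutes of the fallback below.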
1778
1779 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1780 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1781 {}, CostKind, 0, nullptr);
1782 }
1783
1784 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1785 }
1786
1787 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1788 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1789 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1790
1791 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1792 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1793
1794 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1795 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1796 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1797 };
1798
1799 if (ST->hasVBMI())
1800 if (const auto *Entry =
1801 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1802 return LT.first * Entry->Cost;
1803
1804 static const CostTblEntry AVX512BWShuffleTbl[] = {
1805 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1806 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1807 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1808
1809 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1810 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1811 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1812 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1813
1814 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1815 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1816 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1817 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1818 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1819
1820 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1821 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1822 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1823 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1824 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1825
1826 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1827 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1828
1829 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1830 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1831 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1832 };
1833
1834 if (ST->hasBWI())
1835 if (const auto *Entry =
1836 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1837 return LT.first * Entry->Cost;
1838
1839 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1840 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1841 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1842 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1843 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1844 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1845 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1846 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1847
1848 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1849 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1850 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1851 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1852 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1853 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1854 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1855
1856 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1857 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1858 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1859 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1860 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1861 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1862 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1863 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1864 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1865 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1866 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1867
1868 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1869 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1870 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1871 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1872 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1873 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1874 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1875 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1876 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1877 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1878 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1879 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1880 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1881
1882 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1883 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1884 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1885 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1886 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1887 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1888 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1889 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1890 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1891 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1892 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1893 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1894
1895 // FIXME: This just applies the type legalization cost rules above
1896 // assuming these completely split.
1897 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1898 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1899 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1900 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1901 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1902 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1903
1904 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1905 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1906 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1907 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1908 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1909 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1910 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1911 };
1912
1913 if (ST->hasAVX512())
1914 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1915 if (auto KindCost = Entry->Cost[CostKind])
1916 return LT.first * *KindCost;
1917
1918 static const CostTblEntry AVX2InLaneShuffleTbl[] = {
1919 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpshufb
1920 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 1}, // vpshufb
1921 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpshufb
1922
1923 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
1924 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
1925 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpshufd + vpblendd
1926 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpshufd + vpblendd
1927 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // 2*vpshufb + vpor
1928 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // 2*vpshufb + vpor
1929 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // 2*vpshufb + vpor
1930 };
1931
1932 if (IsInLaneShuffle && ST->hasAVX2())
1933 if (const auto *Entry =
1934 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1935 return LT.first * Entry->Cost;
1936
1937 static const CostTblEntry AVX2ShuffleTbl[] = {
1938 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1939 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1940 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1941 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1942 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1943 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1944 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1945
1946 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1947 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1948 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1949 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1950 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1951 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1952 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1953
1954 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1955 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1956 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1957
1958 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1959 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1960 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1961 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1962 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1963
1964 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1965 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1966 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1967 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1968 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1969 // + vpblendvb
1970 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1971 // + vpblendvb
1972 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1973 // + vpblendvb
1974
1975 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1976 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1977 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1978 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1979 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1980 // + vpblendvb
1981 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1982 // + vpblendvb
1983 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1984 // + vpblendvb
1985 };
1986
1987 if (ST->hasAVX2())
1988 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1989 return LT.first * Entry->Cost;
1990
1991 static const CostTblEntry XOPShuffleTbl[] = {
1992 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1993 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1994 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1995 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1996 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1997 // + vinsertf128
1998 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1999 // + vinsertf128
2000
2001 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
2002 // + vinsertf128
2003 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
2004 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
2005 // + vinsertf128
2006 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
2007 };
2008
2009 if (ST->hasXOP())
2010 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2011 return LT.first * Entry->Cost;
2012
2013 static const CostTblEntry AVX1InLaneShuffleTbl[] = {
2014 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermilpd
2015 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermilpd
2016 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermilps
2017 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermilps
2018
2019 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2020 // + vpor + vinsertf128
2021 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2022 // + vpor + vinsertf128
2023 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2024 // + vpor + vinsertf128
2025
2026 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
2027 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
2028 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpermilpd + vblendpd
2029 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpermilps + vblendps
2030 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 4*pshufb
2031 // + 2*vpor + vinsertf128
2032 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 9}, // 2*vextractf128 + 4*pshufb
2033 // + 2*vpor + vinsertf128
2034 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 4*pshufb
2035 // + 2*vpor + vinsertf128
2036 };
2037
2038 if (IsInLaneShuffle && ST->hasAVX())
2039 if (const auto *Entry =
2040 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2041 return LT.first * Entry->Cost;
2042
2043 static const CostTblEntry AVX1ShuffleTbl[] = {
2044 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2045 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2046 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2047 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2048 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
2049 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
2050 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
2051
2052 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2053 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2054 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2055 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2056 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2057 // + vinsertf128
2058 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2059 // + vinsertf128
2060 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2061 // + vinsertf128
2062
2063 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
2064 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
2065 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2066 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2067 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2068 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2069 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2070
2071 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2072 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2073 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2074 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2075 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2076 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2077 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2078
2079 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2080 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2081 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2082 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2083 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2084 // + 2*por + vinsertf128
2085 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2086 // + 2*por + vinsertf128
2087 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2088 // + 2*por + vinsertf128
2089
2090 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2091 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2092 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2093 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2094 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2095 // + 4*por + vinsertf128
2096 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2097 // + 4*por + vinsertf128
2098 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2099 // + 4*por + vinsertf128
2100 };
2101
2102 if (ST->hasAVX())
2103 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2104 return LT.first * Entry->Cost;
2105
2106 static const CostTblEntry SSE41ShuffleTbl[] = {
2107 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2108 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2109 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2110 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2111 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2112 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2113 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2114 };
2115
2116 if (ST->hasSSE41())
2117 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2118 return LT.first * Entry->Cost;
2119
2120 static const CostTblEntry SSSE3ShuffleTbl[] = {
2121 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2122 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2123 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2124
2125 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2126 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2127 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2128
2129 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2130 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2131 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2132
2133 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2134 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2135 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2136 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2137 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2138
2139 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2140 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2141 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2142
2143 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2144 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2145 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2146 };
2147
2148 if (ST->hasSSSE3())
2149 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2150 return LT.first * Entry->Cost;
2151
2152 static const CostTblEntry SSE2ShuffleTbl[] = {
2153 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2154 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2155 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2156 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2157 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2158 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2159
2160 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2161 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2162 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2163 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2164 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2165 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2166 // + 2*pshufd + 2*unpck + packus
2167
2168 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2169 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2170 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2171 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2172 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2173 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2174
2175 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2176 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2177 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2178 {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2179 {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2180 {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2181
2182 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2183 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2184 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2185 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2186 // + pshufd/unpck
2187 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2188 // + pshufd/unpck
2189 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2190 // + 2*pshufd + 2*unpck + 2*packus
2191
2192 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2193 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2194 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2195 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2196 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2197 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2198 };
2199
2200 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2201 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2202 };
2203
2204 if (ST->hasSSE2()) {
2205 bool IsLoad =
2206 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2207 if (ST->hasSSE3() && IsLoad)
2208 if (const auto *Entry =
2209 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2210 assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2211 LT.second.getVectorElementCount()) &&
2212 "Table entry missing from isLegalBroadcastLoad()");
2213 return LT.first * Entry->Cost;
2214 }
2215
2216 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2217 return LT.first * Entry->Cost;
2218 }
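// Illustrative case for the SSE3 entry above: splatting a loaded double to
// <2 x double> is a single movddup with the load folded in, which is why
// the broadcast is costed at 0 when an argument is a load.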
2219
2220 static const CostTblEntry SSE1ShuffleTbl[] = {
2221 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2222 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2223 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2224 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2225 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2226 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2227 };
2228
2229 if (ST->hasSSE1()) {
2230 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2231 // SHUFPS: each pair of output elements must come from a single source register.
2232 auto MatchSHUFPS = [](int X, int Y) {
2233 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2234 };
2235 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2236 return 1;
2237 }
2238 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2239 return LT.first * Entry->Cost;
2240 }
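// A standalone sketch of the SHUFPS pair test above (plain C++; matchSHUFPS
// is a hypothetical name): indices 0-3 select the first source, 4-7 the
// second, negative entries are undef, so bit 2 (X & 4) identifies the
// source register.
//
//   bool matchSHUFPS(int X, int Y) {
//     return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
//   }
//
//   matchSHUFPS(1, 0); // true:  both elements read source 0
//   matchSHUFPS(5, 4); // true:  both elements read source 1
//   matchSHUFPS(1, 4); // false: mixed sources, not a single shufps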
2241
2242 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2243}
2244
2245 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2246 Type *Src,
2247 TTI::CastContextHint CCH,
2248 TTI::TargetCostKind CostKind,
2249 const Instruction *I) {
2250 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2251 assert(ISD && "Invalid opcode");
2252
2253 // The cost tables include both specific, custom (non-legal) src/dst type
2254 // conversions and generic, legalized types. We test for customs first, before
2255 // falling back to legalization.
2256 // FIXME: Need a better design of the cost table to handle non-simple types
2257 // and the potentially massive number of combinations (elem_num x src_type x dst_type).
2258 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2259 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2260 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2261
2262 // Mask sign extend has an instruction.
2263 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2264 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2265 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2266 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2267 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2268 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2269 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2270 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2271 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2272 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2273 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2274 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2275 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2276 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2277 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2278 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2279 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2280
2281 // Mask zero extend is a sext + shift.
2282 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2283 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2284 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2285 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2286 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2287 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2288 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2289 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2290 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2291 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2292 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2293 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2294 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2295 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2296 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2297 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2298 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2299
2300 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2301 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2302 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2303 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2304 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2305 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2306 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2307 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2308 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2309 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2310 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2311 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2312 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2313 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2314 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2315 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2316 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2317
2318 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2319 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2320 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2321 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2322 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2323 };
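// Worked note for the "sext + shift" entries above (illustrative): the mask
// is first sign-extended so each active lane becomes all-ones (0xFF for i8),
// then a logical right shift by bitwidth-1 leaves 0 or 1 per lane - hence
// cost 2 at TCK_RecipThroughput versus 1 for the plain sign extend.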
2324
2325 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2326 // Mask sign extend has an instruction.
2327 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2328 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2329 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2330 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2331 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2332 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2335
2336 // Mask zero extend is a sext + shift.
2337 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2338 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2339 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2340 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2341 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2342 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2343 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2344 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2345
2346 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2347 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2348 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2349 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2350 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2351 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2352 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2353 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2354
2355 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2356 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2357
2358 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2359 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2360
2361 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2362 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2363
2364 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2365 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2366 };
2367
2368 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2369 // 256-bit wide vectors.
2370
2371 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2372 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2373 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2374 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2375 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2376 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2377 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2378 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2379
2380 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2381 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2382 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2383 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2384 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2385 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2386 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2387 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2388 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2389 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2390 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2391 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2392 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2393 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2394 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2395 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2396 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2397 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2398 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2399 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2400 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2401 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2402 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2403 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2404 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2405 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2406 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2407 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2408 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2409 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2410 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2411 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2412 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2413 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2414
2415 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2416 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2417 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2418
2419 // Sign extend is zmm vpternlogd+vptruncdb.
2420 // Zero extend is zmm broadcast load+vptruncdw.
2421 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2422 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2423 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2424 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2425 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2426 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2427 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2428 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2429
2430 // Sign extend is zmm vpternlogd+vptruncdw.
2431 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2432 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2433 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2434 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2435 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2436 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2437 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2438 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2439 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2440
2441 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2442 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2443 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2444 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2445 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2446 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2447 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2448 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2449 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2450 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2451
2452 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2453 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2454 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2455 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2456
2457 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2459 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2460 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2461 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2462 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2463 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2464 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2465 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2466 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2467
2468 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2469 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2470
2471 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2472 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2473 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2474 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2475 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2476 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2477 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2478 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2479
2480 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2481 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2482 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2483 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2484 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2485 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2486 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2487 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2488 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2489 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2490
2491 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2492 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2493 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2494 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2495 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2496 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2497 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2498 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2499 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2500 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2501 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2502
2503 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2504 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2505 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2506 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2507 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2508 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2509 };
2510
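// Reminder: each cost tuple in these tables is { RecipThroughput, Latency,
// CodeSize, SizeAndLatency }, indexed by the TTI::TargetCostKind requested
// at lookup time.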
2511 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2512 // Mask sign extend has an instruction.
2513 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2514 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2515 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2516 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2517 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2518 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2519 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2520 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2522 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2524 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2526 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2528 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2529 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2530
2531 // Mask zero extend is a sext + shift.
2532 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2534 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2536 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2537 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2538 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2539 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2540 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2541 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2542 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2543 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2544 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2545 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2546 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2547 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2548 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2549
2550 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2551 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2552 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2553 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2554 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2555 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2556 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2557 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2558 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2559 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2560 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2561 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2562 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2563 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2564 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2565 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2566 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2567
2568 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2569 };
2570
2571 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2572 // Mask sign extend has an instruction.
2573 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2574 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2575 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2581
2582 // Mask zero extend is a sext + shift.
2583 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2584 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2585 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2586 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2587 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2588 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2589 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2590 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2591
2592 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2593 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2594 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2595 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2596 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2597 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2598 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2599 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2600
2601 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2602 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2603 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2604 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2605
2606 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2607 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2608 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2609 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2610
2611 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2612 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2613 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2614 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2615
2616 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2617 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2618 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2619 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2620 };
2621
2622 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2623 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2624 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2625 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2626 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2627 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2628 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2629 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2630 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2631 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2632 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2633 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2634 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2635 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2636 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2637 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2638 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2639 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2640 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2641
2642 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2643 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2644 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2645 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2650 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2652
2653 // sign extend is vpcmpeq+maskedmove+vpmovdw
2654 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2655 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2659 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2661 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2663
2664 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2665 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2666 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2667 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2668 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2669 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2670 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2671 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2672
2673 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2674 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2675 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2676 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2677
2678 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2679 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2680 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2681 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2682 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2683 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2684 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2685 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2686 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2687 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2688 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2689 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2690
2691 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2692 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2693 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2694 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2695
2696 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2697 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2698 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2699 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2700 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2702 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2703 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2704 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2705 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2706 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2707 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2708 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2709
2710 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2711 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2712 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2713
2714 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2715 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2716 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2717 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2718 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2719 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2720 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2721 };
2722
2723 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2724 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2725 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2726 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2727 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2728 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2729 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2730
2731 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2732 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2733 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2734 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2735 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2736 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2737 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2738 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2739 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2740 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2745
2746 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2747
2748 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2749 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2750 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2751 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2752 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2753 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2754 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2755 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2756 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2757 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2758 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2759 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2760
2761 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2762 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2763
2764 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2765 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2766 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2767 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2768
2769 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2770 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2771 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2772 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2773 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2774 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2775 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2776 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2777
2778 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2779 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2780 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2781 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2782 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2783 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2784 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2785
2786 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2787 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2788 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2789 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2790 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2791 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2792 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2793 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2794 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2795 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2796 };
2797
2798 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2799 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2803 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2804 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2805
2806 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2807 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2808 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2809 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2810 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2811 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2812 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2813 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2814 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2815 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2816 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2817 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2818
2819 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2820 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2822 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2823 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2824
2825 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2826 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2827 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2828 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2829 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2830 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2831 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2832 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2833
2834 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2835 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2836 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2837 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2838 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2839 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2840 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2846
2847 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2848 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2864
2865 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2866 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2867 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2868 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2869 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2870 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2871 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2872 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2875 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2876
2877 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2878 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2879 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2880 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2881 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2882 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2883 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2884 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2886 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2888 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2889 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2890
2891 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2892 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2893 };
2894
2895 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2896 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2897 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2898 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2899 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2900 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2901 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2902 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2903 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2904 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2905 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2906 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2907 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2908
2909 // These truncates end up widening elements.
2910 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2911 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2912 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2913
2914 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2915 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2916 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2917
2918 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2919 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2920 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2921 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2922 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2923 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2924 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2925 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2926 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2927 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2928 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2929
2930 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2931 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2932 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2933 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2934 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2935 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2936 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2937 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2938 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2939 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2940 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2941 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2942 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2943 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2944
2945 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2946 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2947 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2948 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2949 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2950 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2951 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2952 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2953 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2955
2956 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2957 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2958 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2959 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2960 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2961 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2962 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2963 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2964 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2965 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2966 };
2967
2968 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2969 // These are somewhat magic numbers, justified by comparing the
2970 // output of llvm-mca across our supported scheduler models and
2971 // taking the worst-case scenario.
2972 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2973 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2974 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2975 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2976 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2977 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2978 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2979 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2980 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2984
2985 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2986 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2987 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2988 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2989 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2990 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2991 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2992 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2998
2999 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3000 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3001 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3002 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3003 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3004 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3005 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3006 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3007 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3009
3010 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3011 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3012 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3013 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3014 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3015 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3016 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3017 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3018 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3020
3021 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3022 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3023 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3024 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3025 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3026 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3027 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3028 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3029 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3030 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3031 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3032 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3033
3034 // These truncates are really widening elements.
3035 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3036 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3037 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3038 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3039 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3040 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3041
3042 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3043 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3044 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3045 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3046 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3047 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3048 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3049 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3050 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3051 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3052 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3053 };
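// As an example of reading the SSE2 table above: the { ISD::SINT_TO_FP,
// MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } } entry charges 3 for reciprocal
// throughput (the worst case across the modelled CPUs) and 1 for the
// latency, code-size and size-and-latency kinds.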
3054
3055 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3056 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3057 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3058 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3059 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3060 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3061 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3062 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3063 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3064 };
3065
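// The lookups below try the most specific feature tables first (e.g.
// AVX512BW before AVX512DQ before AVX512F, and the 512-bit tables before
// their VL counterparts), so the first entry found reflects the cheapest
// sequence the subtarget is known to support.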
3066 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3067 EVT SrcTy = TLI->getValueType(DL, Src);
3068 EVT DstTy = TLI->getValueType(DL, Dst);
3069
3070 // The function getSimpleVT only handles simple value types.
3071 if (SrcTy.isSimple() && DstTy.isSimple()) {
3072 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3073 MVT SimpleDstTy = DstTy.getSimpleVT();
3074
3075 if (ST->useAVX512Regs()) {
3076 if (ST->hasBWI())
3077 if (const auto *Entry = ConvertCostTableLookup(
3078 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3079 if (auto KindCost = Entry->Cost[CostKind])
3080 return *KindCost;
3081
3082 if (ST->hasDQI())
3083 if (const auto *Entry = ConvertCostTableLookup(
3084 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3085 if (auto KindCost = Entry->Cost[CostKind])
3086 return *KindCost;
3087
3088 if (ST->hasAVX512())
3089 if (const auto *Entry = ConvertCostTableLookup(
3090 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3091 if (auto KindCost = Entry->Cost[CostKind])
3092 return *KindCost;
3093 }
3094
3095 if (ST->hasBWI())
3096 if (const auto *Entry = ConvertCostTableLookup(
3097 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3098 if (auto KindCost = Entry->Cost[CostKind])
3099 return *KindCost;
3100
3101 if (ST->hasDQI())
3102 if (const auto *Entry = ConvertCostTableLookup(
3103 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3104 if (auto KindCost = Entry->Cost[CostKind])
3105 return *KindCost;
3106
3107 if (ST->hasAVX512())
3108 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3109 SimpleDstTy, SimpleSrcTy))
3110 if (auto KindCost = Entry->Cost[CostKind])
3111 return *KindCost;
3112
3113 if (ST->hasAVX2()) {
3114 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3115 SimpleDstTy, SimpleSrcTy))
3116 if (auto KindCost = Entry->Cost[CostKind])
3117 return *KindCost;
3118 }
3119
3120 if (ST->hasAVX()) {
3121 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3122 SimpleDstTy, SimpleSrcTy))
3123 if (auto KindCost = Entry->Cost[CostKind])
3124 return *KindCost;
3125 }
3126
3127 if (ST->hasF16C()) {
3128 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3129 SimpleDstTy, SimpleSrcTy))
3130 if (auto KindCost = Entry->Cost[CostKind])
3131 return *KindCost;
3132 }
3133
3134 if (ST->hasSSE41()) {
3135 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3136 SimpleDstTy, SimpleSrcTy))
3137 if (auto KindCost = Entry->Cost[CostKind])
3138 return *KindCost;
3139 }
3140
3141 if (ST->hasSSE2()) {
3142 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3143 SimpleDstTy, SimpleSrcTy))
3144 if (auto KindCost = Entry->Cost[CostKind])
3145 return *KindCost;
3146 }
3147
3148 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3149 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3150 // fp16 conversions not covered by any table entries require a libcall.
3151 // Return a large (arbitrary) number to model this.
3152 return InstructionCost(64);
3153 }
3154 }
3155
3156 // Fall back to legalized types.
3157 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3158 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3159
3160 // If we're truncating to the same legalized type, just assume it's free.
3161 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3162 return TTI::TCC_Free;
3163
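// From here on, a table hit is scaled by std::max(LTSrc.first, LTDest.first):
// if legalization splits either type into N parts, the conversion sequence
// must be repeated roughly N times.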
3164 if (ST->useAVX512Regs()) {
3165 if (ST->hasBWI())
3166 if (const auto *Entry = ConvertCostTableLookup(
3167 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3168 if (auto KindCost = Entry->Cost[CostKind])
3169 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3170
3171 if (ST->hasDQI())
3172 if (const auto *Entry = ConvertCostTableLookup(
3173 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3174 if (auto KindCost = Entry->Cost[CostKind])
3175 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3176
3177 if (ST->hasAVX512())
3178 if (const auto *Entry = ConvertCostTableLookup(
3179 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3180 if (auto KindCost = Entry->Cost[CostKind])
3181 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3182 }
3183
3184 if (ST->hasBWI())
3185 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3186 LTDest.second, LTSrc.second))
3187 if (auto KindCost = Entry->Cost[CostKind])
3188 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3189
3190 if (ST->hasDQI())
3191 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3192 LTDest.second, LTSrc.second))
3193 if (auto KindCost = Entry->Cost[CostKind])
3194 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3195
3196 if (ST->hasAVX512())
3197 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3198 LTDest.second, LTSrc.second))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3201
3202 if (ST->hasAVX2())
3203 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3204 LTDest.second, LTSrc.second))
3205 if (auto KindCost = Entry->Cost[CostKind])
3206 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3207
3208 if (ST->hasAVX())
3209 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3210 LTDest.second, LTSrc.second))
3211 if (auto KindCost = Entry->Cost[CostKind])
3212 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3213
3214 if (ST->hasF16C()) {
3215 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3216 LTDest.second, LTSrc.second))
3217 if (auto KindCost = Entry->Cost[CostKind])
3218 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3219 }
3220
3221 if (ST->hasSSE41())
3222 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3223 LTDest.second, LTSrc.second))
3224 if (auto KindCost = Entry->Cost[CostKind])
3225 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3226
3227 if (ST->hasSSE2())
3228 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3229 LTDest.second, LTSrc.second))
3230 if (auto KindCost = Entry->Cost[CostKind])
3231 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3232
3233 // Fallback: i8/i16 sitofp/uitofp sources must first be extended to i32;
3234 // the conversion itself is then costed as an i32 sitofp.
3235 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3236 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3237 Type *ExtSrc = Src->getWithNewBitWidth(32);
3238 unsigned ExtOpc =
3239 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3240
3241 // For scalar loads the extend would be free.
3242 InstructionCost ExtCost = 0;
3243 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3244 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3245
3246 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3247 TTI::CastContextHint::None, CostKind);
3248 }
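// For the block above: e.g. uitofp v8i16 -> v8f32 is costed as a zext to
// v8i32 plus a sitofp v8i32 -> v8f32 (the zero-extended value is
// non-negative, so the signed conversion gives the same result).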
3249
3250 // Fallback for fptosi/fptoui to i8/i16: convert to i32 via fptosi and
3251 // truncate the result.
3252 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3253 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3254 Type *TruncDst = Dst->getWithNewBitWidth(32);
3255 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3256 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3257 TTI::CastContextHint::None, CostKind);
3258 }
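// For the block above: e.g. fptoui v4f32 -> v4i8 is costed as fptosi
// v4f32 -> v4i32 plus trunc v4i32 -> v4i8.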
3259
3260 // TODO: Allow non-throughput costs that aren't binary.
3261 auto AdjustCost = [&CostKind](InstructionCost Cost,
3262 InstructionCost N = 1) -> InstructionCost {
3263 if (CostKind != TTI::TCK_RecipThroughput)
3264 return Cost == 0 ? 0 : N;
3265 return Cost * N;
3266 };
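// i.e. for the non-throughput kinds the lambda above collapses any non-zero
// base cost to N (the number of legalized pieces), keeping those kinds
// binary until the TODO above is addressed.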
3267 return AdjustCost(
3268 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3269}
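// A minimal sketch of how a client might query this hook through the public
// TargetTransformInfo wrapper (illustrative only: `TTI`, `DstTy`, `SrcTy`
// and `Budget` are assumed to exist in the caller):
//
//   InstructionCost C = TTI.getCastInstrCost(
//       Instruction::SIToFP, DstTy, SrcTy,
//       TargetTransformInfo::CastContextHint::None,
//       TargetTransformInfo::TCK_RecipThroughput);
//   if (C.isValid() && C <= Budget)
//     ; // cheap enough to transform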
3270
3271InstructionCost X86TTIImpl::getCmpSelInstrCost(
3272 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3273 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3274 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3275 // Early out if this type isn't scalar/vector integer/float.
3276 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3277 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3278 Op1Info, Op2Info, I);
3279
3280 // Legalize the type.
3281 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3282
3283 MVT MTy = LT.second;
3284
3285 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3286 assert(ISD && "Invalid opcode");
3287
3288 InstructionCost ExtraCost = 0;
3289 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3290 // Some vector comparison predicates cost extra instructions.
3291 // TODO: Adjust ExtraCost based on CostKind?
3292 // TODO: Should we invert this and assume worst case cmp costs
3293 // and reduce for particular predicates?
3294 if (MTy.isVector() &&
3295 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3296 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3297 ST->hasBWI())) {
3298 // Fall back to I's predicate if a specific one wasn't provided.
3299 CmpInst::Predicate Pred = VecPred;
3300 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3301 Pred == CmpInst::BAD_FCMP_PREDICATE))
3302 Pred = cast<CmpInst>(I)->getPredicate();
3303
3304 bool CmpWithConstant = false;
3305 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3306 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3307
3308 switch (Pred) {
3309 case CmpInst::Predicate::ICMP_NE:
3310 // xor(cmpeq(x,y),-1)
3311 ExtraCost = CmpWithConstant ? 0 : 1;
3312 break;
3313 case CmpInst::Predicate::ICMP_SGE:
3314 case CmpInst::Predicate::ICMP_SLE:
3315 // xor(cmpgt(x,y),-1)
3316 ExtraCost = CmpWithConstant ? 0 : 1;
3317 break;
3318 case CmpInst::Predicate::ICMP_ULT:
3319 case CmpInst::Predicate::ICMP_UGT:
3320 // cmpgt(xor(x,signbit),xor(y,signbit))
3321 // xor(cmpeq(pmaxu(x,y),x),-1)
3322 ExtraCost = CmpWithConstant ? 1 : 2;
3323 break;
3324 case CmpInst::Predicate::ICMP_ULE:
3325 case CmpInst::Predicate::ICMP_UGE:
3326 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3327 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3328 // cmpeq(psubus(x,y),0)
3329 // cmpeq(pminu(x,y),x)
3330 ExtraCost = 1;
3331 } else {
3332 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3333 ExtraCost = CmpWithConstant ? 2 : 3;
3334 }
3335 break;
3336 case CmpInst::Predicate::FCMP_ONE:
3337 case CmpInst::Predicate::FCMP_UEQ:
3338 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3339 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3340 if (CondTy && !ST->hasAVX())
3341 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3342 CmpInst::Predicate::FCMP_UNO, CostKind,
3343 Op1Info, Op2Info) +
3344 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3345 CmpInst::Predicate::FCMP_OEQ, CostKind,
3346 Op1Info, Op2Info) +
3347 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3348
3349 break;
3350 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3351 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3352 // Assume worst case scenario and add the maximum extra cost.
3353 ExtraCost = 3;
3354 break;
3355 default:
3356 break;
3357 }
3358 }
3359 }
3360
3361 static const CostKindTblEntry SLMCostTbl[] = {
3362 // slm pcmpeq/pcmpgt throughput is 2
3363 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3364 // slm pblendvb/blendvpd/blendvps throughput is 4
3365 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3366 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3367 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3368 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3369 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3370 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3371 };
3372
3373 static const CostKindTblEntry AVX512BWCostTbl[] = {
3374 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3375 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3376 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3377 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3378
3379 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3380 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3381 };
3382
3383 static const CostKindTblEntry AVX512CostTbl[] = {
3384 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3385 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3386 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3387 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3388
3389 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3390 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3391 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3392 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3393 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3394 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3395 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3396
3397 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3398 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3399 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3400 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3401 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3402 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3403 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3404 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3405 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3406 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3407 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3408 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3409 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3410 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3411
3412 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3413 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3414 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3415 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3416 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3417 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3418 };
3419
3420 static const CostKindTblEntry AVX2CostTbl[] = {
3421 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3422 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3423 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3424 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3425 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3426 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3427
3428 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3429 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3430 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3431 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3432
3433 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3434 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3435 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3436 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3437 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3438 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3439 };
3440
3441 static const CostKindTblEntry XOPCostTbl[] = {
3442 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3443 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3444 };
3445
3446 static const CostKindTblEntry AVX1CostTbl[] = {
3447 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3448 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3449 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3450 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3451 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3452 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3453
3454 // AVX1 does not support 8-wide integer compare.
3455 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3456 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3457 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3458 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3459
3460 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3461 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3462 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3463 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3464 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3465 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3466 };
3467
3468 static const CostKindTblEntry SSE42CostTbl[] = {
3469 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3470 };
3471
3472 static const CostKindTblEntry SSE41CostTbl[] = {
3473 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3474 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3475
3476 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3477 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3478 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3479 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3480 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3481 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3482 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3483 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3484 };
3485
3486 static const CostKindTblEntry SSE2CostTbl[] = {
3487 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3488 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3489
3490 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3491 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3492 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3493 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3494
3495 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3496 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3497 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3498 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3499 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3500 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3501 };
3502
3503 static const CostKindTblEntry SSE1CostTbl[] = {
3504 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3505 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3506
3507 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3508 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3509 };
3510
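// As with the conversion tables, lookup runs from the most specific feature
// set to the least; on a hit the returned cost is LT.first * (ExtraCost +
// KindCost), so the predicate-expansion instructions counted above are paid
// once per legalized vector as well.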
3511 if (ST->useSLMArithCosts())
3512 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3513 if (auto KindCost = Entry->Cost[CostKind])
3514 return LT.first * (ExtraCost + *KindCost);
3515
3516 if (ST->hasBWI())
3517 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3518 if (auto KindCost = Entry->Cost[CostKind])
3519 return LT.first * (ExtraCost + *KindCost);
3520
3521 if (ST->hasAVX512())
3522 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3523 if (auto KindCost = Entry->Cost[CostKind])
3524 return LT.first * (ExtraCost + *KindCost);
3525
3526 if (ST->hasAVX2())
3527 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3528 if (auto KindCost = Entry->Cost[CostKind])
3529 return LT.first * (ExtraCost + *KindCost);
3530
3531 if (ST->hasXOP())
3532 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3533 if (auto KindCost = Entry->Cost[CostKind])
3534 return LT.first * (ExtraCost + *KindCost);
3535
3536 if (ST->hasAVX())
3537 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3538 if (auto KindCost = Entry->Cost[CostKind])
3539 return LT.first * (ExtraCost + *KindCost);
3540
3541 if (ST->hasSSE42())
3542 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3543 if (auto KindCost = Entry->Cost[CostKind])
3544 return LT.first * (ExtraCost + *KindCost);
3545
3546 if (ST->hasSSE41())
3547 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3548 if (auto KindCost = Entry->Cost[CostKind])
3549 return LT.first * (ExtraCost + *KindCost);
3550
3551 if (ST->hasSSE2())
3552 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3553 if (auto KindCost = Entry->Cost[CostKind])
3554 return LT.first * (ExtraCost + *KindCost);
3555
3556 if (ST->hasSSE1())
3557 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3558 if (auto KindCost = Entry->Cost[CostKind])
3559 return LT.first * (ExtraCost + *KindCost);
3560
3561 // Assume a 3cy latency for fp select ops.
3562 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3563 if (ValTy->getScalarType()->isFloatingPointTy())
3564 return 3;
3565
3566 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3567 Op1Info, Op2Info, I);
3568}
3569
3570InstructionCost
3571X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3572 TTI::TargetCostKind CostKind) {
3575 // Costs should match the codegen from:
3576 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3577 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3578 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3579 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3580 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
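// (As a rough sanity check, these numbers can be compared against the
// analysis output of "opt -passes='print<cost-model>' -disable-output" on
// the tests above, and against llvm-mca for the throughput/latency kinds.)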
3581
3582 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3583 // specialized in these tables yet.
3584 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3585 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3586 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3587 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3588 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3589 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3590 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3591 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3592 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3593 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3594 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3595 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3596 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3597 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3598 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3599 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3600 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3601 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3602 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3603 };
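// AVX512VBMI2 provides the VPSHLD/VPSHRD(V) concat-shift instructions,
// which is why the FSHL and 16-bit rotate rows above are single ops.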
3604 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3605 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3606 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3607 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3608 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3609 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3610 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3611 };
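// AVX512BITALG adds VPOPCNTB/VPOPCNTW, making the byte/word population
// counts above single instructions at every legal width.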
3612 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3613 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3614 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3615 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3616 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3617 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3618 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3619 };
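// Likewise, AVX512VPOPCNTDQ adds VPOPCNTD/VPOPCNTQ for the i32/i64
// element cases.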
3620 static const CostKindTblEntry AVX512CDCostTbl[] = {
3621 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3622 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3623 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3624 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3625 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3626 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3627 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3628 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3629 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3630 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3631 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3632 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3633
3634 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3635 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3636 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3637 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3638 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3639 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3640 };
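// AVX512CD's VPLZCNTD/VPLZCNTQ give single-op CTLZ for i32/i64 elements;
// the costlier i8/i16 rows above reflect widening to i32 and adjusting the
// result back down.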
3641 static const CostKindTblEntry AVX512BWCostTbl[] = {
3642 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3643 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3644 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3645 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3646 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3647 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3648 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3649 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3650 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3651 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3652 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3653 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3654 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3655 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3656 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3657 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3658 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3659 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3660 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3661 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3662 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3663 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3664 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3665 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3666 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3667 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3668 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3669 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3670 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3671 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3672 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3673 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3674 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3675 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3676 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3677 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3678 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3679 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3680 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3681 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3682 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3683 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3684 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3685 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3686 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3687 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3688 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3689 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3690 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3691 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3692 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3693 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3694 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3695 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3696 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3697 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3698 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3699 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3700 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3701 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3702 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3703 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3704 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3705 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3706 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3707 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3708 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3709 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3710 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3711 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3712 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3713 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3714 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3715 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3716 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3717 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3718 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3719 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3720 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3721 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3722 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3723 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3724 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3725 };
3726 static const CostKindTblEntry AVX512CostTbl[] = {
3727 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3728 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3729 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3730 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3731 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3732 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3733 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3734 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3735 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3736 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3737 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3738 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3739 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3740 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3741 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3742 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3743 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3744 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3745 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3746 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3747 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3748 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3749 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3750 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3751 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3752 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3753 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3754 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3755 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3756 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3757 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3758 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3759 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3760 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3761 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3762 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3763 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3764 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3765 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3766 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3767 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3768 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3769 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3770 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3771 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3772 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3773 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3774 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3775 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3776 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3777 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3778 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3779 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3780 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3781 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3782 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3783 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3784 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3785 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3786 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3787 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3788 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3789 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3790 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3791 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3792 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3793 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3794 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3795 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3796 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3797 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3798 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3799 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3800 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3801 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3802 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3803 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3804 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3805 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3806 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3807 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3808 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3809 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3810 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3811 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3812 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3813 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3814 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3815 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3816 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3817 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3818 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3819 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3820 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3821 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3822 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3823 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3824 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3825 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3826 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3827 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3828 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3829 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3830 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3831 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3832 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3833 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3834 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3835 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3836 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3837 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3838 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3839 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3840 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3841 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3842 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3843 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3844 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3845 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3846 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3847 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3848 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3849 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3850 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3851 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3852 };
3853 static const CostKindTblEntry XOPCostTbl[] = {
3854 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3855 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3856 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3857 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3858 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3859 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3860 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3861 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3862 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3863 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3864 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3865 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3866 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3867 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3868 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3869 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3870 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3871 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3872 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3873 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3874 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3875 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3876 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3877 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3878 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3879 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3880 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3881 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3882 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3883 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3884 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3885 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3886 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3887 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3888 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3889 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3890 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3891 };
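// As the note above says, XOP has no native rotate-right: a variable ROTR
// is a VPROT fed by a negated amount (subtract from zero), which is why the
// variable ROTR rows cost a little more than their ROTL counterparts.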
3892 static const CostKindTblEntry AVX2CostTbl[] = {
3893 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3894 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3895 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3896 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3897 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3898 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3899 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3900 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3901 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3902 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3903 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3904 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3905 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3906 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3907 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3908 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3909 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3910 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3911 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3912 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3913 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3914 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3915 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3916 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3917 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3918 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3919 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3920 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3921 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3922 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3923 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3924 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3925 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3926 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3927 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3928 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3929 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3930 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3931 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3932 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3933 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3934 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3935 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3936 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3937 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3938 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3939 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3940 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3941 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3942 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3943 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3944 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3945 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3946 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3947 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3948 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3949 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3950 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3951 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3952 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3953 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3954 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3955 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3956 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3957 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3958 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3959 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3960 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3961 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3962 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3963 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3964 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3965 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3966 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3967 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3968 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3969 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3970 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3971 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3972 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3973 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3974 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3975 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3976 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3977 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3978 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3979 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3980 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3981 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3982 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3983 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3984 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
3985 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
3986 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
3987 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
3988 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
3989 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
3990 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
3991 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
3992 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
3993 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
3994 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
3995 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3996 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3997 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3998 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3999 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4000 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4001 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4002 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4003 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4004 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4005 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4006 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4007 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4008 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4009 };
4010 static const CostKindTblEntry AVX1CostTbl[] = {
4011 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4012 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4013 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4014 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4015 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4016 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4017 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4018 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4019 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4020 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4021 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4022 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4023 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4024 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4025 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4026 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4027 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4028 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4029 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4030 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4031 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4032 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4033 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4034 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4035 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4036 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4037 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4038 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4039 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4040 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4041 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4042 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4043 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4044 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4045 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4046 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4047 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4048 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4049 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4050 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4051 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4052 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4053 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4054 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4055 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4056 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4057 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4058 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4059 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4060 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4061 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4062 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4063 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4064 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4065 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4066 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4067 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4068 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4069 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4070 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4071 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4072 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4073 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4074 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4075 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4076 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4077 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4079 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4081 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4082 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4083 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4084 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4088 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4089 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4090 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4091 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4093 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4095 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4097 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4098 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4099 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4100 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4101 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4102 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4103 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4104 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4105 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4109 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4111 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4112 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4113 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4114 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4115 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4116 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4117 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4118 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4119 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4120 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4121 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4122 };
4123 static const CostKindTblEntry GFNICostTbl[] = {
4124 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4125 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4126 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4127 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4128 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4129 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4130 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4131 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4132 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4133 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4134 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4135 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4136 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4137 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4138 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4139 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4140 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4141 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4142 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4143 };
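// GF2P8AFFINEQB with a constant bit-reversal matrix reverses the bits of
// every byte in one instruction; the wider-element BITREVERSE rows only add
// the byte-order shuffle needed to complete a full-element reversal.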
4144 static const CostKindTblEntry GLMCostTbl[] = {
4145 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4146 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4147 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4148 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4149 };
4150 static const CostKindTblEntry SLMCostTbl[] = {
4151 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4152 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4153 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4154 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4155 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4156 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4157 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4158 };
4159 static const CostKindTblEntry SSE42CostTbl[] = {
4160 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4161 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4162 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4163 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4164 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4165 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4166 };
4167 static const CostKindTblEntry SSE41CostTbl[] = {
4168 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4169 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4170 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4171 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4172 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4173 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4174 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4175 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4176 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4177 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4178 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4179 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4180 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4181 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4182 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4183 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4184 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4185 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4186 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4187 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4188 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4189 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4190 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4191 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4192 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4193 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4194 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4195 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4196 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4197 };
4198 static const CostKindTblEntry SSSE3CostTbl[] = {
4199 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4200 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4201 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4202 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4203 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4204 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4205 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4206 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4207 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4208 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4209 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4210 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4211 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4212 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4213 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4214 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4215 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4216 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4217 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4218 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4219 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4220 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4221 };
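// The SSSE3 rows model the usual PSHUFB nibble-LUT expansion: split each
// byte into low/high nibbles, look both up in an in-register table, then
// combine; this is the standard vector CTPOP/BITREVERSE lowering before
// newer ISAs.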
4222 static const CostKindTblEntry SSE2CostTbl[] = {
4223 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4224 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4225 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4226 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4227 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4228 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4229 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4230 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4231 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4232 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4233 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4234 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4235 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4236 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4237 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4238 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4239 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4240 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4241 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4242 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4243 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4244 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4245 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4246 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4247 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4248 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4249 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4250 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4251 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4252 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4253 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4254 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4255 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4256 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4257 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4258 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4259 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4260 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4261 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4262 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4263 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4264 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4265 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4266 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4267 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4268 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4269 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4270 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4271 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4272 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4273 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4274 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4275 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4276 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4277 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4278 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4279 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4280 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4281 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4282 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4283 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4284 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4285 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4286 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4287 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4288 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4289 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4290 };
4291 static const CostKindTblEntry SSE1CostTbl[] = {
4292 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4293 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4294 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4295 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4296 };
4297 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4298 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4299 };
4300 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4301 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4302 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4303 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4304 };
4305 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4306 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4307 };
4308 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4309 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4310 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4311 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4312 };
4313 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4314 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4315 };
4316 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4317 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4318 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4319 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4320 };
4321 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4322 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4323 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4324 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4325 { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4326 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4327 { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
4328 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4329 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4330 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4331 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4332 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4333 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4334 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4335 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4336 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4337 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4338 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4339 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4340 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4341 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4342 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4343 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4344 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4345 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4346 };
4347 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4348 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4349 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4350 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4351 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4352 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4353 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4354 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4355 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4356 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4357 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4358 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4359 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4360 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4361 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4362 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4363 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4364 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4365 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4366 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4367 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4368 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4369 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4370 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4371 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4372 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4373 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4374 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4375 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4376 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4377 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4378 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4379 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4380 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4381 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4382 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4383 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4384 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4385 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4386 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4387 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4388 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4389 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4390 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4391 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4392 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4393 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4394 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4395 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4396 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4397 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4398 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4399 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4400 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4401 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4402 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4403 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4404 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4405 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4406 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4407 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4408 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4409 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4410 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4411 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4412 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4413 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4414 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4415 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4416 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4417 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4418 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4419 };
4420
4421 Type *RetTy = ICA.getReturnType();
4422 Type *OpTy = RetTy;
4423 Intrinsic::ID IID = ICA.getID();
4424 unsigned ISD = ISD::DELETED_NODE;
4425 switch (IID) {
4426 default:
4427 break;
4428 case Intrinsic::abs:
4429 ISD = ISD::ABS;
4430 break;
4431 case Intrinsic::bitreverse:
4432 ISD = ISD::BITREVERSE;
4433 break;
4434 case Intrinsic::bswap:
4435 ISD = ISD::BSWAP;
4436 break;
4437 case Intrinsic::ctlz:
4438 ISD = ISD::CTLZ;
4439 break;
4440 case Intrinsic::ctpop:
4441 ISD = ISD::CTPOP;
4442 break;
4443 case Intrinsic::cttz:
4444 ISD = ISD::CTTZ;
4445 break;
4446 case Intrinsic::fshl:
4447 ISD = ISD::FSHL;
4448 if (!ICA.isTypeBasedOnly()) {
4449 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4450 if (Args[0] == Args[1]) {
4451 ISD = ISD::ROTL;
4452 // Handle uniform constant rotation amounts.
4453 // TODO: Handle funnel-shift cases.
4454 const APInt *Amt;
4455 if (Args[2] &&
4456 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4457 ISD = X86ISD::VROTLI;
4458 }
4459 }
4460 break;
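// For reference, the rotate special case above fires for IR such as
// %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %amt)
// where both data operands match; a uniform constant %amt upgrades the
// match from ISD::ROTL to the immediate form X86ISD::VROTLI.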
4461 case Intrinsic::fshr:
4462 // FSHR has the same costs, so don't duplicate.
4463 ISD = ISD::FSHL;
4464 if (!ICA.isTypeBasedOnly()) {
4465 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4466 if (Args[0] == Args[1]) {
4467 ISD = ISD::ROTR;
4468 // Handle uniform constant rotation amount.
4469 // TODO: Handle funnel-shift cases.
4470 const APInt *Amt;
4471 if (Args[2] &&
4472 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4473 ISD = X86ISD::VROTLI;
4474 }
4475 }
4476 break;
4477 case Intrinsic::lrint:
4478 case Intrinsic::llrint:
4479 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4480 // have the same costs as the CVTTP2SI (fptosi) instructions.
4481 if (!ICA.isTypeBasedOnly()) {
4482 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4483 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4484 TTI::CastContextHint::None, CostKind);
4485 }
4486 break;
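// e.g. %i = call i64 @llvm.lrint.i64.f64(double %d) is priced like the
// equivalent fptosi cast, since both lower to the same family of CVT
// instructions.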
4487 case Intrinsic::maxnum:
4488 case Intrinsic::minnum:
4489 // FMINNUM has the same costs, so don't duplicate.
4490 ISD = ISD::FMAXNUM;
4491 break;
4492 case Intrinsic::sadd_sat:
4493 ISD = ISD::SADDSAT;
4494 break;
4495 case Intrinsic::smax:
4496 ISD = ISD::SMAX;
4497 break;
4498 case Intrinsic::smin:
4499 ISD = ISD::SMIN;
4500 break;
4501 case Intrinsic::ssub_sat:
4502 ISD = ISD::SSUBSAT;
4503 break;
4504 case Intrinsic::uadd_sat:
4505 ISD = ISD::UADDSAT;
4506 break;
4507 case Intrinsic::umax:
4508 ISD = ISD::UMAX;
4509 break;
4510 case Intrinsic::umin:
4511 ISD = ISD::UMIN;
4512 break;
4513 case Intrinsic::usub_sat:
4514 ISD = ISD::USUBSAT;
4515 break;
4516 case Intrinsic::sqrt:
4517 ISD = ISD::FSQRT;
4518 break;
4519 case Intrinsic::sadd_with_overflow:
4520 case Intrinsic::ssub_with_overflow:
4521 // SSUBO has the same costs, so don't duplicate.
4522 ISD = ISD::SADDO;
4523 OpTy = RetTy->getContainedType(0);
4524 break;
4525 case Intrinsic::uadd_with_overflow:
4526 case Intrinsic::usub_with_overflow:
4527 // USUBO has the same costs, so don't duplicate.
4528 ISD = ISD::UADDO;
4529 OpTy = RetTy->getContainedType(0);
4530 break;
4531 case Intrinsic::smul_with_overflow:
4532 ISD = ISD::SMULO;
4533 OpTy = RetTy->getContainedType(0);
4534 break;
4535 case Intrinsic::umul_with_overflow:
4536 ISD = ISD::UMULO;
4537 OpTy = RetTy->getContainedType(0);
4538 break;
4539 }
4540
4541 if (ISD != ISD::DELETED_NODE) {
4542 auto adjustTableCost = [&](int ISD, unsigned Cost,
4543 std::pair<InstructionCost, MVT> LT,
4544 FastMathFlags FMF) {
4545 InstructionCost LegalizationCost = LT.first;
4546 MVT MTy = LT.second;
4547
4548 // If there are no NaNs to deal with, then these are reduced to a
4549 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4550 // assume is used in the non-fast case.
4551 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4552 if (FMF.noNaNs())
4553 return LegalizationCost * 1;
4554 }
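// e.g. "%m = call nnan float @llvm.maxnum.f32(float %a, float %b)" can be
// a bare MAXSS, so only the legalization count is charged.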
4555
4556 // For cases where some ops can be folded into a load/store, assume free.
4557 if (MTy.isScalarInteger()) {
4558 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4559 if (const Instruction *II = ICA.getInst()) {
4560 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4561 return TTI::TCC_Free;
4562 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4563 if (LI->hasOneUse())
4564 return TTI::TCC_Free;
4565 }
4566 }
4567 }
4568 }
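// This catches patterns such as a BSWAP feeding a single store, or
// %v = load i32, ptr %p ; %b = call i32 @llvm.bswap.i32(i32 %v)
// where a fast-MOVBE target folds the whole sequence into one MOVBE.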
4569
4570 return LegalizationCost * (int)Cost;
4571 };
4572
4573 // Legalize the type.
4574 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4575 MVT MTy = LT.second;
4576
4577 // Without BMI/LZCNT, see if we're only looking for a *_ZERO_UNDEF cost.
4578 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4579 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4580 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4581 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4582 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4583 if (Cst->isAllOnesValue())
4584 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4585 }
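// i.e. "call i32 @llvm.cttz.i32(i32 %x, i1 true)" promises %x is non-zero,
// so the cheaper *_ZERO_UNDEF (bare BSF/BSR) rows apply even without
// BMI/LZCNT.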
4586
4587 // FSQRT is a single instruction.
4588 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4589 return LT.first;
4590
4591 if (ST->useGLMDivSqrtCosts())
4592 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4593 if (auto KindCost = Entry->Cost[CostKind])
4594 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4595
4596 if (ST->useSLMArithCosts())
4597 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4598 if (auto KindCost = Entry->Cost[CostKind])
4599 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4600
4601 if (ST->hasVBMI2())
4602 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4603 if (auto KindCost = Entry->Cost[CostKind])
4604 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4605
4606 if (ST->hasBITALG())
4607 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4608 if (auto KindCost = Entry->Cost[CostKind])
4609 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4610
4611 if (ST->hasVPOPCNTDQ())
4612 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4613 if (auto KindCost = Entry->Cost[CostKind])
4614 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4615
4616 if (ST->hasGFNI())
4617 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4618 if (auto KindCost = Entry->Cost[CostKind])
4619 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4620
4621 if (ST->hasCDI())
4622 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4623 if (auto KindCost = Entry->Cost[CostKind])
4624 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4625
4626 if (ST->hasBWI())
4627 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4628 if (auto KindCost = Entry->Cost[CostKind])
4629 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4630
4631 if (ST->hasAVX512())
4632 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4633 if (auto KindCost = Entry->Cost[CostKind])
4634 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4635
4636 if (ST->hasXOP())
4637 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4638 if (auto KindCost = Entry->Cost[CostKind])
4639 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4640
4641 if (ST->hasAVX2())
4642 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4643 if (auto KindCost = Entry->Cost[CostKind])
4644 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4645
4646 if (ST->hasAVX())
4647 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4648 if (auto KindCost = Entry->Cost[CostKind])
4649 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4650
4651 if (ST->hasSSE42())
4652 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4653 if (auto KindCost = Entry->Cost[CostKind])
4654 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4655
4656 if (ST->hasSSE41())
4657 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4658 if (auto KindCost = Entry->Cost[CostKind])
4659 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4660
4661 if (ST->hasSSSE3())
4662 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4663 if (auto KindCost = Entry->Cost[CostKind])
4664 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4665
4666 if (ST->hasSSE2())
4667 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4668 if (auto KindCost = Entry->Cost[CostKind])
4669 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4670
4671 if (ST->hasSSE1())
4672 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4673 if (auto KindCost = Entry->Cost[CostKind])
4674 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4675
4676 if (ST->hasBMI()) {
4677 if (ST->is64Bit())
4678 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4679 if (auto KindCost = Entry->Cost[CostKind])
4680 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4681
4682 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4683 if (auto KindCost = Entry->Cost[CostKind])
4684 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4685 }
4686
4687 if (ST->hasLZCNT()) {
4688 if (ST->is64Bit())
4689 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4690 if (auto KindCost = Entry->Cost[CostKind])
4691 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4692
4693 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4694 if (auto KindCost = Entry->Cost[CostKind])
4695 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4696 }
4697
4698 if (ST->hasPOPCNT()) {
4699 if (ST->is64Bit())
4700 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4701 if (auto KindCost = Entry->Cost[CostKind])
4702 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4703
4704 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4705 if (auto KindCost = Entry->Cost[CostKind])
4706 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4707 }
4708
4709 if (ST->is64Bit())
4710 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4711 if (auto KindCost = Entry->Cost[CostKind])
4712 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4713
4714 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4715 if (auto KindCost = Entry->Cost[CostKind])
4716 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4717 }
4718
4719 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4720}
4721
4722InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4723 TTI::TargetCostKind CostKind,
4724 unsigned Index, Value *Op0,
4725 Value *Op1) {
4726 static const CostTblEntry SLMCostTbl[] = {
4727 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4728 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4729 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4730 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4731 };
4732
4733 assert(Val->isVectorTy() && "This must be a vector type");
4734 Type *ScalarType = Val->getScalarType();
4735 InstructionCost RegisterFileMoveCost = 0;
4736
4737 // Non-immediate extraction/insertion can be handled as a sequence of
4738 // aliased loads+stores via the stack.
4739 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4740 Opcode == Instruction::InsertElement)) {
4741 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4742 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4743
4744 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4745 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4746 Align VecAlign = DL.getPrefTypeAlign(Val);
4747 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4748
4749 // Extract - store vector to stack, load scalar.
4750 if (Opcode == Instruction::ExtractElement) {
4751 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4752 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4753 CostKind);
4754 }
4755 // Insert - store vector to stack, store scalar, load vector.
4756 if (Opcode == Instruction::InsertElement) {
4757 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4758 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4759 CostKind) +
4760 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4761 }
4762 }
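 // Illustrative sketch (not part of the upstream source): on x86-64 a
 // variable-index extract from an assumed v4f32 typically expands to
 // something like:
 //   movaps %xmm0, -24(%rsp)        # store the vector to a stack slot
 //   andl   $3, %edi                # clamp the index
 //   movss  -24(%rsp,%rdi,4), %xmm0 # reload the selected scalar
 // which matches the "vector store + scalar load" cost computed above.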
4763
4764 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4765 Opcode == Instruction::InsertElement)) {
4766 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4767 if (Opcode == Instruction::ExtractElement &&
4768 ScalarType->getScalarSizeInBits() == 1 &&
4769 cast<FixedVectorType>(Val)->getNumElements() > 1)
4770 return 1;
4771
4772 // Legalize the type.
4773 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4774
4775 // This type is legalized to a scalar type.
4776 if (!LT.second.isVector())
4777 return TTI::TCC_Free;
4778
4779 // The type may be split. Normalize the index to the new type.
4780 unsigned SizeInBits = LT.second.getSizeInBits();
4781 unsigned NumElts = LT.second.getVectorNumElements();
4782 unsigned SubNumElts = NumElts;
4783 Index = Index % NumElts;
4784
4785 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4786 // For inserts, we also need to insert the subvector back.
4787 if (SizeInBits > 128) {
4788 assert((SizeInBits % 128) == 0 && "Illegal vector");
4789 unsigned NumSubVecs = SizeInBits / 128;
4790 SubNumElts = NumElts / NumSubVecs;
4791 if (SubNumElts <= Index) {
4792 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4793 Index %= SubNumElts;
4794 }
4795 }
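 // E.g. inserting into element 5 of a v8i32 when the legal type is 256-bit:
 // NumSubVecs == 2 and SubNumElts == 4, so index 5 lives in the upper lane;
 // RegisterFileMoveCost picks up the subvector extract/insert moves and the
 // index is renormalized to 5 % 4 == 1 within that lane.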
4796
4797 MVT MScalarTy = LT.second.getScalarType();
4798 auto IsCheapPInsrPExtrInsertPS = [&]() {
4799 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4800 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4801 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4802 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4803 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4804 Opcode == Instruction::InsertElement);
4805 };
4806
4807 if (Index == 0) {
4808 // Floating point scalars are already located in index #0.
4809 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4810 // this holds for all of them.
4811 if (ScalarType->isFloatingPointTy() &&
4812 (Opcode != Instruction::InsertElement || !Op0 ||
4813 isa<UndefValue>(Op0)))
4814 return RegisterFileMoveCost;
4815
4816 if (Opcode == Instruction::InsertElement &&
4817 isa_and_nonnull<UndefValue>(Op0)) {
4818 // Consider the gather cost to be cheap.
4819 if (isa_and_nonnull<LoadInst>(Op1))
4820 return RegisterFileMoveCost;
4821 if (!IsCheapPInsrPExtrInsertPS()) {
4822 // mov constant-to-GPR + movd/movq GPR -> XMM.
4823 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4824 return 2 + RegisterFileMoveCost;
4825 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4826 return 1 + RegisterFileMoveCost;
4827 }
4828 }
4829
4830 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4831 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4832 return 1 + RegisterFileMoveCost;
4833 }
4834
4835 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4836 assert(ISD && "Unexpected vector opcode");
4837 if (ST->useSLMArithCosts())
4838 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4839 return Entry->Cost + RegisterFileMoveCost;
4840
4841 // Consider cheap cases.
4842 if (IsCheapPInsrPExtrInsertPS())
4843 return 1 + RegisterFileMoveCost;
4844
4845 // For extractions we just need to shuffle the element to index 0, which
4846 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4847 // the elements to its destination. In both cases we must handle the
4848 // subvector move(s).
4849 // If the vector type is already less than 128-bits then don't reduce it.
4850 // TODO: Under what circumstances should we shuffle using the full width?
4851 InstructionCost ShuffleCost = 1;
4852 if (Opcode == Instruction::InsertElement) {
4853 auto *SubTy = cast<VectorType>(Val);
4854 EVT VT = TLI->getValueType(DL, Val);
4855 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4856 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4857 ShuffleCost =
4858 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4859 }
4860 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4861 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4862 }
4863
4864 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4865 RegisterFileMoveCost;
4866}
4867
4868InstructionCost X86TTIImpl::getScalarizationOverhead(
4869 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4870 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
4871 assert(DemandedElts.getBitWidth() ==
4872 cast<FixedVectorType>(Ty)->getNumElements() &&
4873 "Vector size mismatch");
4874
4875 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4876 MVT MScalarTy = LT.second.getScalarType();
4877 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4878 InstructionCost Cost = 0;
4879
4880 constexpr unsigned LaneBitWidth = 128;
4881 assert((LegalVectorBitWidth < LaneBitWidth ||
4882 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4883 "Illegal vector");
4884
4885 const int NumLegalVectors = *LT.first.getValue();
4886 assert(NumLegalVectors >= 0 && "Negative cost!");
4887
4888 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
4889 // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4890 if (Insert) {
4891 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4892 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4893 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4894 // For types we can insert directly, insertion into 128-bit subvectors is
4895 // cheap, followed by a cheap chain of concatenations.
4896 if (LegalVectorBitWidth <= LaneBitWidth) {
4897 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4898 /*Extract*/ false, CostKind);
4899 } else {
4900 // In each 128-bit lane, if at least one index is demanded but not all
4901 // indices are demanded, and this lane is not the first 128-bit lane of
4902 // the legalized vector, then this lane needs an extracti128; if a
4903 // 128-bit lane has at least one demanded index, it also needs an
4904 // inserti128.
4905
4906 // The following cases will help you build a better understanding:
4907 // Assume we insert several elements into a v8i32 vector in avx2,
4908 // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4909 // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
4910 // inserti128.
4911 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4912 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4913 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4914 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4915 unsigned NumLegalElts =
4916 LT.second.getVectorNumElements() * NumLegalVectors;
4917 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4918 "Vector has been legalized to smaller element count");
4919 assert((NumLegalElts % NumLanesTotal) == 0 &&
4920 "Unexpected elts per lane");
4921 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4922
4923 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4924 auto *LaneTy =
4925 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4926
4927 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4928 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4929 NumEltsPerLane, NumEltsPerLane * I);
4930 if (LaneEltMask.isZero())
4931 continue;
4932 // FIXME: we don't need to extract if all non-demanded elements
4933 // are legalization-inserted padding.
4934 if (!LaneEltMask.isAllOnes())
4935 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4936 I * NumEltsPerLane, LaneTy);
4937 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4938 /*Extract*/ false, CostKind);
4939 }
4940
4941 APInt AffectedLanes =
4942 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4943 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4944 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4945 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4946 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4947 unsigned I = NumLegalLanes * LegalVec + Lane;
4948 // No need to insert unaffected lane; or lane 0 of each legal vector
4949 // iff ALL lanes of that vector were affected and will be inserted.
4950 if (!AffectedLanes[I] ||
4951 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4952 continue;
4953 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
4954 I * NumEltsPerLane, LaneTy);
4955 }
4956 }
4957 }
4958 } else if (LT.second.isVector()) {
4959 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4960 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4961 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4962 // considered cheap.
4963 if (Ty->isIntOrIntVectorTy())
4964 Cost += DemandedElts.popcount();
4965
4966 // Get the smaller of the legalized or original pow2-extended number of
4967 // vector elements, which represents the number of unpacks we'll end up
4968 // performing.
4969 unsigned NumElts = LT.second.getVectorNumElements();
4970 unsigned Pow2Elts =
4971 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4972 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4973 }
4974 }
4975
4976 if (Extract) {
4977 // vXi1 can be efficiently extracted with MOVMSK.
4978 // TODO: AVX512 predicate mask handling.
4979 // NOTE: This doesn't work well for roundtrip scalarization.
4980 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4981 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4982 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4983 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4984 return MOVMSKCost;
4985 }
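 // E.g. extracting a v32i1 on an SSE2-only target: MaxElts == 16, so
 // (32 + 16 - 1) / 16 == 2 PMOVMSKB-style ops are costed, while AVX2's
 // 32-byte VPMOVMSKB brings this down to a single op.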
4986
4987 if (LT.second.isVector()) {
4988 unsigned NumLegalElts =
4989 LT.second.getVectorNumElements() * NumLegalVectors;
4990 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4991 "Vector has been legalized to smaller element count");
4992
4993 // If we're extracting elements from a 128-bit subvector lane,
4994 // we only need to extract each lane once, not for every element.
4995 if (LegalVectorBitWidth > LaneBitWidth) {
4996 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4997 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4998 assert((NumLegalElts % NumLanesTotal) == 0 &&
4999 "Unexpected elts per lane");
5000 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5001
5002 // Add cost for each demanded 128-bit subvector extraction.
5003 // Luckily this is a lot easier than for insertion.
5004 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5005 auto *LaneTy =
5006 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5007
5008 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5009 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5010 NumEltsPerLane, I * NumEltsPerLane);
5011 if (LaneEltMask.isZero())
5012 continue;
5013 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5014 I * NumEltsPerLane, LaneTy);
5015 Cost += BaseT::getScalarizationOverhead(
5016 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5017 }
5018
5019 return Cost;
5020 }
5021 }
5022
5023 // Fallback to default extraction.
5024 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5025 Extract, CostKind);
5026 }
5027
5028 return Cost;
5029}
5030
5031InstructionCost
5032X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5033 int VF, const APInt &DemandedDstElts,
5034 TTI::TargetCostKind CostKind) {
5035 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5036 // We don't differentiate element types here, only element bit width.
5037 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5038
5039 auto bailout = [&]() {
5040 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5041 DemandedDstElts, CostKind);
5042 };
5043
5044 // For now, only deal with AVX512 cases.
5045 if (!ST->hasAVX512())
5046 return bailout();
5047
5048 // Do we have a native shuffle for this element type, or should we promote?
5049 unsigned PromEltTyBits = EltTyBits;
5050 switch (EltTyBits) {
5051 case 32:
5052 case 64:
5053 break; // AVX512F.
5054 case 16:
5055 if (!ST->hasBWI())
5056 PromEltTyBits = 32; // promote to i32, AVX512F.
5057 break; // AVX512BW
5058 case 8:
5059 if (!ST->hasVBMI())
5060 PromEltTyBits = 32; // promote to i32, AVX512F.
5061 break; // AVX512VBMI
5062 case 1:
5063 // There is no support for shuffling i1 elements. We *must* promote.
5064 if (ST->hasBWI()) {
5065 if (ST->hasVBMI())
5066 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5067 else
5068 PromEltTyBits = 16; // promote to i16, AVX512BW.
5069 break;
5070 }
5071 PromEltTyBits = 32; // promote to i32, AVX512F.
5072 break;
5073 default:
5074 return bailout();
5075 }
5076 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5077
5078 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5079 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5080
5081 int NumDstElements = VF * ReplicationFactor;
5082 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5083 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5084
5085 // Legalize the types.
5086 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5087 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5088 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5089 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5090 // They should have legalized into vector types.
5091 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5092 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5093 return bailout();
5094
5095 if (PromEltTyBits != EltTyBits) {
5096 // If we have to perform the shuffle with a wider elt type than our data
5097 // type, then we will first need to anyext (we don't care about the new
5098 // bits) the source elements, and then truncate the Dst elements.
5099 InstructionCost PromotionCost;
5100 PromotionCost += getCastInstrCost(
5101 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5102 TargetTransformInfo::CastContextHint::None, CostKind);
5103 PromotionCost +=
5104 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5105 /*Src=*/PromDstVecTy,
5106 TargetTransformInfo::CastContextHint::None, CostKind);
5107 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5108 ReplicationFactor, VF,
5109 DemandedDstElts, CostKind);
5110 }
5111
5112 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5113 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5114 "We expect that the legalization doesn't affect the element width, "
5115 "doesn't coalesce/split elements.");
5116
5117 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5118 unsigned NumDstVectors =
5119 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5120
5121 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5122
5123 // Not all the produced Dst elements may be demanded. In our case,
5124 // given that a single Dst vector is formed by a single shuffle,
5125 // if none of the elements that would form a single Dst vector are
5126 // demanded, then we won't need that shuffle, so adjust the cost accordingly.
5127 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5128 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5129 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5130
5131 InstructionCost SingleShuffleCost = getShuffleCost(
5132 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5133 /*Index=*/0, /*SubTp=*/nullptr);
5134 return NumDstVectorsDemanded * SingleShuffleCost;
5135}
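 // For illustration: replicating a v4i32 with ReplicationFactor == 2 is a
 // single-source shuffle producing <0,0,1,1,2,2,3,3>. If the destination
 // legalizes to two vectors but only one of them has demanded elements,
 // DemandedDstVectors has a single set bit and only one shuffle is costed.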
5136
5137InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5138 MaybeAlign Alignment,
5139 unsigned AddressSpace,
5140 TTI::TargetCostKind CostKind,
5141 TTI::OperandValueInfo OpInfo,
5142 const Instruction *I) {
5143 // TODO: Handle other cost kinds.
5144 if (CostKind != TTI::TCK_RecipThroughput) {
5145 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5146 // A store instruction with index and scale addressing costs 2 uops.
5147 // Check the preceding GEP to identify non-constant indices.
5148 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5149 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5150 return TTI::TCC_Basic * 2;
5151 }
5152 }
5153 return TTI::TCC_Basic;
5154 }
5155
5156 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5157 "Invalid Opcode");
5158 // Type legalization can't handle structs
5159 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5160 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5161 CostKind, OpInfo, I);
5162
5163 // Legalize the type.
5164 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5165
5166 auto *VTy = dyn_cast<FixedVectorType>(Src);
5167
5168 InstructionCost Cost = 0;
5169
5170 // Add a cost for constant load to vector.
5171 if (Opcode == Instruction::Store && OpInfo.isConstant())
5172 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5173 /*AddressSpace=*/0, CostKind, OpInfo);
5174
5175 // Handle the simple case of non-vectors.
5176 // NOTE: this assumes that legalization never creates a vector from scalars!
5177 if (!VTy || !LT.second.isVector()) {
5178 // Each load/store unit costs 1.
5179 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5180 }
5181
5182 bool IsLoad = Opcode == Instruction::Load;
5183
5184 Type *EltTy = VTy->getElementType();
5185
5186 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5187
5188 // Source of truth: how many elements were there in the original IR vector?
5189 const unsigned SrcNumElt = VTy->getNumElements();
5190
5191 // How far have we gotten?
5192 int NumEltRemaining = SrcNumElt;
5193 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5194 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5195
5196 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5197
5198 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5199 const unsigned XMMBits = 128;
5200 if (XMMBits % EltTyBits != 0)
5201 // Vector size must be a multiple of the element size. I.e. no padding.
5202 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5203 CostKind, OpInfo, I);
5204 const int NumEltPerXMM = XMMBits / EltTyBits;
5205
5206 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5207
5208 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5209 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5210 // How many elements would a single op deal with at once?
5211 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5212 // Vector size must be a multiple of the element size. I.e. no padding.
5213 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5214 CostKind, OpInfo, I);
5215 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5216
5217 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5218 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5219 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5220 "Unless we haven't halved the op size yet, "
5221 "we have less than two op's sized units of work left.");
5222
5223 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5224 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5225 : XMMVecTy;
5226
5227 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5228 "After halving sizes, the vector elt count is no longer a multiple "
5229 "of number of elements per operation?");
5230 auto *CoalescedVecTy =
5231 CurrNumEltPerOp == 1
5232 ? CurrVecTy
5233 : FixedVectorType::get(
5234 IntegerType::get(Src->getContext(),
5235 EltTyBits * CurrNumEltPerOp),
5236 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5237 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5238 DL.getTypeSizeInBits(CurrVecTy) &&
5239 "coalescing elements doesn't change vector width.");
5240
5241 while (NumEltRemaining > 0) {
5242 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5243
5244 // Can we use this vector size, as per the remaining element count?
5245 // Iff the vector is naturally aligned, we can do a wide load regardless.
5246 if (NumEltRemaining < CurrNumEltPerOp &&
5247 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5248 CurrOpSizeBytes != 1)
5249 break; // Try smaller vector size.
5250
5251 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5252 // as a proxy for a double-pumped AVX memory interface such as on
5253 // Sandybridge.
5254 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5255 // will be scalarized.
5256 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5257 Cost += 2;
5258 else if (CurrOpSizeBytes < 4)
5259 Cost += 2;
5260 else
5261 Cost += 1;
5262
5263 // If we're loading a uniform value, then we don't need to split the load;
5264 // a single (widest) vector load can be reused by all the splits.
5265 if (IsLoad && OpInfo.isUniform())
5266 return Cost;
5267
5268 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5269
5270 // If we have fully processed the previous reg, we need to replenish it.
5271 if (SubVecEltsLeft == 0) {
5272 SubVecEltsLeft += CurrVecTy->getNumElements();
5273 // And that's free only for the 0'th subvector of a legalized vector.
5274 if (!Is0thSubVec)
5275 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5276 : TTI::ShuffleKind::SK_ExtractSubvector,
5277 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5278 }
5279
5280 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5281 // for smaller widths (32/16/8) we have to insert/extract them separately.
5282 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5283 // but let's pretend that it is also true for 16/8 bit wide ops...)
5284 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5285 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5286 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5287 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5288 APInt DemandedElts =
5289 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5290 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5291 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5292 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5293 !IsLoad, CostKind);
5294 }
5295
5296 SubVecEltsLeft -= CurrNumEltPerOp;
5297 NumEltRemaining -= CurrNumEltPerOp;
5298 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5299 }
5300 }
5301
5302 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5303
5304 return Cost;
5305}
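 // Rough worked example of the loop above (illustrative, assuming an
 // under-aligned v3f32 load on SSE2): the type legalizes to v4f32, one
 // 8-byte op covers the first two elements, and the last element takes a
 // 4-byte op plus an element insert, giving a cost of roughly 3 rather
 // than a flat per-element estimate.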
5306
5307InstructionCost
5308X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5309 unsigned AddressSpace,
5310 TTI::TargetCostKind CostKind) {
5311 bool IsLoad = (Instruction::Load == Opcode);
5312 bool IsStore = (Instruction::Store == Opcode);
5313
5314 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5315 if (!SrcVTy)
5316 // For a scalar, take the regular cost without the mask.
5317 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5318
5319 unsigned NumElem = SrcVTy->getNumElements();
5320 auto *MaskTy =
5321 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5322 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5323 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5324 // Scalarization
5325 APInt DemandedElts = APInt::getAllOnes(NumElem);
5326 InstructionCost MaskSplitCost = getScalarizationOverhead(
5327 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5328 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5329 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5330 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5331 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5332 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5333 InstructionCost ValueSplitCost = getScalarizationOverhead(
5334 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5335 InstructionCost MemopCost =
5336 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5337 Alignment, AddressSpace, CostKind);
5338 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5339 }
5340
5341 // Legalize the type.
5342 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5343 auto VT = TLI->getValueType(DL, SrcVTy);
5344 InstructionCost Cost = 0;
5345 MVT Ty = LT.second;
5346 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5347 // APX masked load/store for scalar is cheap.
5348 return Cost + LT.first;
5349
5350 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5351 LT.second.getVectorNumElements() == NumElem)
5352 // Promotion requires extend/truncate for data and a shuffle for mask.
5353 Cost +=
5354 getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5355 nullptr) +
5356 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5357
5358 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5359 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5360 Ty.getVectorNumElements());
5361 // Expanding requires filling the mask with zeroes.
5362 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5363 MaskTy);
5364 }
5365
5366 // Pre-AVX512: each maskmov load costs 2; each maskmov store costs ~8.
5367 if (!ST->hasAVX512())
5368 return Cost + LT.first * (IsLoad ? 2 : 8);
5369
5370 // AVX-512 masked load/store is cheaper
5371 return Cost + LT.first;
5372}
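 // E.g. a legal masked load of v8f32 on AVX2 maps to a single VMASKMOVPS
 // load, so the pre-AVX512 path charges LT.first * 2, while the store form
 // is charged LT.first * 8 to reflect how much slower maskmov stores are.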
5373
5376 const Value *Base,
5377 const TTI::PointersChainInfo &Info,
5378 Type *AccessTy, TTI::TargetCostKind CostKind) {
5379 if (Info.isSameBase() && Info.isKnownStride()) {
5380 // If all the pointers have known stride all the differences are translated
5381 // into constants. X86 memory addressing allows encoding it into
5382 // displacement. So we just need to take the base GEP cost.
5383 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5384 SmallVector<const Value *> Indices(BaseGEP->indices());
5385 return getGEPCost(BaseGEP->getSourceElementType(),
5386 BaseGEP->getPointerOperand(), Indices, nullptr,
5387 CostKind);
5388 }
5389 return TTI::TCC_Free;
5390 }
5391 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5392}
5393
5395 ScalarEvolution *SE,
5396 const SCEV *Ptr) {
5397 // Address computations in vectorized code with non-consecutive addresses will
5398 // likely result in more instructions compared to scalar code where the
5399 // computation can more often be merged into the index mode. The resulting
5400 // extra micro-ops can significantly decrease throughput.
5401 const unsigned NumVectorInstToHideOverhead = 10;
5402
5403 // Cost modeling of Strided Access Computation is hidden by the indexing
5404 // modes of X86 regardless of the stride value. We don't believe that there
5405 // is a difference between constant strided access in general and a constant
5406 // stride value which is less than or equal to 64.
5407 // Even in the case of (loop invariant) stride whose value is not known at
5408 // compile time, the address computation will not incur more than one extra
5409 // ADD instruction.
5410 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5411 // TODO: AVX2 is the current cut-off because we don't have correct
5412 // interleaving costs for prior ISA's.
5413 if (!BaseT::isStridedAccess(Ptr))
5414 return NumVectorInstToHideOverhead;
5415 if (!BaseT::getConstantStrideStep(SE, Ptr))
5416 return 1;
5417 }
5418
5419 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5420}
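 // E.g. a vector address computation that is not strided at all is charged
 // the full NumVectorInstToHideOverhead (10), while a stride that is loop
 // invariant but unknown at compile time is charged a single extra ADD (1).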
5421
5422InstructionCost
5423X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5424 std::optional<FastMathFlags> FMF,
5425 TTI::TargetCostKind CostKind) {
5426 if (TTI::requiresOrderedReduction(FMF))
5427 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5428
5429 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5430 // throughput and use it as the cost.
5431
5432 static const CostTblEntry SLMCostTbl[] = {
5433 { ISD::FADD, MVT::v2f64, 3 },
5434 { ISD::ADD, MVT::v2i64, 5 },
5435 };
5436
5437 static const CostTblEntry SSE2CostTbl[] = {
5438 { ISD::FADD, MVT::v2f64, 2 },
5439 { ISD::FADD, MVT::v2f32, 2 },
5440 { ISD::FADD, MVT::v4f32, 4 },
5441 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5442 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5443 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5444 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5445 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5446 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5447 { ISD::ADD, MVT::v2i8, 2 },
5448 { ISD::ADD, MVT::v4i8, 2 },
5449 { ISD::ADD, MVT::v8i8, 2 },
5450 { ISD::ADD, MVT::v16i8, 3 },
5451 };
5452
5453 static const CostTblEntry AVX1CostTbl[] = {
5454 { ISD::FADD, MVT::v4f64, 3 },
5455 { ISD::FADD, MVT::v4f32, 3 },
5456 { ISD::FADD, MVT::v8f32, 4 },
5457 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5458 { ISD::ADD, MVT::v4i64, 3 },
5459 { ISD::ADD, MVT::v8i32, 5 },
5460 { ISD::ADD, MVT::v16i16, 5 },
5461 { ISD::ADD, MVT::v32i8, 4 },
5462 };
5463
5464 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5465 assert(ISD && "Invalid opcode");
5466
5467 // Before legalizing the type, give a chance to look up illegal narrow types
5468 // in the table.
5469 // FIXME: Is there a better way to do this?
5470 EVT VT = TLI->getValueType(DL, ValTy);
5471 if (VT.isSimple()) {
5472 MVT MTy = VT.getSimpleVT();
5473 if (ST->useSLMArithCosts())
5474 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5475 return Entry->Cost;
5476
5477 if (ST->hasAVX())
5478 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5479 return Entry->Cost;
5480
5481 if (ST->hasSSE2())
5482 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5483 return Entry->Cost;
5484 }
5485
5486 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5487
5488 MVT MTy = LT.second;
5489
5490 auto *ValVTy = cast<FixedVectorType>(ValTy);
5491
5492 // Special case: vXi8 mul reductions are performed as vXi16.
5493 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5494 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5495 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5496 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5497 TargetTransformInfo::CastContextHint::None,
5498 CostKind) +
5499 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5500 }
5501
5502 InstructionCost ArithmeticCost = 0;
5503 if (LT.first != 1 && MTy.isVector() &&
5504 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5505 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5506 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5507 MTy.getVectorNumElements());
5508 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5509 ArithmeticCost *= LT.first - 1;
5510 }
5511
5512 if (ST->useSLMArithCosts())
5513 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5514 return ArithmeticCost + Entry->Cost;
5515
5516 if (ST->hasAVX())
5517 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5518 return ArithmeticCost + Entry->Cost;
5519
5520 if (ST->hasSSE2())
5521 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5522 return ArithmeticCost + Entry->Cost;
5523
5524 // FIXME: These assume a naive kshift+binop lowering, which is probably
5525 // conservative in most cases.
5526 static const CostTblEntry AVX512BoolReduction[] = {
5527 { ISD::AND, MVT::v2i1, 3 },
5528 { ISD::AND, MVT::v4i1, 5 },
5529 { ISD::AND, MVT::v8i1, 7 },
5530 { ISD::AND, MVT::v16i1, 9 },
5531 { ISD::AND, MVT::v32i1, 11 },
5532 { ISD::AND, MVT::v64i1, 13 },
5533 { ISD::OR, MVT::v2i1, 3 },
5534 { ISD::OR, MVT::v4i1, 5 },
5535 { ISD::OR, MVT::v8i1, 7 },
5536 { ISD::OR, MVT::v16i1, 9 },
5537 { ISD::OR, MVT::v32i1, 11 },
5538 { ISD::OR, MVT::v64i1, 13 },
5539 };
5540
5541 static const CostTblEntry AVX2BoolReduction[] = {
5542 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5543 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5544 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5545 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5546 };
5547
5548 static const CostTblEntry AVX1BoolReduction[] = {
5549 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5550 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5551 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5552 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5553 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5554 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5555 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5556 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5557 };
5558
5559 static const CostTblEntry SSE2BoolReduction[] = {
5560 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5561 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5562 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5563 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5564 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5565 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5566 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5567 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5568 };
5569
5570 // Handle bool allof/anyof patterns.
5571 if (ValVTy->getElementType()->isIntegerTy(1)) {
5572 InstructionCost ArithmeticCost = 0;
5573 if (LT.first != 1 && MTy.isVector() &&
5574 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5575 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5576 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5577 MTy.getVectorNumElements());
5578 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5579 ArithmeticCost *= LT.first - 1;
5580 }
5581
5582 if (ST->hasAVX512())
5583 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5584 return ArithmeticCost + Entry->Cost;
5585 if (ST->hasAVX2())
5586 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5587 return ArithmeticCost + Entry->Cost;
5588 if (ST->hasAVX())
5589 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5590 return ArithmeticCost + Entry->Cost;
5591 if (ST->hasSSE2())
5592 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5593 return ArithmeticCost + Entry->Cost;
5594
5595 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5596 }
5597
5598 unsigned NumVecElts = ValVTy->getNumElements();
5599 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5600
5601 // Special case power of 2 reductions where the scalar type isn't changed
5602 // by type legalization.
5603 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5604 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5605
5606 InstructionCost ReductionCost = 0;
5607
5608 auto *Ty = ValVTy;
5609 if (LT.first != 1 && MTy.isVector() &&
5610 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5611 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5612 Ty = FixedVectorType::get(ValVTy->getElementType(),
5613 MTy.getVectorNumElements());
5614 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5615 ReductionCost *= LT.first - 1;
5616 NumVecElts = MTy.getVectorNumElements();
5617 }
5618
5619 // Now handle reduction with the legal type, taking into account size changes
5620 // at each level.
5621 while (NumVecElts > 1) {
5622 // Determine the size of the remaining vector we need to reduce.
5623 unsigned Size = NumVecElts * ScalarSize;
5624 NumVecElts /= 2;
5625 // If we're reducing from 256/512 bits, use an extract_subvector.
5626 if (Size > 128) {
5627 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5628 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5629 CostKind, NumVecElts, SubTy);
5630 Ty = SubTy;
5631 } else if (Size == 128) {
5632 // Reducing from 128 bits is a permute of v2f64/v2i64.
5633 FixedVectorType *ShufTy;
5634 if (ValVTy->isFloatingPointTy())
5635 ShufTy =
5636 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5637 else
5638 ShufTy =
5639 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5640 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5641 CostKind, 0, nullptr);
5642 } else if (Size == 64) {
5643 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5644 FixedVectorType *ShufTy;
5645 if (ValVTy->isFloatingPointTy())
5646 ShufTy =
5647 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5648 else
5649 ShufTy =
5650 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5651 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5652 CostKind, 0, nullptr);
5653 } else {
5654 // Reducing from smaller size is a shift by immediate.
5655 auto *ShiftTy = FixedVectorType::get(
5656 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5657 ReductionCost += getArithmeticInstrCost(
5658 Instruction::LShr, ShiftTy, CostKind,
5659 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5660 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5661 }
5662
5663 // Add the arithmetic op for this level.
5664 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5665 }
5666
5667 // Add the final extract element to the cost.
5668 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5669 CostKind, 0, nullptr, nullptr);
5670}
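 // Sketch of the halving loop above for an add reduction of a legal v4i32:
 // the first round costs a 128-bit permute (e.g. PSHUFD) plus PADDD, the
 // second a 64-bit shuffle plus PADDD, and the final extract of element 0
 // is typically a MOVD.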
5671
5672InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5673 TTI::TargetCostKind CostKind,
5674 FastMathFlags FMF) {
5675 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5676 return getIntrinsicInstrCost(ICA, CostKind);
5677}
5678
5679InstructionCost
5680X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5681 FastMathFlags FMF,
5682 TTI::TargetCostKind CostKind) {
5683 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5684
5685 MVT MTy = LT.second;
5686
5687 int ISD;
5688 if (ValTy->isIntOrIntVectorTy()) {
5689 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5690 : ISD::SMIN;
5691 } else {
5692 assert(ValTy->isFPOrFPVectorTy() &&
5693 "Expected floating point or integer vector type.");
5694 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5695 ? ISD::FMINNUM
5696 : ISD::FMINIMUM;
5697 }
5698
5699 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5700 // throughput and use it as the cost.
5701
5702 static const CostTblEntry SSE2CostTbl[] = {
5703 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5704 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5705 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5706 };
5707
5708 static const CostTblEntry SSE41CostTbl[] = {
5709 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5710 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5711 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5712 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5713 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5714 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5715 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5716 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5717 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5718 {ISD::SMIN, MVT::v16i8, 6},
5719 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5720 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5721 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5722 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5723 };
5724
5725 static const CostTblEntry AVX1CostTbl[] = {
5726 {ISD::SMIN, MVT::v16i16, 6},
5727 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5728 {ISD::SMIN, MVT::v32i8, 8},
5729 {ISD::UMIN, MVT::v32i8, 8},
5730 };
5731
5732 static const CostTblEntry AVX512BWCostTbl[] = {
5733 {ISD::SMIN, MVT::v32i16, 8},
5734 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5735 {ISD::SMIN, MVT::v64i8, 10},
5736 {ISD::UMIN, MVT::v64i8, 10},
5737 };
5738
5739 // Before legalizing the type, give a chance to look up illegal narrow types
5740 // in the table.
5741 // FIXME: Is there a better way to do this?
5742 EVT VT = TLI->getValueType(DL, ValTy);
5743 if (VT.isSimple()) {
5744 MVT MTy = VT.getSimpleVT();
5745 if (ST->hasBWI())
5746 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5747 return Entry->Cost;
5748
5749 if (ST->hasAVX())
5750 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5751 return Entry->Cost;
5752
5753 if (ST->hasSSE41())
5754 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5755 return Entry->Cost;
5756
5757 if (ST->hasSSE2())
5758 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5759 return Entry->Cost;
5760 }
5761
5762 auto *ValVTy = cast<FixedVectorType>(ValTy);
5763 unsigned NumVecElts = ValVTy->getNumElements();
5764
5765 auto *Ty = ValVTy;
5766 InstructionCost MinMaxCost = 0;
5767 if (LT.first != 1 && MTy.isVector() &&
5768 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5769 // Type needs to be split. We need LT.first - 1 operations.
5770 Ty = FixedVectorType::get(ValVTy->getElementType(),
5771 MTy.getVectorNumElements());
5772 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5773 MinMaxCost *= LT.first - 1;
5774 NumVecElts = MTy.getVectorNumElements();
5775 }
5776
5777 if (ST->hasBWI())
5778 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5779 return MinMaxCost + Entry->Cost;
5780
5781 if (ST->hasAVX())
5782 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5783 return MinMaxCost + Entry->Cost;
5784
5785 if (ST->hasSSE41())
5786 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5787 return MinMaxCost + Entry->Cost;
5788
5789 if (ST->hasSSE2())
5790 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5791 return MinMaxCost + Entry->Cost;
5792
5793 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5794
5795 // Special case power of 2 reductions where the scalar type isn't changed
5796 // by type legalization.
5797 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5798 ScalarSize != MTy.getScalarSizeInBits())
5799 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5800
5801 // Now handle reduction with the legal type, taking into account size changes
5802 // at each level.
5803 while (NumVecElts > 1) {
5804 // Determine the size of the remaining vector we need to reduce.
5805 unsigned Size = NumVecElts * ScalarSize;
5806 NumVecElts /= 2;
5807 // If we're reducing from 256/512 bits, use an extract_subvector.
5808 if (Size > 128) {
5809 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5810 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5811 NumVecElts, SubTy);
5812 Ty = SubTy;
5813 } else if (Size == 128) {
5814 // Reducing from 128 bits is a permute of v2f64/v2i64.
5815 VectorType *ShufTy;
5816 if (ValTy->isFloatingPointTy())
5817 ShufTy =
5818 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5819 else
5820 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5821 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5822 CostKind, 0, nullptr);
5823 } else if (Size == 64) {
5824 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5825 FixedVectorType *ShufTy;
5826 if (ValTy->isFloatingPointTy())
5827 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5828 else
5829 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5830 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5831 CostKind, 0, nullptr);
5832 } else {
5833 // Reducing from smaller size is a shift by immediate.
5834 auto *ShiftTy = FixedVectorType::get(
5835 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5836 MinMaxCost += getArithmeticInstrCost(
5837 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5838 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5839 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5840 }
5841
5842 // Add the arithmetic op for this level.
5843 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5844 }
5845
5846 // Add the final extract element to the cost.
5847 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5848 CostKind, 0, nullptr, nullptr);
5849}
5850
5851/// Calculate the cost of materializing a 64-bit value. This helper
5852/// method might only calculate a fraction of a larger immediate. Therefore it
5853/// is valid to return a cost of ZERO.
5854InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5855 if (Val == 0)
5856 return TTI::TCC_Free;
5857
5858 if (isInt<32>(Val))
5859 return TTI::TCC_Basic;
5860
5861 return 2 * TTI::TCC_Basic;
5862}
5863
5864InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5865 TTI::TargetCostKind CostKind) {
5866 assert(Ty->isIntegerTy());
5867
5868 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5869 if (BitSize == 0)
5870 return ~0U;
5871
5872 // Never hoist constants larger than 128 bits, because this might lead to
5873 // incorrect code generation or assertions in codegen.
5874 // FIXME: Create a cost model for types larger than i128 once the codegen
5875 // issues have been fixed.
5876 if (BitSize > 128)
5877 return TTI::TCC_Free;
5878
5879 if (Imm == 0)
5880 return TTI::TCC_Free;
5881
5882 // Sign-extend all constants to a multiple of 64-bit.
5883 APInt ImmVal = Imm;
5884 if (BitSize % 64 != 0)
5885 ImmVal = Imm.sext(alignTo(BitSize, 64));
5886
5887 // Split the constant into 64-bit chunks and calculate the cost for each
5888 // chunk.
5889 InstructionCost Cost = 0;
5890 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5891 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5892 int64_t Val = Tmp.getSExtValue();
5893 Cost += getIntImmCost(Val);
5894 }
5895 // We need at least one instruction to materialize the constant.
5896 return std::max<InstructionCost>(1, Cost);
5897}
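 // Worked example: an i128 constant equal to 1 << 64 splits into two 64-bit
 // chunks, a zero chunk (TCC_Free) and a small immediate (TCC_Basic), so
 // the reported materialization cost is 1.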
5898
5899InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5900 const APInt &Imm, Type *Ty,
5901 TTI::TargetCostKind CostKind,
5902 Instruction *Inst) {
5903 assert(Ty->isIntegerTy());
5904
5905 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5906 unsigned ImmBitWidth = Imm.getBitWidth();
5907
5908 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5909 // here, so that constant hoisting will ignore this constant.
5910 if (BitSize == 0)
5911 return TTI::TCC_Free;
5912
5913 unsigned ImmIdx = ~0U;
5914 switch (Opcode) {
5915 default:
5916 return TTI::TCC_Free;
5917 case Instruction::GetElementPtr:
5918 // Always hoist the base address of a GetElementPtr. This prevents the
5919 // creation of new constants for every base constant that gets constant
5920 // folded with the offset.
5921 if (Idx == 0)
5922 return 2 * TTI::TCC_Basic;
5923 return TTI::TCC_Free;
5924 case Instruction::Store:
5925 ImmIdx = 0;
5926 break;
5927 case Instruction::ICmp:
5928 // This is an imperfect hack to prevent constant hoisting of
5929 // compares that might be trying to check if a 64-bit value fits in
5930 // 32-bits. The backend can optimize these cases using a right shift by 32.
5931 // Ideally we would check the compare predicate here. There are also other
5932 // similar immediates the backend can use shifts for.
5933 if (Idx == 1 && ImmBitWidth == 64) {
5934 uint64_t ImmVal = Imm.getZExtValue();
5935 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5936 return TTI::TCC_Free;
5937 }
5938 ImmIdx = 1;
5939 break;
5940 case Instruction::And:
5941 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5942 // by using a 32-bit operation with implicit zero extension. Detect such
5943 // immediates here as the normal path expects bit 31 to be sign extended.
5944 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5945 return TTI::TCC_Free;
5946 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5947 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5948 Imm.isMask())
5949 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5950 ImmIdx = 1;
5951 break;
5952 case Instruction::Add:
5953 case Instruction::Sub:
5954 // For add/sub, we can use the opposite instruction for INT32_MIN.
5955 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
5956 return TTI::TCC_Free;
5957 ImmIdx = 1;
5958 break;
5959 case Instruction::UDiv:
5960 case Instruction::SDiv:
5961 case Instruction::URem:
5962 case Instruction::SRem:
5963 // Division by constant is typically expanded later into a different
5964 // instruction sequence. This completely changes the constants.
5965 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5966 return TTI::TCC_Free;
5967 case Instruction::Mul:
5968 case Instruction::Or:
5969 case Instruction::Xor:
5970 ImmIdx = 1;
5971 break;
5972 // Always return TCC_Free for the shift value of a shift instruction.
5973 case Instruction::Shl:
5974 case Instruction::LShr:
5975 case Instruction::AShr:
5976 if (Idx == 1)
5977 return TTI::TCC_Free;
5978 break;
5979 case Instruction::Trunc:
5980 case Instruction::ZExt:
5981 case Instruction::SExt:
5982 case Instruction::IntToPtr:
5983 case Instruction::PtrToInt:
5984 case Instruction::BitCast:
5985 case Instruction::PHI:
5986 case Instruction::Call:
5987 case Instruction::Select:
5988 case Instruction::Ret:
5989 case Instruction::Load:
5990 break;
5991 }
5992
5993 if (Idx == ImmIdx) {
5994 uint64_t NumConstants = divideCeil(BitSize, 64);
5995 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5996 return (Cost <= NumConstants * TTI::TCC_Basic)
5997 ? static_cast<int>(TTI::TCC_Free)
5998 : Cost;
5999 }
6000
6001 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6002}
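 // E.g. 'and i64 %x, 4294967295' is reported as TCC_Free above: the backend
 // emits a 32-bit move with implicit zero extension, so hoisting the mask
 // constant out of the instruction would only pessimize it.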
6003
6004InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6005 const APInt &Imm, Type *Ty,
6006 TTI::TargetCostKind CostKind) {
6007 assert(Ty->isIntegerTy());
6008
6009 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6010 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6011 // here, so that constant hoisting will ignore this constant.
6012 if (BitSize == 0)
6013 return TTI::TCC_Free;
6014
6015 switch (IID) {
6016 default:
6017 return TTI::TCC_Free;
6018 case Intrinsic::sadd_with_overflow:
6019 case Intrinsic::uadd_with_overflow:
6020 case Intrinsic::ssub_with_overflow:
6021 case Intrinsic::usub_with_overflow:
6022 case Intrinsic::smul_with_overflow:
6023 case Intrinsic::umul_with_overflow:
6024 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6025 return TTI::TCC_Free;
6026 break;
6027 case Intrinsic::experimental_stackmap:
6028 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6029 return TTI::TCC_Free;
6030 break;
6031 case Intrinsic::experimental_patchpoint_void:
6032 case Intrinsic::experimental_patchpoint:
6033 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6034 return TTI::TCC_Free;
6035 break;
6036 }
6037 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6038}
6039
6040InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6041 TTI::TargetCostKind CostKind,
6042 const Instruction *I) {
6043 if (CostKind != TTI::TCK_RecipThroughput)
6044 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6045 // Branches are assumed to be predicted.
6046 return TTI::TCC_Free;
6047}
6048
6049int X86TTIImpl::getGatherOverhead() const {
6050 // Some CPUs have more overhead for gather. The specified overhead is relative
6051 // to the Load operation. "2" is the number provided by Intel architects. This
6052 // parameter is used for cost estimation of Gather Op and comparison with
6053 // other alternatives.
6054 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6055 // enable gather with a -march.
6056 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6057 return 2;
6058
6059 return 1024;
6060}
6061
6062int X86TTIImpl::getScatterOverhead() const {
6063 if (ST->hasAVX512())
6064 return 2;
6065
6066 return 1024;
6067}
6068
6069// Return an average cost of Gather / Scatter instruction, maybe improved later.
6070InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6071 TTI::TargetCostKind CostKind,
6072 Type *SrcVTy, const Value *Ptr,
6073 Align Alignment,
6074 unsigned AddressSpace) {
6075
6076 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6077 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6078
6079 // Try to reduce index size from 64 bit (default for GEP)
6080 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6081 // operation will use 16 x 64 indices which do not fit in a zmm and has
6082 // to be split. Also check that the base pointer is the same for all lanes,
6083 // and that there's at most one variable index.
6084 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6085 unsigned IndexSize = DL.getPointerSizeInBits();
6086 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6087 if (IndexSize < 64 || !GEP)
6088 return IndexSize;
6089
6090 unsigned NumOfVarIndices = 0;
6091 const Value *Ptrs = GEP->getPointerOperand();
6092 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6093 return IndexSize;
6094 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6095 if (isa<Constant>(GEP->getOperand(I)))
6096 continue;
6097 Type *IndxTy = GEP->getOperand(I)->getType();
6098 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6099 IndxTy = IndexVTy->getElementType();
6100 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6101 !isa<SExtInst>(GEP->getOperand(I))) ||
6102 ++NumOfVarIndices > 1)
6103 return IndexSize; // 64
6104 }
6105 return (unsigned)32;
6106 };
6107
6108 // Trying to reduce IndexSize to 32 bits for vector 16.
6109 // By default the IndexSize is equal to pointer size.
6110 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6111 ? getIndexSizeInBits(Ptr, DL)
6112 : DL.getPointerSizeInBits();
6113
6114 auto *IndexVTy = FixedVectorType::get(
6115 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6116 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6117 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6118 InstructionCost::CostType SplitFactor =
6119 *std::max(IdxsLT.first, SrcLT.first).getValue();
6120 if (SplitFactor > 1) {
6121 // Handle splitting of vector of pointers
6122 auto *SplitSrcTy =
6123 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6124 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6125 Alignment, AddressSpace);
6126 }
6127
6128 // If we didn't split, this will be a single gather/scatter instruction.
6129 if (CostKind == TTI::TCK_CodeSize)
6130 return 1;
6131
6132 // The gather / scatter cost is given by Intel architects. It is a rough
6133 // number since we are looking at one instruction at a time.
6134 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6135 : getScatterOverhead();
6136 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6137 MaybeAlign(Alignment), AddressSpace,
6138 CostKind);
6139}
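 // E.g. a VF == 16 gather of float on AVX-512: if the GEP indices narrow to
 // i32 they fit in a single zmm, no splitting occurs, and the result is the
 // gather overhead (2) plus 16x the scalar load cost.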
6140
6141/// Calculate the cost of Gather / Scatter operation
6142InstructionCost X86TTIImpl::getGatherScatterOpCost(
6143 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6144 Align Alignment, TTI::TargetCostKind CostKind,
6145 const Instruction *I = nullptr) {
6146 if ((Opcode == Instruction::Load &&
6147 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6148 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6149 Align(Alignment)))) ||
6150 (Opcode == Instruction::Store &&
6151 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6152 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6153 Align(Alignment)))))
6154 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6155 Alignment, CostKind, I);
6156
6157 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6158 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6159 if (!PtrTy && Ptr->getType()->isVectorTy())
6160 PtrTy = dyn_cast<PointerType>(
6161 cast<VectorType>(Ptr->getType())->getElementType());
6162 assert(PtrTy && "Unexpected type for Ptr argument");
6163 unsigned AddressSpace = PtrTy->getAddressSpace();
6164 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6165 AddressSpace);
6166}
6167
6168bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6169 const TargetTransformInfo::LSRCost &C2) {
6170 // What is X86-specific here is that instruction count gets 1st priority.
6171 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6172 C1.NumIVMuls, C1.NumBaseAdds,
6173 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6174 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6175 C2.NumIVMuls, C2.NumBaseAdds,
6176 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6177}
6178
6179bool X86TTIImpl::canMacroFuseCmp() {
6180 return ST->hasMacroFusion() || ST->hasBranchFusion();
6181}
6182
6183bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6184 Type *ScalarTy = DataTy->getScalarType();
6185
6186 // The backend can't handle a single element vector w/o CFCMOV.
6187 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6188 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6189
6190 if (!ST->hasAVX())
6191 return false;
6192
6193 if (ScalarTy->isPointerTy())
6194 return true;
6195
6196 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6197 return true;
6198
6199 if (ScalarTy->isHalfTy() && ST->hasBWI())
6200 return true;
6201
6202 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6203 return true;
6204
6205 if (!ScalarTy->isIntegerTy())
6206 return false;
6207
6208 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6209 return IntWidth == 32 || IntWidth == 64 ||
6210 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6211}
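// For illustration (not in the original file): on plain AVX2 this accepts a
// masked load of <8 x i32> (vpmaskmovd), but rejects <16 x i16>, which needs
// AVX512BW; the rejected case is scalarized by the generic cost model.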
6212
6213bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6214 return isLegalMaskedLoad(DataType, Alignment);
6215}
6216
6217bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6218 unsigned DataSize = DL.getTypeStoreSize(DataType);
6219 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6220 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6221 // (the equivalent stores only require AVX).
6222 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6223 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6224
6225 return false;
6226}
6227
6228bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6229 unsigned DataSize = DL.getTypeStoreSize(DataType);
6230
6231 // SSE4A supports nontemporal stores of float and double at arbitrary
6232 // alignment.
6233 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6234 return true;
6235
6236 // Besides the SSE4A subtarget exception above, only aligned stores are
6237 // available nontemporally on any other subtarget. And only stores with a size
6238 // of 4..32 bytes (powers of 2, only) are permitted.
6239 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6240 !isPowerOf2_32(DataSize))
6241 return false;
6242
6243 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6244 // loads require AVX2).
6245 if (DataSize == 32)
6246 return ST->hasAVX();
6247 if (DataSize == 16)
6248 return ST->hasSSE1();
6249 return true;
6250}
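// Worked examples (illustrative): a 32-byte-aligned <8 x float> nontemporal
// store is legal with AVX (vmovntps ymm), while the matching 32-byte
// nontemporal load additionally requires AVX2 (vmovntdqa ymm). On SSE4A even
// an unaligned double qualifies via movntsd.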
6251
6252 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6253 ElementCount NumElements) const {
6254 // movddup
6255 return ST->hasSSE3() && !NumElements.isScalable() &&
6256 NumElements.getFixedValue() == 2 &&
6257 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6258}
6259
6260 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6261 if (!isa<VectorType>(DataTy))
6262 return false;
6263
6264 if (!ST->hasAVX512())
6265 return false;
6266
6267 // The backend can't handle a single element vector.
6268 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6269 return false;
6270
6271 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6272
6273 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6274 return true;
6275
6276 if (!ScalarTy->isIntegerTy())
6277 return false;
6278
6279 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6280 return IntWidth == 32 || IntWidth == 64 ||
6281 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6282}
6283
6284 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6285 return isLegalMaskedExpandLoad(DataTy, Alignment);
6286}
6287
6288bool X86TTIImpl::supportsGather() const {
6289 // Some CPUs have better gather performance than others.
6290 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6291 // enable gather with a -march.
6292 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6293}
6294
6295 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6296 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6297 // A vector-4 gather/scatter instruction does not exist on KNL. We could
6298 // extend it to 8 elements, but zeroing the upper bits of the mask vector
6299 // would add more instructions. Right now we give the scalar cost of vector-4
6300 // for KNL. TODO: Check whether the gather/scatter instruction is better in
6301 // the VariableMask case.
6302 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6303 return NumElts == 1 ||
6304 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6305}
6306
6307 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6308 Type *ScalarTy = DataTy->getScalarType();
6309 if (ScalarTy->isPointerTy())
6310 return true;
6311
6312 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6313 return true;
6314
6315 if (!ScalarTy->isIntegerTy())
6316 return false;
6317
6318 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6319 return IntWidth == 32 || IntWidth == 64;
6320}
6321
6322 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6323 if (!supportsGather() || !ST->preferGather())
6324 return false;
6325 return isLegalMaskedGatherScatter(DataTy, Alignment);
6326}
6327
6328bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6329 unsigned Opcode1,
6330 const SmallBitVector &OpcodeMask) const {
6331 // ADDSUBPS 4xf32 SSE3
6332 // VADDSUBPS 4xf32 AVX
6333 // VADDSUBPS 8xf32 AVX2
6334 // ADDSUBPD 2xf64 SSE3
6335 // VADDSUBPD 2xf64 AVX
6336 // VADDSUBPD 4xf64 AVX2
6337
6338 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6339 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6340 if (!isPowerOf2_32(NumElements))
6341 return false;
6342 // Check the opcode pattern. We apply the mask on the opcode arguments and
6343 // then check if it is what we expect.
6344 for (int Lane : seq<int>(0, NumElements)) {
6345 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6346 // We expect FSub for even lanes and FAdd for odd lanes.
6347 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6348 return false;
6349 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6350 return false;
6351 }
6352 // Now check that the pattern is supported by the target ISA.
6353 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6354 if (ElemTy->isFloatTy())
6355 return ST->hasSSE3() && NumElements % 4 == 0;
6356 if (ElemTy->isDoubleTy())
6357 return ST->hasSSE3() && NumElements % 2 == 0;
6358 return false;
6359}
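// Illustrative example (not part of the original file): for VecTy =
// <4 x float> with Opcode0 = FSub, Opcode1 = FAdd and OpcodeMask = 0b1010,
// lanes 0/2 subtract and lanes 1/3 add, which is exactly the SSE3 ADDSUBPS
// pattern accepted above.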
6360
6361bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6362 // AVX2 doesn't support scatter
6363 if (!ST->hasAVX512() || !ST->preferScatter())
6364 return false;
6365 return isLegalMaskedGatherScatter(DataType, Alignment);
6366}
6367
6368bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6369 EVT VT = TLI->getValueType(DL, DataType);
6370 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6371}
6372
6373 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
6374 // FDIV is always expensive, even if it has a very low uop count.
6375 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6376 if (I->getOpcode() == Instruction::FDiv)
6377 return true;
6378
6379 return BaseT::isExpensiveToSpeculativelyExecute(I);
6380 }
6381
6382 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6383 return false;
6384}
6385
6386 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6387 const Function *Callee) const {
6388 const TargetMachine &TM = getTLI()->getTargetMachine();
6389
6390 // Work this as a subsetting of subtarget features.
6391 const FeatureBitset &CallerBits =
6392 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6393 const FeatureBitset &CalleeBits =
6394 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6395
6396 // Check whether features are the same (apart from the ignore list).
6397 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6398 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6399 if (RealCallerBits == RealCalleeBits)
6400 return true;
6401
6402 // If the features are a subset, we need to additionally check for calls
6403 // that may become ABI-incompatible as a result of inlining.
6404 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6405 return false;
6406
6407 for (const Instruction &I : instructions(Callee)) {
6408 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6409 // Having more target features is fine for inline ASM.
6410 if (CB->isInlineAsm())
6411 continue;
6412
6413 SmallVector<Type *, 8> Types;
6414 for (Value *Arg : CB->args())
6415 Types.push_back(Arg->getType());
6416 if (!CB->getType()->isVoidTy())
6417 Types.push_back(CB->getType());
6418
6419 // Simple types are always ABI compatible.
6420 auto IsSimpleTy = [](Type *Ty) {
6421 return !Ty->isVectorTy() && !Ty->isAggregateType();
6422 };
6423 if (all_of(Types, IsSimpleTy))
6424 continue;
6425
6426 if (Function *NestedCallee = CB->getCalledFunction()) {
6427 // Assume that intrinsics are always ABI compatible.
6428 if (NestedCallee->isIntrinsic())
6429 continue;
6430
6431 // Do a precise compatibility check.
6432 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6433 return false;
6434 } else {
6435 // We don't know the target features of the callee,
6436 // assume it is incompatible.
6437 return false;
6438 }
6439 }
6440 }
6441 return true;
6442}
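// Example scenario (illustrative): an AVX2 caller can never inline an
// AVX-512 callee, since the callee's features are not a subset. The reverse
// direction is allowed unless the callee forwards vector or aggregate values
// to an unknown function, where inlining could change the effective ABI.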
6443
6444 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6445 const Function *Callee,
6446 const ArrayRef<Type *> &Types) const {
6447 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6448 return false;
6449
6450 // If we get here, we know the target features match. If one function
6451 // considers 512-bit vectors legal and the other does not, consider them
6452 // incompatible.
6453 const TargetMachine &TM = getTLI()->getTargetMachine();
6454
6455 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6456 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6457 return true;
6458
6459 // Consider the arguments compatible if they aren't vectors or aggregates.
6460 // FIXME: Look at the size of vectors.
6461 // FIXME: Look at the element types of aggregates to see if there are vectors.
6462 return llvm::none_of(Types,
6463 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6464}
6465
6466 X86TTIImpl::TTI::MemCmpExpansionOptions
6467 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6468 TTI::MemCmpExpansionOptions Options;
6469 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6470 Options.NumLoadsPerBlock = 2;
6471 // All GPR and vector loads can be unaligned.
6472 Options.AllowOverlappingLoads = true;
6473 if (IsZeroCmp) {
6474 // Only enable vector loads for equality comparison. Right now the vector
6475 // version is not as fast for three way compare (see #33329).
6476 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6477 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6478 Options.LoadSizes.push_back(64);
6479 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6480 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6481 }
6482 if (ST->is64Bit()) {
6483 Options.LoadSizes.push_back(8);
6484 }
6485 Options.LoadSizes.push_back(4);
6486 Options.LoadSizes.push_back(2);
6487 Options.LoadSizes.push_back(1);
6488 return Options;
6489}
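// Worked example (illustrative, not in the original file): with AVX2 on a
// 64-bit target, an equality compare gets LoadSizes = {32, 16, 8, 4, 2, 1}.
// A memcmp(a, b, 24) == 0 can then expand to a 16-byte load pair at offset 0
// plus an overlapping 16-byte pair at offset 8, since AllowOverlappingLoads
// is set.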
6490
6491 bool X86TTIImpl::prefersVectorizedAddressing() const {
6492 return supportsGather();
6493}
6494
6495 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6496 return false;
6497}
6498
6499 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6500 // TODO: We expect this to be beneficial regardless of arch,
6501 // but there are currently some unexplained performance artifacts on Atom.
6502 // As a temporary solution, disable on Atom.
6503 return !(ST->isAtom());
6504}
6505
6506// Get estimation for interleaved load/store operations and strided load.
6507// \p Indices contains indices for strided load.
6508// \p Factor - the factor of interleaving.
6509// AVX-512 provides 3-src shuffles that significantly reduces the cost.
6510 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6511 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6512 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6513 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6514 // VecTy for interleave memop is <VF*Factor x Elt>.
6515 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6516 // VecTy = <12 x i32>.
6517
6518 // Calculate the number of memory operations (NumOfMemOps), required
6519 // for load/store the VecTy.
6520 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6521 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6522 unsigned LegalVTSize = LegalVT.getStoreSize();
6523 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
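// Worked example (illustrative): for VecTy = <12 x i32> (48 bytes) on an
// SSE2-only subtarget, LegalVT is v4i32 (16 bytes), so
// NumOfMemOps = (48 + 16 - 1) / 16 = 3.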
6524
6525 // Get the cost of one memory operation.
6526 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6527 LegalVT.getVectorNumElements());
6528 InstructionCost MemOpCost;
6529 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6530 if (UseMaskedMemOp)
6531 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6532 AddressSpace, CostKind);
6533 else
6534 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6535 AddressSpace, CostKind);
6536
6537 unsigned VF = VecTy->getNumElements() / Factor;
6538 MVT VT =
6539 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6540
6541 InstructionCost MaskCost;
6542 if (UseMaskedMemOp) {
6543 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6544 for (unsigned Index : Indices) {
6545 assert(Index < Factor && "Invalid index for interleaved memory op");
6546 for (unsigned Elm = 0; Elm < VF; Elm++)
6547 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6548 }
6549
6550 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6551
6552 MaskCost = getReplicationShuffleCost(
6553 I1Type, Factor, VF,
6554 UseMaskForGaps ? DemandedLoadStoreElts
6555 : APInt::getAllOnes(VecTy->getNumElements()),
6556 CostKind);
6557
6558 // The Gaps mask is invariant and created outside the loop, therefore the
6559 // cost of creating it is not accounted for here. However if we have both
6560 // a MaskForGaps and some other mask that guards the execution of the
6561 // memory access, we need to account for the cost of And-ing the two masks
6562 // inside the loop.
6563 if (UseMaskForGaps) {
6564 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6565 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6566 }
6567 }
6568
6569 if (Opcode == Instruction::Load) {
6570 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6571 // contain the cost of the optimized shuffle sequence that the
6572 // X86InterleavedAccess pass will generate.
6573 // The cost of loads and stores are computed separately from the table.
6574
6575 // X86InterleavedAccess support only the following interleaved-access group.
6576 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6577 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6578 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6579 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6580 };
6581
6582 if (const auto *Entry =
6583 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6584 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6585 // If an entry does not exist, fall back to the default implementation.
6586
6587 // Kind of shuffle depends on number of loaded values.
6588 // If we load the entire data in one register, we can use a 1-src shuffle.
6589 // Otherwise, we'll merge 2 sources in each operation.
6590 TTI::ShuffleKind ShuffleKind =
6591 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6592
6593 InstructionCost ShuffleCost =
6594 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6595
6596 unsigned NumOfLoadsInInterleaveGrp =
6597 Indices.size() ? Indices.size() : Factor;
6598 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6599 VecTy->getNumElements() / Factor);
6600 InstructionCost NumOfResults =
6601 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6602
6603 // About half of the loads may be folded into shuffles when we have only
6604 // one result. If we have more than one result, or the loads are masked,
6605 // we do not fold loads at all.
6606 unsigned NumOfUnfoldedLoads =
6607 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6608
6609 // Get a number of shuffle operations per result.
6610 unsigned NumOfShufflesPerResult =
6611 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6612
6613 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6614 // When we have more than one destination, we need additional instructions
6615 // to keep sources.
6616 InstructionCost NumOfMoves = 0;
6617 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6618 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6619
6620 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6621 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6622 NumOfMoves;
6623
6624 return Cost;
6625 }
6626
6627 // Store.
6628 assert(Opcode == Instruction::Store &&
6629 "Expected Store Instruction at this point");
6630 // X86InterleavedAccess support only the following interleaved-access group.
6631 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6632 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6633 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6634 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6635
6636 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6637 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6638 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6639 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6640 };
6641
6642 if (const auto *Entry =
6643 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6644 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6645 // If an entry does not exist, fall back to the default implementation.
6646
6647 // There are no strided stores at the moment, and a store can't be folded
6648 // into a shuffle.
6649 unsigned NumOfSources = Factor; // The number of values to be merged.
6650 InstructionCost ShuffleCost = getShuffleCost(
6651 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6652 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6653
6654 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6655 // We need additional instructions to keep sources.
6656 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6657 InstructionCost Cost =
6658 MaskCost +
6659 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6660 NumOfMoves;
6661 return Cost;
6662}
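// Worked example (illustrative): storing a Factor = 3 interleaved <48 x i8>
// on AVX-512 gives VF = 16 and VT = v16i8, so the AVX512InterleavedStoreTbl
// lookup above hits Entry->Cost = 12 and the function returns
// MaskCost + NumOfMemOps * MemOpCost + 12.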
6663
6664 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6665 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6666 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6667 bool UseMaskForCond, bool UseMaskForGaps) {
6668 auto *VecTy = cast<FixedVectorType>(BaseTy);
6669
6670 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6671 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6672 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6673 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6674 return true;
6675 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6676 return ST->hasBWI();
6677 if (EltTy->isBFloatTy())
6678 return ST->hasBF16();
6679 return false;
6680 };
6681 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6682 return getInterleavedMemoryOpCostAVX512(
6683 Opcode, VecTy, Factor, Indices, Alignment,
6684 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6685
6686 if (UseMaskForCond || UseMaskForGaps)
6687 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6688 Alignment, AddressSpace, CostKind,
6689 UseMaskForCond, UseMaskForGaps);
6690
6691 // Get estimation for interleaved load/store operations for SSE-AVX2.
6692 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6693 // computing the cost using a generic formula as a function of generic
6694 // shuffles. We therefore use a lookup table instead, filled according to
6695 // the instruction sequences that codegen currently generates.
6696
6697 // VecTy for interleave memop is <VF*Factor x Elt>.
6698 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6699 // VecTy = <12 x i32>.
6700 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6701
6702 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6703 // the VF=2, while v2i128 is an unsupported MVT vector type
6704 // (see MachineValueType.h::getVectorVT()).
6705 if (!LegalVT.isVector())
6706 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6707 Alignment, AddressSpace, CostKind);
6708
6709 unsigned VF = VecTy->getNumElements() / Factor;
6710 Type *ScalarTy = VecTy->getElementType();
6711 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6712 if (!ScalarTy->isIntegerTy())
6713 ScalarTy =
6714 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6715
6716 // Get the cost of all the memory operations.
6717 // FIXME: discount dead loads.
6718 InstructionCost MemOpCosts = getMemoryOpCost(
6719 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6720
6721 auto *VT = FixedVectorType::get(ScalarTy, VF);
6722 EVT ETy = TLI->getValueType(DL, VT);
6723 if (!ETy.isSimple())
6724 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6725 Alignment, AddressSpace, CostKind);
6726
6727 // TODO: Complete for other data-types and strides.
6728 // Each combination of Stride, element bit width and VF results in a different
6729 // sequence; The cost tables are therefore accessed with:
6730 // Factor (stride) and VectorType=VFxiN.
6731 // The Cost accounts only for the shuffle sequence;
6732 // The cost of the loads/stores is accounted for separately.
6733 //
6734 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6735 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6736 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6737 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6738 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6739 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6740
6741 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6742 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6743 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6744
6745 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6746 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6747 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6748
6749 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6750 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6751 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6752 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6753
6754 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6755 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6756 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6757 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6758 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6759
6760 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6761 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6762 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6763 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6764 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6765
6766 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6767 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6768 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6769 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6770 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6771
6772 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6773 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6774 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6775 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6776
6777 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6778 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6779 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6780 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6781 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6782
6783 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6784 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6785 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6786 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6787 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6788
6789 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6790 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6791 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6792 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6793 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6794
6795 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6796 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6797 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6798 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6799
6800 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6801 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6802 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6803 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6804 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6805
6806 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6807 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6808 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6809 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6810 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6811
6812 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6813 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6814 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6815 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6816
6817 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6818 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6819 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6820
6821 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6822 };
6823
6824 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6825 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6826 };
6827
6828 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6829 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6830 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6831
6832 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6833 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6834
6835 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6836 };
6837
6838 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6839 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6840 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6841
6842 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6843 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6844 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6845
6846 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6847 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6848 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6849 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6850
6851 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6852 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6853 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6854 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6855 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6856
6857 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6858 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6859 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6860 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6861 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6862
6863 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6864 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6865 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6866 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6867 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6868
6869 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6870 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6871 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6872 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6873 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6874
6875 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6876 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6877 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6878 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6879
6880 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6881 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6882 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6883 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6884 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6885
6886 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6887 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6888 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6889 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6890 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6891
6892 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6893 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6894 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6895 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6896 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6897
6898 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6899 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6900 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6901 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6902
6903 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6904 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6905 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6906 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6907 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6908
6909 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6910 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6911 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6912 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6913 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6914
6915 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6916 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6917 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6918 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6919
6920 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6921 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6922 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6923 };
6924
6925 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6926 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6927 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6928 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6929
6930 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6931 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6932
6933 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6934 };
6935
6936 if (Opcode == Instruction::Load) {
6937 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6938 MemOpCosts](const CostTblEntry *Entry) {
6939 // NOTE: this is just an approximation!
6940 // It can over- or under-estimate the cost!
6941 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6942 };
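// Worked example (illustrative): with Factor = 3, an AVX2 v16i8 entry cost
// of 11, and only NumMembers = 2 of the 3 members actually used, the
// discounted cost is MemOpCosts + divideCeil(2 * 11, 3) = MemOpCosts + 8.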
6943
6944 if (ST->hasAVX2())
6945 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6946 ETy.getSimpleVT()))
6947 return GetDiscountedCost(Entry);
6948
6949 if (ST->hasSSSE3())
6950 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6951 ETy.getSimpleVT()))
6952 return GetDiscountedCost(Entry);
6953
6954 if (ST->hasSSE2())
6955 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6956 ETy.getSimpleVT()))
6957 return GetDiscountedCost(Entry);
6958 } else {
6959 assert(Opcode == Instruction::Store &&
6960 "Expected Store Instruction at this point");
6961 assert((!Indices.size() || Indices.size() == Factor) &&
6962 "Interleaved store only supports fully-interleaved groups.");
6963 if (ST->hasAVX2())
6964 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6965 ETy.getSimpleVT()))
6966 return MemOpCosts + Entry->Cost;
6967
6968 if (ST->hasSSE2())
6969 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6970 ETy.getSimpleVT()))
6971 return MemOpCosts + Entry->Cost;
6972 }
6973
6974 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6975 Alignment, AddressSpace, CostKind,
6976 UseMaskForCond, UseMaskForGaps);
6977}
6978
6979 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6980 StackOffset BaseOffset,
6981 bool HasBaseReg, int64_t Scale,
6982 unsigned AddrSpace) const {
6983 // Scaling factors are not free at all.
6984 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6985 // will take 2 allocations in the out of order engine instead of 1
6986 // for plain addressing mode, i.e. inst (reg1).
6987 // E.g.,
6988 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6989 // Requires two allocations (one for the load, one for the computation)
6990 // whereas:
6991 // vaddps (%rsi), %ymm0, %ymm1
6992 // Requires just 1 allocation, i.e., freeing allocations for other operations
6993 // and leaving fewer micro operations to execute.
6994 //
6995 // For some X86 architectures, this is even worse because for instance for
6996 // stores, the complex addressing mode forces the instruction to use the
6997 // "load" ports instead of the dedicated "store" port.
6998 // E.g., on Haswell:
6999 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7000 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7001 TargetLowering::AddrMode AM;
7002 AM.BaseGV = BaseGV;
7003 AM.BaseOffs = BaseOffset.getFixed();
7004 AM.HasBaseReg = HasBaseReg;
7005 AM.Scale = Scale;
7006 AM.ScalableOffset = BaseOffset.getScalable();
7007 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7008 // Scale represents reg2 * scale, thus account for 1
7009 // as soon as we use a second register.
7010 return AM.Scale != 0;
7011 return -1;
7012}
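// Illustrative usage (not part of the original file): a legal indexed mode
// such as (%rsi,%rdx,2) returns 1 because the nonzero Scale occupies a
// second register; plain (%rsi) returns 0; and Scale = 3, which x86 cannot
// encode, fails isLegalAddressingMode and yields -1.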
7013
7014 InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7015 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7016 return 14;
7017}
7018
7019 bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7020 unsigned Bits = Ty->getScalarSizeInBits();
7021
7022 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7023 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7024 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7025 return false;
7026
7027 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7028 // shifts just as cheap as scalar ones.
7029 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7030 return false;
7031
7032 // AVX512BW has shifts such as vpsllvw.
7033 if (ST->hasBWI() && Bits == 16)
7034 return false;
7035
7036 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7037 // fully general vector.
7038 return true;
7039}
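// Illustrative LLVM IR (not in the original file): on plain SSE2 this
// function returns true for <4 x i32>, so the splat-amount shift below is
// worth exposing to SDAG, which can select a single PSLLD with the count
// taken from an XMM register:
//
//   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r = shl <4 x i32> %x, %amt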
7040
7041unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7042 Type *ScalarValTy) const {
7043 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7044 return 4;
7045 }
7046 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7047}
7048
7049 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7050 SmallVectorImpl<Use *> &Ops) const {
7051 using namespace llvm::PatternMatch;
7052
7053 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7054 if (!VTy)
7055 return false;
7056
7057 if (I->getOpcode() == Instruction::Mul &&
7058 VTy->getElementType()->isIntegerTy(64)) {
7059 for (auto &Op : I->operands()) {
7060 // Make sure we are not already sinking this operand
7061 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7062 continue;
7063
7064 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7065 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7066 if (ST->hasSSE41() &&
7067 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7068 m_SpecificInt(32)))) {
7069 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7070 Ops.push_back(&Op);
7071 } else if (ST->hasSSE2() &&
7072 match(Op.get(),
7073 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7074 Ops.push_back(&Op);
7075 }
7076 }
7077
7078 return !Ops.empty();
7079 }
7080
7081 // A uniform shift amount in a vector shift or funnel shift may be much
7082 // cheaper than a generic variable vector shift, so make that pattern visible
7083 // to SDAG by sinking the shuffle instruction next to the shift.
7084 int ShiftAmountOpNum = -1;
7085 if (I->isShift())
7086 ShiftAmountOpNum = 1;
7087 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7088 if (II->getIntrinsicID() == Intrinsic::fshl ||
7089 II->getIntrinsicID() == Intrinsic::fshr)
7090 ShiftAmountOpNum = 2;
7091 }
7092
7093 if (ShiftAmountOpNum == -1)
7094 return false;
7095
7096 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7097 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7098 isVectorShiftByScalarCheap(I->getType())) {
7099 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7100 return true;
7101 }
7102
7103 return false;
7104}
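// Illustrative LLVM IR (not part of the original file) for the PMULUDQ case
// handled above on SSE2: the 'and' clears the high 32 bits of each 64-bit
// lane, and sinking it next to the multiply lets SDAG select PMULUDQ:
//
//   %lo = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
//   %p = mul <2 x i64> %lo, %b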
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:479
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:397
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:670
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:703
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:81
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
Definition: DerivedTypes.h:487
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Definition: DerivedTypes.h:541
Type * getElementType() const
Definition: DerivedTypes.h:460
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of a gather/scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isVectorShiftByScalarCheap(Type *Ty) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
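A hedged usage sketch: clients normally reach these hooks through the TargetTransformInfo facade rather than calling X86TTIImpl directly, as in this hypothetical helper.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost vectorMulCost(const TargetTransformInfo &TTI,
                                     Type *VecTy) {
  // Ask for reciprocal throughput, the default cost kind for vectorizers.
  return TTI.getArithmeticInstrCost(Instruction::Mul, VecTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}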
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
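A small sketch of the isScalable() guard these accessors require: getFixedValue() asserts on scalable quantities, so check first.
#include "llvm/Support/TypeSize.h"
#include <cstdint>
using namespace llvm;

static uint64_t fixedBitsOrMin(TypeSize TS) {
  return TS.isScalable() ? TS.getKnownMinValue() // vscale x Min bits
                         : TS.getFixedValue();   // exact compile-time size
}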
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
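A hedged example of APIntOps::ScaleBitMask, which this file references for rescaling demanded-element masks: widening splats each source bit across the wider positions, and narrowing (with MatchAllBits=false) ORs neighboring bits back together.
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void scaleBitMaskDemo() {
  APInt Narrow(4, 0b1010);
  APInt Wide = APIntOps::ScaleBitMask(Narrow, 8); // -> 0b11001100
  APInt Back = APIntOps::ScaleBitMask(Wide, 4);   // -> 0b1010 again
  (void)Back;
}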
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware nodes for multiplication; same semantics as [SU]ADDO above.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
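These ISD opcodes are the keys of the static cost tables in this file. An illustrative table in the same shape (the entries and costs below are made up, not the file's real numbers):
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static const CostTblEntry DemoSSE2CostTbl[] = {
    {ISD::ADD,  MVT::v4i32, 1},
    {ISD::SHL,  MVT::v8i16, 2},
    {ISD::SMIN, MVT::v8i16, 1},
};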
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
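A sketch combining the matchers above: recognize "(load Ptr) << C" where the load has a single use and C may be a splat constant with poison lanes. The helper name is hypothetical.
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isShiftedOneUseLoad(Value *V) {
  Value *Ptr;          // bound to the load's pointer operand
  const APInt *ShAmt;  // bound to the shift amount
  return match(V, m_Shl(m_OneUse(m_Load(m_Value(Ptr))),
                        m_APIntAllowPoison(ShAmt)));
}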
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
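The lookup idiom used against such tables, sketched under the assumption of a one-entry demo table; the real code typically scales Entry->Cost by the type-legalization factor.
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned demoLookup() {
  static const CostTblEntry Tbl[] = {{ISD::ADD, MVT::v4i32, 1}};
  if (const auto *Entry = CostTableLookup(Tbl, ISD::ADD, MVT::v4i32))
    return Entry->Cost; // real code multiplies by the legalization cost
  return ~0U;           // no entry: fall back to the base implementation
}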
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:555
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
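A brief arithmetic sketch with the helpers above, using hypothetical helper names: counting how many 16-byte vector registers an access spans, and finding the aligned start of the chunk containing an offset.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

static uint64_t regs16B(uint64_t Bytes) {
  return divideCeil(Bytes, 16); // ceil(Bytes / 16)
}

static uint64_t chunkStart(uint64_t Offset) {
  return alignDown(Offset, 16); // largest multiple of 16 <= Offset
}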
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
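The conversion-table variant is keyed by (ISD, Dst, Src) instead of a single type. An illustrative sketch with made-up entries:
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned demoCvtLookup() {
  static const TypeConversionCostTblEntry Tbl[] = {
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
      {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
  };
  if (const auto *Entry = ConvertCostTableLookup(Tbl, ISD::SINT_TO_FP,
                                                 MVT::v4f32, MVT::v4i32))
    return Entry->Cost;
  return ~0U; // no entry: defer to the base implementation
}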
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
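A sketch of the EVT guard applied before MVT-keyed table lookups: only simple (non-extended) types carry a SimpleValueType to index with, and scalable vectors have no fixed size. The helper name is hypothetical.
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

static bool tableIndexable(EVT VT) {
  if (!VT.isSimple() || !VT.isVector() || VT.isScalableVector())
    return false;
  return VT.getFixedSizeInBits() <= 512; // fits in one ZMM register
}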
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
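A sketch of normalizing MaybeAlign before alignment checks, as the memory-op cost hooks above do with their MaybeAlign parameters; underAligned is a hypothetical helper.
#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;

static bool underAligned(MaybeAlign MA, uint64_t AccessBytes) {
  Align A = MA.valueOrOne(); // unknown alignment conservatively becomes 1
  return A.value() < AccessBytes;
}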
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55