//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
/// Nehalem, as that was the first CPU to support that feature level and thus
/// most likely has the worst case cost, although we may discard an outlying
/// worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                    divss    sqrtss    rsqrtss
///   AMD K7           11-16    19        3
///   Piledriver       9-24     13-15     5
///   Jaguar           14       16        2
///   Pentium II,III   18       30        2
///   Nehalem          7-14     7-18      3
///   Haswell          10-13    11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
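///
/// For example, using the vdivss entry from the AVX2 cost table further
/// below, a single f32 fdiv is recorded as { 7, 13, 1, 1 }: ~7cy reciprocal
/// throughput, ~13cy latency, 1 instruction, and 1 micro-op.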
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
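
// Illustrative use: an entry initialized as { 7, 13, 1, 1 } answers
// Costs[TCK_Latency] with 13, while any field left at its ~0U default makes
// operator[] return std::nullopt, which the lookups below treat as "no table
// override; fall back to the default cost".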
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}
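
// e.g. with CF (APX CFCMOV) enabled, i16/i32/i64 and single-element vectors
// of those types qualify above; i8, f32/f64 and wider vectors do not (yet).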

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
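
// e.g. an AVX2-only target with the default preferred vector width reports
// 256-bit fixed-width vectors above; compiling with -mprefer-vector-width=128
// would cap the answer at 128 bits even though YMM registers exist.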

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
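
  // e.g. the promotion above costs a mul <8 x i8> as a zext to <8 x i16>,
  // a <8 x i16> multiply, and a trunc back to <8 x i8>, matching how such
  // multiplies are actually lowered.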

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
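
    // e.g. with both operands provably in the i15 range, a v4i32 multiply is
    // re-costed above as a v8i16 multiply, matching a single PMADDWD.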

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
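
  // e.g. the block above costs mul X, 8 as shl X, 3, and mul X, -8 as
  // shl X, 3 followed by a subtract from zero.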

  // On X86, vector signed division by a constant power-of-two is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }
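
  // Worked example of the SRA + SRL + ADD + SRA expansion costed above, for
  // vXi32 lanes:
  //   sdiv X, 4  ->  sra(add(X, srl(sra(X, 31), 30)), 2)
  // i.e. the sign is smeared, shifted down to form the rounding bias, added,
  // and the final arithmetic shift performs the division.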

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
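
  // e.g. udiv X, 16 is costed above as lshr X, 4, and urem X, 16 as
  // and X, 15.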

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };
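
  // GF2P8AFFINEQB can implement any uniform-constant vXi8 shift in a single
  // instruction: the shift is encoded as a constant 8x8 bit matrix that the
  // affine transform multiplies against each byte, hence the flat costs above.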

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };
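
  // The pmulh(u)w/pmuludq "sequences" in the constant tables above implement
  // division by a non-pow2 constant as a multiply by a fixed-point "magic"
  // reciprocal followed by shifts and fix-ups (see Hacker's Delight, ch. 10),
  // so the cost is that of a widening multiply plus a few cheap ops.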

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them
    // custom, in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }
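
  // e.g. shl <32 x i16> %v, <i16 1, i16 2, i16 3, ...> is costed (and
  // lowered) as mul %v, <i16 2, i16 4, i16 8, ...>, since 1 << C folds into
  // a constant multiplier per lane.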

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
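
  // XOP's vpshl/vpsha instructions take per-element shift amounts that are
  // negative for right shifts; a constant right-shift amount can be negated
  // at compile time, which is why constant SRL/SRA are re-costed as SHL above.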

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm pmuludq throughput is 2, shift throughput 1, addq throughput 4,
    // thus: 3x2 (pmuludq throughput) + 3x1 (shift throughput) +
    // 2x4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };
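
  // The long-multiply expansion referenced above computes, per 64-bit lane:
  //   lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)
  // using 3 pmuludq for the partial products, 2 shifts to extract the high
  // halves, 1 shift for the <<32, and 2 adds.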

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1291
1292 static const CostKindTblEntry SSE42CostTable[] = {
1293 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1294 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297
1298 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1299    { ISD::FSUB,  MVT::f32,   { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302
1303 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307
1308 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312
1313 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1314 };
1315
1316 if (ST->hasSSE42())
1317 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1318 if (auto KindCost = Entry->Cost[CostKind])
1319 return LT.first * *KindCost;
1320
1321 static const CostKindTblEntry SSE41CostTable[] = {
1322 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1323 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1325
1326 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1327 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1329 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1330
1331 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1332 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1334 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1335
1336 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1337 };
1338
1339 if (ST->hasSSE41())
1340 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1341 if (auto KindCost = Entry->Cost[CostKind])
1342 return LT.first * *KindCost;
1343
1344 static const CostKindTblEntry SSSE3CostTable[] = {
1345 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1346 };
1347
1348 if (ST->hasSSSE3())
1349 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSE2CostTable[] = {
1354 // We don't correctly identify costs of casts because they are marked as
1355 // custom.
1356 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1357 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1358 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1359 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1360
1361 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1362 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1363 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1364 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1365
1366 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1367 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1368 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1369 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1370
1371 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1372 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1375
1376 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1377 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1380
1381 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1382 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1385
1386 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1387 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1388
1389 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1390 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1391 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1392 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1393
1394 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1395
1396 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1397 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400
1401 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1402 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405
1406 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413
1414 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 };
1417
1418 if (ST->hasSSE2())
1419 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1420 if (auto KindCost = Entry->Cost[CostKind])
1421 return LT.first * *KindCost;
1422
1423 static const CostKindTblEntry SSE1CostTable[] = {
1424 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1425 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1426
1427 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1428 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429
1430 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1431 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432
1433 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 };
1439
1440 if (ST->hasSSE1())
1441 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1442 if (auto KindCost = Entry->Cost[CostKind])
1443 return LT.first * *KindCost;
1444
1445 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1446 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1447 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1449 };
1450
1451 if (ST->is64Bit())
1452 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1453 if (auto KindCost = Entry->Cost[CostKind])
1454 return LT.first * *KindCost;
1455
1456 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1457 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1458 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1460
1461 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1462 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1464
1465 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1466 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1468
1469 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1470 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1471 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1473 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1474 };
1475
1476 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1477 if (auto KindCost = Entry->Cost[CostKind])
1478 return LT.first * *KindCost;
1479
1480  // It is not a good idea to vectorize division. We have to scalarize it and
1481  // in the process we will often end up having to spill regular
1482  // registers. The overhead of division is going to dominate most kernels
1483  // anyway, so try hard to prevent vectorization of division - it is
1484  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1485  // to hide "20 cycles" for each lane.
1486 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1487 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1488 ISD == ISD::UREM)) {
1489    InstructionCost ScalarCost =
1490        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1491                               Op1Info.getNoProps(), Op2Info.getNoProps());
1492 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1493 }
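  // A worked example of the formula above (a sketch with illustrative
  // numbers, not values taken from the tables): for a <4 x i32> sdiv queried
  // for TCK_RecipThroughput, with LT.first == 1 and a scalar sdiv cost of 6,
  // the returned cost is 20 * 1 * 4 * 6 = 480 - deliberately large enough
  // that the vectorizers keep division scalar.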
1494
1495 // Handle some basic single instruction code size cases.
1496 if (CostKind == TTI::TCK_CodeSize) {
1497 switch (ISD) {
1498 case ISD::FADD:
1499 case ISD::FSUB:
1500 case ISD::FMUL:
1501 case ISD::FDIV:
1502 case ISD::FNEG:
1503 case ISD::AND:
1504 case ISD::OR:
1505 case ISD::XOR:
1506      return LT.first;
1508 }
1509 }
1510
1511 // Fallback to the default implementation.
1512 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1513 Args, CxtI);
1514}
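// A minimal sketch of how a client might reach this hook through the public
// TargetTransformInfo wrapper. The TTI calls are the standard API; `Ctx` and
// the surrounding setup are hypothetical:
//
//   auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::FDiv, V4F32, TTI::TCK_RecipThroughput);
//
// On an SSE4.2-only subtarget this should land on the SSE42CostTable entry
// { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } } above and return 14.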
1515
1516InstructionCost
1517X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1518                            unsigned Opcode1, const SmallBitVector &OpcodeMask,
1519                            TTI::TargetCostKind CostKind) const {
1520  if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1521    return TTI::TCC_Basic;
1522  return InstructionCost::getInvalid();
1523}
1524
1525InstructionCost X86TTIImpl::getShuffleCost(
1526    TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1527 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1528 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1529 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1530 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1531 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1532
1533 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1534
1535  // If all args are constant then this will be constant folded away.
1536 if (!Args.empty() &&
1537 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1538 return TTI::TCC_Free;
1539
1540 // Recognize a basic concat_vector shuffle.
1541 if (Kind == TTI::SK_PermuteTwoSrc &&
1542 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1543 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1544    return getShuffleCost(TTI::SK_InsertSubvector,
1545                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1546                          CostKind, Mask.size() / 2, BaseTp);
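  // For example (illustrative IR): concatenating two v4i32 values,
  //   shufflevector <4 x i32> %a, <4 x i32> %b,
  //                 <8 x i32> <i32 0, ..., i32 7>
  // has a size-8 identity mask, so it is costed as inserting a v4i32
  // subvector into the upper half of a v8i32 rather than as a generic
  // two-source permute.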
1547
1548 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1549 if (Kind == TTI::SK_Transpose)
1550 Kind = TTI::SK_PermuteTwoSrc;
1551
1552 if (Kind == TTI::SK_Broadcast) {
1553    // For Broadcasts we are splatting the first element from the first input
1554    // register, so we only need to reference that input; all the output
1555    // registers are the same.
1556 LT.first = 1;
1557
1558 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1559 using namespace PatternMatch;
1560 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1561 (ST->hasAVX2() ||
1562 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1563 return TTI::TCC_Free;
1564 }
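  // For example (illustrative IR, with the insertelement step elided): on an
  // AVX2 subtarget a splat whose scalar is a single-use load,
  //   %x = load float, ptr %p
  //   %s = shufflevector ... <8 x i32> zeroinitializer
  // folds into one vbroadcastss from memory, so the shuffle itself is
  // reported as TCC_Free here.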
1565
1566  // Attempt to detect a cheaper in-lane shuffle, avoiding a 128-bit
1567  // subvector permutation.
1568 bool IsInLaneShuffle = false;
1569 if (BaseTp->getPrimitiveSizeInBits() > 0 &&
1570 (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
1571 BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1572 Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
1573 unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
1574 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1575 if ((Mask.size() % NumLanes) == 0)
1576 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1577 return P.value() == PoisonMaskElem ||
1578 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1579 (P.index() / NumEltsPerLane);
1580 });
1581 }
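  // For example (illustrative): for v8i32 - two 128-bit lanes of four
  // elements - the mask <1,0,3,2, 5,4,7,6> is in-lane because every output
  // element comes from its own lane, while <4,5,6,7, 0,1,2,3> crosses lanes
  // and is not matched here.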
1582
1583 // Treat <X x bfloat> shuffles as <X x half>.
1584 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1585 LT.second = LT.second.changeVectorElementType(MVT::f16);
1586
1587 // Subvector extractions are free if they start at the beginning of a
1588 // vector and cheap if the subvectors are aligned.
1589 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1590 int NumElts = LT.second.getVectorNumElements();
1591 if ((Index % NumElts) == 0)
1592 return TTI::TCC_Free;
1593 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1594 if (SubLT.second.isVector()) {
1595 int NumSubElts = SubLT.second.getVectorNumElements();
1596 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1597 return SubLT.first;
1598 // Handle some cases for widening legalization. For now we only handle
1599 // cases where the original subvector was naturally aligned and evenly
1600 // fit in its legalized subvector type.
1601 // FIXME: Remove some of the alignment restrictions.
1602 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1603 // vectors.
1604 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1605 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1606 (NumSubElts % OrigSubElts) == 0 &&
1607 LT.second.getVectorElementType() ==
1608 SubLT.second.getVectorElementType() &&
1609            LT.second.getVectorElementType().getSizeInBits() ==
1610                SubLT.second.getVectorElementType().getSizeInBits()) {
1611 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1612 "Unexpected number of elements!");
1613 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1614 LT.second.getVectorNumElements());
1615 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1616 SubLT.second.getVectorNumElements());
1617 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1618 InstructionCost ExtractCost = getShuffleCost(
1619 TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
1620
1621 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1622 // if we have SSSE3 we can use pshufb.
1623 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1624 return ExtractCost + 1; // pshufd or pshufb
1625
1626 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1627 "Unexpected vector size");
1628
1629 return ExtractCost + 2; // worst case pshufhw + pshufd
1630 }
1631 }
1632    // If the extract subvector is not optimal, treat it as a single-op shuffle.
1633    Kind = TTI::SK_PermuteSingleSrc;
1634  }
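  // For example (illustrative, AVX with a legal v8f32): extracting a v4f32
  // starting at element 0 is TCC_Free; starting at element 4 it is the
  // aligned case and costs SubLT.first; starting at element 2 it falls
  // through and is treated as a single-source permute.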
1635
1636 // Subvector insertions are cheap if the subvectors are aligned.
1637 // Note that in general, the insertion starting at the beginning of a vector
1638 // isn't free, because we need to preserve the rest of the wide vector,
1639 // but if the destination vector legalizes to the same width as the subvector
1640 // then the insertion will simplify to a (free) register copy.
1641 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1642 int NumElts = LT.second.getVectorNumElements();
1643 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1644 if (SubLT.second.isVector()) {
1645 int NumSubElts = SubLT.second.getVectorNumElements();
1646 bool MatchingTypes =
1647 NumElts == NumSubElts &&
1648 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1649 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1650 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1651 }
1652
1653 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1654 Kind = TTI::SK_PermuteTwoSrc;
1655 }
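  // For example (illustrative, legal v8i32): inserting a v4i32 subvector at
  // element 4 is aligned and costs SubLT.first, while inserting at element 2
  // is re-costed below as a two-source permute.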
1656
1657 // Handle some common (illegal) sub-vector types as they are often very cheap
1658 // to shuffle even on targets without PSHUFB.
1659 EVT VT = TLI->getValueType(DL, BaseTp);
1660 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1661 !ST->hasSSSE3()) {
1662 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1663 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1664 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1665 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1666 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1667 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1668
1669 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1670 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1671 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1672 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1673
1674 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1675 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1676 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1677 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1678
1679 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1680 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1681 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1682 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1683 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1684
1685 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1686 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1687 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1688 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1689 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1690 };
1691
1692 if (ST->hasSSE2())
1693 if (const auto *Entry =
1694 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1695 return Entry->Cost;
1696 }
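  // For example (illustrative): reversing a v4i16 - an illegal 64-bit vector
  // that is widened to v8i16 - is still a single pshuflw on a pre-SSSE3,
  // SSE2 subtarget per the table above.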
1697
1698  // We are going to permute multiple sources and the result will be in
1699  // multiple destinations. We provide an accurate cost only for splits where
1700  // the element type remains the same.
1701 if ((Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_PermuteTwoSrc) &&
1702 LT.first != 1) {
1703 MVT LegalVT = LT.second;
1704 if (LegalVT.isVector() &&
1705        LegalVT.getVectorElementType().getSizeInBits() ==
1706            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1707 LegalVT.getVectorNumElements() <
1708 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1709 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1710 unsigned LegalVTSize = LegalVT.getStoreSize();
1711 // Number of source vectors after legalization:
1712 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1713 // Number of destination vectors after legalization:
1714 InstructionCost NumOfDests = LT.first;
1715
1716 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1717 LegalVT.getVectorNumElements());
1718
1719 if (!Mask.empty() && NumOfDests.isValid()) {
1720        // Try to perform a better estimation of the permutation.
1721        // 1. Split the source/destination vectors into real registers.
1722        // 2. Analyze the mask to identify which real registers are
1723        // permuted. If more than one source register is used to build a
1724        // destination register, the cost for this destination register
1725        // is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only
1726        // one source register is used, build the mask and calculate the cost
1727        // as a cost of PermuteSingleSrc.
1728        // Also, for the single-register permute we try to identify if the
1729        // destination register is just a copy of the source register or a
1730        // copy of the previous destination register (the cost is
1731        // TTI::TCC_Basic). If the source register is just reused, the cost
1732        // for this operation is TTI::TCC_Free.
1733      NumOfDests =
1734          getTypeLegalizationCost(
1735              FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1736 .first;
1737 unsigned E = *NumOfDests.getValue();
1738 unsigned NormalizedVF =
1739 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1740 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1741 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1742 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1743 copy(Mask, NormalizedMask.begin());
1744 unsigned PrevSrcReg = 0;
1745 ArrayRef<int> PrevRegMask;
1746      InstructionCost Cost = 0;
1747      processShuffleMasks(
1748          NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1749 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1750 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1751 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1752 // Check if the previous register can be just copied to the next
1753 // one.
1754 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1755                    PrevRegMask != RegMask)
1756                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1757                                         RegMask, CostKind, 0, nullptr);
1758                else
1759                  // Just a copy of previous destination register.
1760                  Cost += TTI::TCC_Basic;
1761                return;
1762 }
1763 if (SrcReg != DestReg &&
1764 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1765 // Just a copy of the source register.
1766                Cost += TTI::TCC_Basic;
1767              }
1768 PrevSrcReg = SrcReg;
1769 PrevRegMask = RegMask;
1770 },
1771 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1772 unsigned /*Unused*/,
1773 unsigned /*Unused*/) {
1774 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1775 CostKind, 0, nullptr);
1776 });
1777 return Cost;
1778 }
1779
1780 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1781 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1782 {}, CostKind, 0, nullptr);
1783 }
1784
1785 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1786 }
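  // For example (illustrative): a single-source v16i32 permute on an AVX2
  // subtarget legalizes to two v8i32 registers (LT.first == 2), so the mask
  // is decomposed per destination register and each piece is costed as a
  // v8i32 PermuteSingleSrc/PermuteTwoSrc, or as a free/basic register copy,
  // by the lambdas above.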
1787
1788 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1789 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1790 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1791
1792 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1793 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1794
1795 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1796 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1797 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1798 };
1799
1800 if (ST->hasVBMI())
1801 if (const auto *Entry =
1802 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1803 return LT.first * Entry->Cost;
1804
1805 static const CostTblEntry AVX512BWShuffleTbl[] = {
1806 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1807 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1808 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1809
1810 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1811 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1812 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1813 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1814
1815 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1816 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1817 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1818 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1819 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1820
1821 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1822 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1823 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1824 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1825 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1826
1827 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1828 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1829
1830 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1831 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1832 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1833 };
1834
1835 if (ST->hasBWI())
1836 if (const auto *Entry =
1837 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1838 return LT.first * Entry->Cost;
1839
1840 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1841 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1842 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1843 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1844 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1845 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1846 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1847 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1848
1849 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1850 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1851 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1852 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1853 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1854 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1855 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1856
1857 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1858 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1859 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1860 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1861 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1862 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1863 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1864 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1865 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1866 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1867 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1868
1869 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1870 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1871 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1872 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1873 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1874 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1875 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1876 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1877 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1878 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1879 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1880 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1881 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1882
1883 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1884 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1885 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1886 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1887 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1888 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1889 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1890 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1891 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1892 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1893 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1894 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1895
1896 // FIXME: This just applies the type legalization cost rules above
1897 // assuming these completely split.
1898 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1899 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1900 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1901 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1902 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1903 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1904
1905 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1906 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1907 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1908 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1909 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1910 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1911 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1912 };
1913
1914 if (ST->hasAVX512())
1915 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1916 if (auto KindCost = Entry->Cost[CostKind])
1917 return LT.first * *KindCost;
1918
1919 static const CostTblEntry AVX2InLaneShuffleTbl[] = {
1920 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpshufb
1921 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 1}, // vpshufb
1922 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpshufb
1923
1924 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
1925 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
1926 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpshufd + vpblendd
1927 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpshufd + vpblendd
1928 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // 2*vpshufb + vpor
1929 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // 2*vpshufb + vpor
1930 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // 2*vpshufb + vpor
1931 };
1932
1933 if (IsInLaneShuffle && ST->hasAVX2())
1934 if (const auto *Entry =
1935 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1936 return LT.first * Entry->Cost;
1937
1938 static const CostTblEntry AVX2ShuffleTbl[] = {
1939 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1940 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1941 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1942 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1943 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1944 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1945 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1946
1947 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1948 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1949 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1950 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1951 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1952 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1953 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1954
1955 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1956 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1957 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1958
1959 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1960 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1961 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1962 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1963 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1964
1965 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1966 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1967 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1968 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1969 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1970 // + vpblendvb
1971 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1972 // + vpblendvb
1973 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1974 // + vpblendvb
1975
1976 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1977 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1978 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1979 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1980 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1981 // + vpblendvb
1982 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1983 // + vpblendvb
1984 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1985 // + vpblendvb
1986 };
1987
1988 if (ST->hasAVX2())
1989 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1990 return LT.first * Entry->Cost;
1991
1992 static const CostTblEntry XOPShuffleTbl[] = {
1993 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1994 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1995 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1996 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1997 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1998 // + vinsertf128
1999 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
2000 // + vinsertf128
2001
2002 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
2003 // + vinsertf128
2004 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
2005 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
2006 // + vinsertf128
2007 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
2008 };
2009
2010 if (ST->hasXOP())
2011 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2012 return LT.first * Entry->Cost;
2013
2014 static const CostTblEntry AVX1InLaneShuffleTbl[] = {
2015 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermilpd
2016 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermilpd
2017 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermilps
2018 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermilps
2019
2020 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2021 // + vpor + vinsertf128
2022 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2023 // + vpor + vinsertf128
2024 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2025 // + vpor + vinsertf128
2026
2027 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 2}, // 2*vshufpd + vblendpd
2028 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 2}, // 2*vshufps + vblendps
2029 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 2}, // 2*vpermilpd + vblendpd
2030 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 2}, // 2*vpermilps + vblendps
2031 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 4*pshufb
2032 // + 2*vpor + vinsertf128
2033 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 9}, // 2*vextractf128 + 4*pshufb
2034 // + 2*vpor + vinsertf128
2035 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 4*pshufb
2036 // + 2*vpor + vinsertf128
2037 };
2038
2039 if (IsInLaneShuffle && ST->hasAVX())
2040 if (const auto *Entry =
2041 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2042 return LT.first * Entry->Cost;
2043
2044 static const CostTblEntry AVX1ShuffleTbl[] = {
2045 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2046 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2047 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2048 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2049 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
2050 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
2051 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
2052
2053 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
2054 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
2055 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
2056 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
2057 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
2058 // + vinsertf128
2059 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
2060 // + vinsertf128
2061 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
2062 // + vinsertf128
2063
2064 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
2065 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
2066 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2067 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2068 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2069 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2070 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2071
2072 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2073 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2074 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2075 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2076 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2077 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2078 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2079
2080 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2081 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2082 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2083 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2084 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2085 // + 2*por + vinsertf128
2086 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2087 // + 2*por + vinsertf128
2088 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2089 // + 2*por + vinsertf128
2090
2091 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2092 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2093 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2094 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2095 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2096 // + 4*por + vinsertf128
2097 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2098 // + 4*por + vinsertf128
2099 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2100 // + 4*por + vinsertf128
2101 };
2102
2103 if (ST->hasAVX())
2104 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2105 return LT.first * Entry->Cost;
2106
2107 static const CostTblEntry SSE41ShuffleTbl[] = {
2108 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2109 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2110 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2111 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2112 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2113 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2114 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2115 };
2116
2117 if (ST->hasSSE41())
2118 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2119 return LT.first * Entry->Cost;
2120
2121 static const CostTblEntry SSSE3ShuffleTbl[] = {
2122 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2123 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2124 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2125
2126 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2127 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2128 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2129
2130 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2131 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2132 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2133
2134 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2135 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2136 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2137 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2138 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2139
2140 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2141 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2142 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2143
2144 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2145 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2146 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2147 };
2148
2149 if (ST->hasSSSE3())
2150 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2151 return LT.first * Entry->Cost;
2152
2153 static const CostTblEntry SSE2ShuffleTbl[] = {
2154 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2155 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2156 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2157 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2158 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2159 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2160
2161 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2162 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2163 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2164 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2165 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2166 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2167 // + 2*pshufd + 2*unpck + packus
2168
2169 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2170 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2171 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2172 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2173 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2174 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2175
2176 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2177 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2178 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2179      {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2180      {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2181      {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2182
2183 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2184 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2185 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2186 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2187 // + pshufd/unpck
2188 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2189 // + pshufd/unpck
2190 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2191 // + 2*pshufd + 2*unpck + 2*packus
2192
2193 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2194 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2195 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2196 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2197 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2198 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2199 };
2200
2201 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2202 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2203 };
2204
2205 if (ST->hasSSE2()) {
2206 bool IsLoad =
2207 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2208 if (ST->hasSSE3() && IsLoad)
2209 if (const auto *Entry =
2210 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2211        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2212                                    LT.second.getVectorElementCount()) &&
2213 "Table entry missing from isLegalBroadcastLoad()");
2214 return LT.first * Entry->Cost;
2215 }
2216
2217 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2218 return LT.first * Entry->Cost;
2219 }
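  // For example (illustrative): with SSE3, splatting a loaded double into a
  // v2f64 is a single movddup from memory, which is why the broadcast-load
  // entry above carries a cost of 0.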
2220
2221 static const CostTblEntry SSE1ShuffleTbl[] = {
2222 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2223 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2224 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2225 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2226 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2227 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2228 };
2229
2230 if (ST->hasSSE1())
2231 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2232 return LT.first * Entry->Cost;
2233
2234 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2235}
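// A minimal sketch of querying the shuffle tables via the public
// TargetTransformInfo interface. The TTI calls are the standard API; `Ctx`
// and `TTI` themselves are assumed to exist:
//
//   auto *V8I16 = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
//   SmallVector<int, 8> Rev = {7, 6, 5, 4, 3, 2, 1, 0};
//   InstructionCost C = TTI.getShuffleCost(TTI::SK_Reverse, V8I16, Rev,
//                                          TTI::TCK_RecipThroughput);
//
// On an SSSE3 subtarget this should hit {TTI::SK_Reverse, MVT::v8i16, 1}
// (a single pshufb) above.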
2236
2237InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2238                                             Type *Src,
2239                                             TTI::CastContextHint CCH,
2240                                             TTI::TargetCostKind CostKind,
2241                                             const Instruction *I) {
2242 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2243 assert(ISD && "Invalid opcode");
2244
2245  // The cost tables include both specific, custom (non-legal) src/dst type
2246  // conversions and generic, legalized types. We test for custom conversions
2247  // first, before falling back to legalization.
2248  // FIXME: Need a better cost-table design to handle the potentially massive
2249  // number of non-simple type combinations (elem_num x src_type x dst_type).
2250 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2251 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2252 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2253
2254 // Mask sign extend has an instruction.
2255 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2256 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2257 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2258 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2259 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2260 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2261 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2262 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2263 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2264 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2265 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2266 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2267 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2268 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2269 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2270 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2271 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2272
2273 // Mask zero extend is a sext + shift.
2274 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2275 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2276 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2277 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2278 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2279 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2280 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2281 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2282 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2283 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2284 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2285 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2286 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2287 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2288 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2289 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2290 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2291
2292 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2293 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2294 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2295 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2296 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2297 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2298 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2299 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2300 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2301 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2302 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2303 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2304 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2305 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2306 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2307 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2308 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2309
2310 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2311 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2312 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2313 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2314 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2315 };
2316
2317 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2318 // Mask sign extend has an instruction.
2319 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2320 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2321 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2322 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2323 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2324 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2325 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2326 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2327
2328 // Mask zero extend is a sext + shift.
2329 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2330 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2331 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2332 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2333 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2334 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2335 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2336 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2337
2338 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2339 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2340 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2341 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2342 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2343 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2344 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2345 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2346
2347 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2348 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2349
2350 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2351 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2352
2353 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2354 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2355
2356 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2357 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2358 };
2359
2360 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2361 // 256-bit wide vectors.
2362
2363 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2364 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2365 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2366 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2367 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2368 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2369 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2370 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2371
2372 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2374 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2375 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2376 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2377 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2378 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2379 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2380 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2381 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2382 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2383 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2384 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2385 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2386 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2387 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2388 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2389 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2390 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2391 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2392 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2393 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2394 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2395 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2396 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2397 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2398 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2399 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2400 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2401 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2402 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2403 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2404 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2405 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2406
2407 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2408 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2410
2411 // Sign extend is zmm vpternlogd+vptruncdb.
2412 // Zero extend is zmm broadcast load+vptruncdw.
2413 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2414 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2415 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2416 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2417 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2418 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2419 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2420 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2421
2422 // Sign extend is zmm vpternlogd+vptruncdw.
2423 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2424 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2425 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2426 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2427 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2428 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2429 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2430 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2431 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2432
2433 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2434 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2435 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2436 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2437 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2438 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2439 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2440 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2441 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2442 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2443
2444 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2445 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2446 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2447 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2448
2449 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2450 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2452 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2453 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2454 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2455 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2456 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2457 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2459
2460 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2461 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2462
2463 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2464 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2465 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2466 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2467 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2468 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2469 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2470 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2471
2472 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2473 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2474 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2475 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2476 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2477 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2478 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2479 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2480 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2481 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2482
2483 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2484 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2485 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2486 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2487 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2488 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2489 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2490 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2491 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2492 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2493 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2494
2495 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2496 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2497 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2498 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2499 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2500 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2501 };
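// Worked example (informal): for
//   %t = trunc <16 x i64> %x to <16 x i8>
// the v16i8<-v16i64 entry above models the 2*vpmovqd+concat+vpmovdb
// sequence, so a TCK_RecipThroughput query resolves to 5 while the latency,
// code-size and size+latency kinds each resolve to 1 from the same 4-tuple.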
2502
2503 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2504 // Mask sign extend has an instruction.
2505 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2506 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2507 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2508 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2509 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2510 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2511 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2512 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2513 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2514 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2515 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2516 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2517 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2518 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2519 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2520 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2522
2523 // Mask zero extend is a sext + shift.
2524 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2525 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2527 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2528 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2529 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2530 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2531 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2532 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2534 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2536 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2537 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2538 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2539 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2540 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2541
2542 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2543 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2544 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2545 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2546 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2547 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2548 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2549 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2550 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2551 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2552 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2553 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2554 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2555 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2556 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2557 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2558 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2559
2560 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2561 };
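// A sketch of why the mask extends above differ by one: AVX512BW's
// vpmovm2b/vpmovm2w expand a k-mask straight into -1/0 lanes, e.g.
//   %s = sext <16 x i1> %k to <16 x i8>   ; a single vpmovm2b
// while zero extension is that same sign extend followed by a shift to turn
// -1 into 1, hence the costs of 1 vs 2.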
2562
2563 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2564 // Mask sign extend has an instruction.
2565 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2566 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2567 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2569 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2571 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2572 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2573
2574 // Mask zero extend is a sext + shift.
2575 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2576 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2577 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2578 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2579 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2580 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2581 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2582 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2583
2584 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2585 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2586 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2587 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2588 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2589 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2590 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2591 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2592
2593 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2594 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2595 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2596 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2597
2598 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2599 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2600 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2601 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2602
2603 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2604 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2605 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2606 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2607
2608 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2609 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2610 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2611 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2612 };
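// The uniform cost of 1 for the i64 <-> fp entries above assumes AVX512DQ's
// direct vcvtqq2ps/vcvtqq2pd/vcvtuqq2ps/vcvtuqq2pd and the truncating
// vcvttps2qq/vcvttpd2qq-style forms; without DQ the same conversions show up
// in other tables as multi-instruction expansions (e.g. the {5,...}
// UINT_TO_FP v2i64 entries).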
2613
2614 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2615 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2617 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2618 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2619 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2620 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2621 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2622 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2623 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2624 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2625 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2626 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2627 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2628 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2629 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2630 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2631 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2632 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovdb
2633
2634 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2635 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2636 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2637 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2639 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2641 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2643 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2644
2645 // sign extend is vpcmpeq+maskedmove+vpmovdw
2646 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2647 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2649 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2651 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2653 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2655
2656 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2657 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2658 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2659 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2660 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2661 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2662 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2663 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2664
2665 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2666 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2667 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2668 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2669
2670 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2671 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2672 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2673 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2674 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2675 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2676 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2677 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2678 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2679 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2680 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2681 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2682
2683 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2684 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2685 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2686 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2687
2688 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2689 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2690 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2691 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2692 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2693 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2694 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2695 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2696 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2697 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2698 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2699 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2700 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2701
2702 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2703 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2704 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2705
2706 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2707 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2708 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2709 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2710 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2711 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2712 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2713 };
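// Informal reading of the vpternlog entries above: sign extending a k-mask
// into i32/i64 lanes is a single zero-masked vpternlogd/vpternlogq with an
// all-ones immediate, writing -1 in the active lanes; the zero-extend
// variants append a logical shift (psrld/psrlq) to turn -1 into 1, hence
// the 1 vs 2 costs.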
2714
2715 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2716 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2717 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2718 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2719 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2720 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2721 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2722
2723 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2725 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2726 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2727 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2728 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2729 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2730 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2731 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2732 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2733 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2734 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2735 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2736 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2737
2738 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2739
2740 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2741 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2742 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2743 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2744 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2745 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2746 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2747 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2748 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2749 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2750 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2751 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2752
2753 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2754 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2755
2756 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2757 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2758 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2759 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2760
2761 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2762 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2763 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2764 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2765 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2766 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2767 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2768 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2769
2770 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2771 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2772 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2773 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2774 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2775 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2776 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2777
2778 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2779 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2780 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2781 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2782 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2783 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2784 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2785 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2786 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2787 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2788 };
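// One plausible reading of the FP_EXTEND v8f64<-v8f32 entry above: v8f64
// splits into two 256-bit halves, so the lowering is roughly
//   vcvtps2pd ymm1, xmm0; vextractf128 xmm2, ymm0, 1; vcvtps2pd ymm3, xmm2
// i.e. three instructions, matching the throughput cost of 3.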
2789
2790 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2791 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2792 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2793 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2797
2798 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2799 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2800 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2801 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2802 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2803 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2804 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2805 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2806 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2807 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2808 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2809 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2810
2811 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2812 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2813 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2814 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2815 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2816
2817 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2820 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2822 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2823 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2824 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2825
2826 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2827 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2828 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2829 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2830 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2831 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2832 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2833 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2834 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2835 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2836 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2837 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2838
2839 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2840 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2841 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2842 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2843 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2844 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2845 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2846 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2847 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2848 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2856
2857 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2858 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2859 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2860 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2861 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2862 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2863 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2864 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2865 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2866 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2867 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2868
2869 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2870 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2871 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2872 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2874 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2875 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2876 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2877 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2878 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2879 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2880 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2881 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2882
2883 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2884 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2885 };
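// AVX1 has no 256-bit integer ALU (mirroring the "AVX1 does not support
// 8-wide integer compare" note in the SETCC tables below), so the 256-bit
// extend entries above cost 3: roughly two 128-bit pmovsx/pmovzx ops plus a
// vinsertf128 to reassemble the ymm result.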
2886
2887 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2888 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2889 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2890 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2891 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2892 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2893 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2894 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2895 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2896 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2897 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2898 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2899 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2900
2901 // These truncates end up widening elements.
2902 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2903 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2904 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2905
2906 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2907 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2908 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2909
2910 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2911 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2912 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2913 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2914 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2915 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2916 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2917 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2918 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2919 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2920 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2921
2922 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2924 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2925 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2926 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2927 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2928 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2929 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2930 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2931 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2932 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2933 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2934 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2935 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2936
2937 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2938 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2939 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2940 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2941 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2944 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2945 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2946 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2947
2948 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2950 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2951 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2952 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2953 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2954 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2955 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2956 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2957 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2958 };
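// On the "truncates end up widening elements" entries above: an illegal
// <2 x i1> result lives in widened lanes after legalization, so e.g.
//   %t = trunc <2 x i8> %x to <2 x i1>
// only needs one pmovzxbq-style spread into qword lanes and no actual
// narrowing instruction, hence the cost of 1.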
2959
2960 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2961 // These are somewhat magic numbers justified by comparing the
2962 // output of llvm-mca for our various supported scheduler models
2963 // and basing them on the worst case scenario.
2964 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2965 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2966 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2967 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2968 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2969 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2970 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2971 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2972 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2973 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2974 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2975 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2976
2977 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2978 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2979 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2980 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2981 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2982 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2983 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2984 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2985 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2986 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2987 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2988 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2989 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2990
2991 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2992 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2993 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2994 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2995 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2996 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2997 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2998 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2999 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3000 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3001
3002 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3003 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3004 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3005 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3006 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3007 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3008 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3009 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3010 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3011 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3012
3013 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3014 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3015 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3016 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3017 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3018 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3019 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3020 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3021 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3022 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3023 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3024 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3025
3026 // These truncates are really widening elements.
3027 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3028 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3029 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3030 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3031 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3032 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3033
3034 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3035 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3036 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3037 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3038 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3039 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3040 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3041 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3042 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3043 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3044 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3045 };
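// Minimal sketch of the PAND+PACKUSWB pattern charged above for a
// v16i8<-v8i16 truncate:
//   pand     xmm0, [wordmask_00ff]  ; clear the high byte of each word
//   packuswb xmm0, xmm0             ; pack the words down to bytes
// two instructions, matching the {2,...} entry.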
3046
3047 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3048 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3049 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3050 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3051 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3052 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3053 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3054 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3055 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3056 };
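// These entries assume F16C's vcvtph2ps/vcvtps2ph. There is no direct
// f16<->f64 form, so those rows are modeled as the two-step hop through f32
// noted in their comments, e.g. vcvtph2ps then vcvtps2pd for the
// v4f64<-v4f16 FP_EXTEND entry.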
3057
3058 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3059 EVT SrcTy = TLI->getValueType(DL, Src);
3060 EVT DstTy = TLI->getValueType(DL, Dst);
3061
3062 // The function getSimpleVT only handles simple value types.
3063 if (SrcTy.isSimple() && DstTy.isSimple()) {
3064 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3065 MVT SimpleDstTy = DstTy.getSimpleVT();
3066
3067 if (ST->useAVX512Regs()) {
3068 if (ST->hasBWI())
3069 if (const auto *Entry = ConvertCostTableLookup(
3070 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3071 if (auto KindCost = Entry->Cost[CostKind])
3072 return *KindCost;
3073
3074 if (ST->hasDQI())
3075 if (const auto *Entry = ConvertCostTableLookup(
3076 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3077 if (auto KindCost = Entry->Cost[CostKind])
3078 return *KindCost;
3079
3080 if (ST->hasAVX512())
3081 if (const auto *Entry = ConvertCostTableLookup(
3082 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3083 if (auto KindCost = Entry->Cost[CostKind])
3084 return *KindCost;
3085 }
3086
3087 if (ST->hasBWI())
3088 if (const auto *Entry = ConvertCostTableLookup(
3089 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3090 if (auto KindCost = Entry->Cost[CostKind])
3091 return *KindCost;
3092
3093 if (ST->hasDQI())
3094 if (const auto *Entry = ConvertCostTableLookup(
3095 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3096 if (auto KindCost = Entry->Cost[CostKind])
3097 return *KindCost;
3098
3099 if (ST->hasAVX512())
3100 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3101 SimpleDstTy, SimpleSrcTy))
3102 if (auto KindCost = Entry->Cost[CostKind])
3103 return *KindCost;
3104
3105 if (ST->hasAVX2()) {
3106 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3107 SimpleDstTy, SimpleSrcTy))
3108 if (auto KindCost = Entry->Cost[CostKind])
3109 return *KindCost;
3110 }
3111
3112 if (ST->hasAVX()) {
3113 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3114 SimpleDstTy, SimpleSrcTy))
3115 if (auto KindCost = Entry->Cost[CostKind])
3116 return *KindCost;
3117 }
3118
3119 if (ST->hasF16C()) {
3120 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3121 SimpleDstTy, SimpleSrcTy))
3122 if (auto KindCost = Entry->Cost[CostKind])
3123 return *KindCost;
3124 }
3125
3126 if (ST->hasSSE41()) {
3127 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3128 SimpleDstTy, SimpleSrcTy))
3129 if (auto KindCost = Entry->Cost[CostKind])
3130 return *KindCost;
3131 }
3132
3133 if (ST->hasSSE2()) {
3134 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3135 SimpleDstTy, SimpleSrcTy))
3136 if (auto KindCost = Entry->Cost[CostKind])
3137 return *KindCost;
3138 }
3139
3140 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3141 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3142 // fp16 conversions not covered by any table entries require a libcall.
3143 // Return a large (arbitrary) number to model this.
3144 return InstructionCost(64);
3145 }
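// (The libcalls in question are typically compiler-rt's
// __extendhfsf2/__truncsfhf2 family; 64 is a deliberately large sentinel
// rather than a measured cost.)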
3146 }
3147
3148 // Fall back to legalized types.
3149 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3150 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3151
3152 // If we're truncating to the same legalized type - just assume it's free.
3153 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3154 return TTI::TCC_Free;
3155
3156 if (ST->useAVX512Regs()) {
3157 if (ST->hasBWI())
3158 if (const auto *Entry = ConvertCostTableLookup(
3159 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3160 if (auto KindCost = Entry->Cost[CostKind])
3161 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3162
3163 if (ST->hasDQI())
3164 if (const auto *Entry = ConvertCostTableLookup(
3165 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3166 if (auto KindCost = Entry->Cost[CostKind])
3167 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3168
3169 if (ST->hasAVX512())
3170 if (const auto *Entry = ConvertCostTableLookup(
3171 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3172 if (auto KindCost = Entry->Cost[CostKind])
3173 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3174 }
3175
3176 if (ST->hasBWI())
3177 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3178 LTDest.second, LTSrc.second))
3179 if (auto KindCost = Entry->Cost[CostKind])
3180 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3181
3182 if (ST->hasDQI())
3183 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3184 LTDest.second, LTSrc.second))
3185 if (auto KindCost = Entry->Cost[CostKind])
3186 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3187
3188 if (ST->hasAVX512())
3189 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3190 LTDest.second, LTSrc.second))
3191 if (auto KindCost = Entry->Cost[CostKind])
3192 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3193
3194 if (ST->hasAVX2())
3195 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3196 LTDest.second, LTSrc.second))
3197 if (auto KindCost = Entry->Cost[CostKind])
3198 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3199
3200 if (ST->hasAVX())
3201 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3202 LTDest.second, LTSrc.second))
3203 if (auto KindCost = Entry->Cost[CostKind])
3204 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3205
3206 if (ST->hasF16C()) {
3207 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3208 LTDest.second, LTSrc.second))
3209 if (auto KindCost = Entry->Cost[CostKind])
3210 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3211 }
3212
3213 if (ST->hasSSE41())
3214 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3215 LTDest.second, LTSrc.second))
3216 if (auto KindCost = Entry->Cost[CostKind])
3217 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3218
3219 if (ST->hasSSE2())
3220 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3221 LTDest.second, LTSrc.second))
3222 if (auto KindCost = Entry->Cost[CostKind])
3223 return std::max(LTSrc.first, LTDest.first) * *KindCost;
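// Worked example (informal) of the scaling above: sitofp <32 x i32> to
// <32 x float> on an AVX512 target legalizes both sides into two 512-bit
// halves (v16i32/v16f32), the AVX512F table supplies a per-half cost of 1,
// and the std::max(LTSrc.first, LTDest.first) factor of 2 gives a total
// cost of 2.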
3224
3225 // Fallback: for i8/i16 sitofp/uitofp cases we extend the source to i32
3226 // and cost the conversion as an i32 sitofp.
3227 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3228 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3229 Type *ExtSrc = Src->getWithNewBitWidth(32);
3230 unsigned ExtOpc =
3231 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3232
3233 // For scalar loads the extend would be free.
3234 InstructionCost ExtCost = 0;
3235 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3236 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3237
3238 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3239 TTI::CastContextHint::None, CostKind);
3240 }
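// E.g., absent a direct table entry, uitofp <8 x i16> -> <8 x float> is
// costed as zext <8 x i16> -> <8 x i32> plus sitofp <8 x i32> -> <8 x float>;
// the value is known non-negative after the zext, so modeling the final
// step as a signed conversion is safe.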
3241
3242 // Fallback: for fptosi/fptoui to i8/i16 we cost an fptosi to i32 plus a
3243 // truncate down to the destination width.
3244 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3245 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3246 Type *TruncDst = Dst->getWithNewBitWidth(32);
3247 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3248 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3249 TTI::CastContextHint::None, CostKind);
3250 }
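// E.g. fptoui float -> i16 is costed as fptosi float -> i32 plus
// trunc i32 -> i16, matching how small integer results get widened during
// legalization.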
3251
3252 // TODO: Allow non-throughput costs that aren't binary.
3253 auto AdjustCost = [&CostKind](InstructionCost Cost,
3254 InstructionCost N = 1) -> InstructionCost {
3255 if (CostKind != TTI::TCK_RecipThroughput)
3256 return Cost == 0 ? 0 : N;
3257 return Cost * N;
3258 };
3259 return AdjustCost(
3260 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3261}
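// These conversion costs can be inspected without writing C++ by running
// the cost-model printer pass over an IR file, e.g. (a sketch; the flags
// follow the in-tree CostModel tests):
//   opt -passes='print<cost-model>' -cost-kind=throughput \
//       -mtriple=x86_64-- -mattr=+avx512f -disable-output input.ll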
3262
3263InstructionCost X86TTIImpl::getCmpSelInstrCost(
3264 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3265 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3266 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3267 // Early out if this type isn't scalar/vector integer/float.
3268 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3269 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3270 Op1Info, Op2Info, I);
3271
3272 // Legalize the type.
3273 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3274
3275 MVT MTy = LT.second;
3276
3277 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3278 assert(ISD && "Invalid opcode");
3279
3280 InstructionCost ExtraCost = 0;
3281 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3282 // Some vector comparison predicates cost extra instructions.
3283 // TODO: Adjust ExtraCost based on CostKind?
3284 // TODO: Should we invert this and assume worst case cmp costs
3285 // and reduce for particular predicates?
3286 if (MTy.isVector() &&
3287 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3288 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3289 ST->hasBWI())) {
3290 // Fallback to I if a specific predicate wasn't specified.
3291 CmpInst::Predicate Pred = VecPred;
3292 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3293 Pred == CmpInst::BAD_FCMP_PREDICATE))
3294 Pred = cast<CmpInst>(I)->getPredicate();
3295
3296 bool CmpWithConstant = false;
3297 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3298 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3299
3300 switch (Pred) {
3301 case CmpInst::Predicate::ICMP_NE:
3302 // xor(cmpeq(x,y),-1)
3303 ExtraCost = CmpWithConstant ? 0 : 1;
3304 break;
3305 case CmpInst::Predicate::ICMP_SGE:
3306 case CmpInst::Predicate::ICMP_SLE:
3307 // xor(cmpgt(x,y),-1)
3308 ExtraCost = CmpWithConstant ? 0 : 1;
3309 break;
3310 case CmpInst::Predicate::ICMP_ULT:
3311 case CmpInst::Predicate::ICMP_UGT:
3312 // cmpgt(xor(x,signbit),xor(y,signbit))
3313 // xor(cmpeq(pmaxu(x,y),x),-1)
3314 ExtraCost = CmpWithConstant ? 1 : 2;
3315 break;
3316 case CmpInst::Predicate::ICMP_ULE:
3317 case CmpInst::Predicate::ICMP_UGE:
3318 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3319 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3320 // cmpeq(psubus(x,y),0)
3321 // cmpeq(pminu(x,y),x)
3322 ExtraCost = 1;
3323 } else {
3324 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3325 ExtraCost = CmpWithConstant ? 2 : 3;
3326 }
3327 break;
3328 case CmpInst::Predicate::FCMP_ONE:
3329 case CmpInst::Predicate::FCMP_UEQ:
3330 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3331 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3332 if (CondTy && !ST->hasAVX())
3333 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3334 CmpInst::Predicate::FCMP_UNO, CostKind,
3335 Op1Info, Op2Info) +
3336 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3337 CmpInst::Predicate::FCMP_OEQ, CostKind,
3338 Op1Info, Op2Info) +
3339 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3340
3341 break;
3342 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3343 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3344 // Assume worst case scenario and add the maximum extra cost.
3345 ExtraCost = 3;
3346 break;
3347 default:
3348 break;
3349 }
3350 }
3351 }
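// Illustration of the ICMP_UGT expansion charged above: with no native
// unsigned vector compare, both operands are biased by the sign bit and a
// signed compare is used, e.g. for v4i32:
//   pxor    xmm0, [signbits]
//   pxor    xmm1, [signbits]
//   pcmpgtd xmm0, xmm1
// hence ExtraCost 2, or 1 when one operand is a constant that can be
// pre-biased at compile time.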
3352
3353 static const CostKindTblEntry SLMCostTbl[] = {
3354 // slm pcmpeq/pcmpgt throughput is 2
3355 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3356 // slm pblendvb/blendvpd/blendvps throughput is 4
3357 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3358 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3359 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3360 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3361 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3362 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3363 };
3364
3365 static const CostKindTblEntry AVX512BWCostTbl[] = {
3366 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3367 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3368 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3369 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3370
3371 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3372 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3373 };
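// With AVX512BW, vector byte/word compares write k-registers directly
// (vpcmpb/vpcmpw) and selects are single masked blends (vpblendmb/vpblendmw),
// so every cost kind above is 1.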
3374
3375 static const CostKindTblEntry AVX512CostTbl[] = {
3376 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3377 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3378 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3379 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3380
3381 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3382 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3383 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3384 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3385 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3386 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3387 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3388
3389 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3390 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3391 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3392 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3393 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3394 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3395 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3396 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3397 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3398 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3399 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3400 { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } },
3401 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3402 { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } },
3403
3404 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3405 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3406 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3407 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3408 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3409 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3410 };
3411
3412 static const CostKindTblEntry AVX2CostTbl[] = {
3413 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3414 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3415 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3416 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3417 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3418 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3419
3420 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3421 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3422 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3423 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3424
3425 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3426 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3427 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3428 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3429 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3430 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3431 };
3432
3433 static const CostKindTblEntry XOPCostTbl[] = {
3434 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3435 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3436 };
3437
3438 static const CostKindTblEntry AVX1CostTbl[] = {
3439 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3440 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3441 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3442 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3443 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3444 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3445
3446 // AVX1 does not support 8-wide integer compare.
3447 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3448 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3449 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3450 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3451
3452 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3453 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3454 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3455 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3456 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3457 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3458 };
3459
3460 static const CostKindTblEntry SSE42CostTbl[] = {
3461 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3462 };
3463
3464 static const CostKindTblEntry SSE41CostTbl[] = {
3465 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3466 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3467
3468 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3469 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3470 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3471 { ISD::SELECT, MVT::f32, { 2, 2, 1, 2 } }, // blendvps
3472 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3473 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3474 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3475 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3476 };
3477
3478 static const CostKindTblEntry SSE2CostTbl[] = {
3479 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3480 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3481
3482 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3483 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3484 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3485 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3486
3487 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3488 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3489 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3490 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3491 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3492 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3493 };
3494
3495 static const CostKindTblEntry SSE1CostTbl[] = {
3496 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3497 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3498
3499 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3500 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3501 };
3502
3503 if (ST->useSLMArithCosts())
3504 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3505 if (auto KindCost = Entry->Cost[CostKind])
3506 return LT.first * (ExtraCost + *KindCost);
3507
3508 if (ST->hasBWI())
3509 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3510 if (auto KindCost = Entry->Cost[CostKind])
3511 return LT.first * (ExtraCost + *KindCost);
3512
3513 if (ST->hasAVX512())
3514 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3515 if (auto KindCost = Entry->Cost[CostKind])
3516 return LT.first * (ExtraCost + *KindCost);
3517
3518 if (ST->hasAVX2())
3519 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3520 if (auto KindCost = Entry->Cost[CostKind])
3521 return LT.first * (ExtraCost + *KindCost);
3522
3523 if (ST->hasXOP())
3524 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3525 if (auto KindCost = Entry->Cost[CostKind])
3526 return LT.first * (ExtraCost + *KindCost);
3527
3528 if (ST->hasAVX())
3529 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3530 if (auto KindCost = Entry->Cost[CostKind])
3531 return LT.first * (ExtraCost + *KindCost);
3532
3533 if (ST->hasSSE42())
3534 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3535 if (auto KindCost = Entry->Cost[CostKind])
3536 return LT.first * (ExtraCost + *KindCost);
3537
3538 if (ST->hasSSE41())
3539 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3540 if (auto KindCost = Entry->Cost[CostKind])
3541 return LT.first * (ExtraCost + *KindCost);
3542
3543 if (ST->hasSSE2())
3544 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3545 if (auto KindCost = Entry->Cost[CostKind])
3546 return LT.first * (ExtraCost + *KindCost);
3547
3548 if (ST->hasSSE1())
3549 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3550 if (auto KindCost = Entry->Cost[CostKind])
3551 return LT.first * (ExtraCost + *KindCost);
3552
3553 // Assume a 3cy latency for fp select ops.
3554 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3555 if (ValTy->getScalarType()->isFloatingPointTy())
3556 return 3;
3557
3558 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3559 Op1Info, Op2Info, I);
3560}
3561
3562InstructionCost
3563X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3564                                  TTI::TargetCostKind CostKind) {
3567 // Costs should match the codegen from:
3568 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3569 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3570 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3571 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3572 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3573
3574 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3575 // specialized in these tables yet.
3576 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3577 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3578 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3579 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3580 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3581 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3582 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3583 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3584 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3585 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3586 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3587 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3588 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3589 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3590 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3591 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3592 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3593 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3594 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3595 };
3596 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3597 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3598 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3599 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3600 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3601 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3602 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3603 };
3604 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3605 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3606 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3607 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3608 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3609 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3610 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3611 };
3612 static const CostKindTblEntry AVX512CDCostTbl[] = {
3613 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3614 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3615 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3616 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3617 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3618 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3619 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3620 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3621 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3622 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3623 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3624 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3625
3626 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3627 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3628 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3629 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3630 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3631 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3632 };
3633 static const CostKindTblEntry AVX512BWCostTbl[] = {
3634 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3635 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3636 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3637 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3638 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3639 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3640 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3641 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3642 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3643 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3644 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3645 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3646 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3647 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3648 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3649 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3650 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3651 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3652 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3653 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3654 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3655 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3656 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3657 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3658 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3659 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3660 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3661 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3662 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3663 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3664 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3665 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3666 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3667 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3668 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3669 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3670 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3671 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3672 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3673 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3674 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3675 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3676 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3677 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3678 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3679 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3680 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3681 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3682 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3683 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3684 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3685 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3686 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3687 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3688 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3689 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3690 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3691 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3692 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3693 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3694 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3695 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3696 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3697 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3698 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3699 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3700 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3701 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3702 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3703 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3704 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3705 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3706 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3707 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3708 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3709 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3710 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3711 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3712 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3713 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3714 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3715 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3716 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3717 };
3718 static const CostKindTblEntry AVX512CostTbl[] = {
3719 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3720 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3721 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3722 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3723 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3724 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3725 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3726 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3727 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3728 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3729 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3730 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3731 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3732 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3733 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3734 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3735 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3736 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3737 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3738 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3739 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3740 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3741 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3742 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3743 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3744 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3745 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3746 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3747 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3748 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3749 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3750 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3751 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3752 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3753 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3754 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3755 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3756 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3757 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3758 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3759 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3760 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3761 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3762 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3763 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3764 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3765 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3766 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3767 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3768 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3769 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3770 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3771 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3772 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3773 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3774 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3775 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3776 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3777 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3778 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3779 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3780 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3781 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3782 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3783 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3784 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3785 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3786 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3787 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3788 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3789 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3790 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3791 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3792 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3793 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3794 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3795 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3796 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3797 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3798 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3799 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3800 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3801 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3802 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3803 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3804 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3805 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3806 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3807 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3808 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3809 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3810 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3811 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3812 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3813 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3814 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3815 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3816 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3817 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3818 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3819 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3820 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3821 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3822 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3823 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3824 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3825 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3826 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3827 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3828 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3829 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3830 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3831 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3832 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3833 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3834 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3835 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3836 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3837 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3838 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3839 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3840 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3841 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3842 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3843 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3844 };
3845 static const CostKindTblEntry XOPCostTbl[] = {
3846 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3847 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3848 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3849 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3850 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3851 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3852 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3853 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3854 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3855 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3856 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3857 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3858 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3859 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3860 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3861 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3862 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3863 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3864 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3865 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3866 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3867 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3868 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3869 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3870 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3871 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3872 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3873 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3874 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3875 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3876 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3877 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3878 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3879 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3880 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3881 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3882 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3883 };
3884 static const CostKindTblEntry AVX2CostTbl[] = {
3885 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3886 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3887 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3888 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3889 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3890 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3891 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3892 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3893 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3894 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3895 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3896 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3897 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3898 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3899 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3900 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3901 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3902 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3903 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3904 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3905 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3906 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3907 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3908 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3909 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3910 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3911 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3912 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3913 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3914 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3915 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3916 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3917 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3918 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3919 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3920 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3921 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3922 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3923 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3924 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3925 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3926 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3927 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3928 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3929 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3930 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3931 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3932 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3933 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3934 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3935 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3936 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3937 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3938 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3939 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3940 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3941 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3942 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3943 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3944 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3945 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3946 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3947 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3948 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3949 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3950 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3951 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3952 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3953 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3954 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3955 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3956 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3957 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3958 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3959 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3960 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3961 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3962 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3963 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3964 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3965 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3966 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3967 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3968 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3969 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3970 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3971 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3972 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3973 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3974 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3975 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3976 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
3977 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
3978 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
3979 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
3980 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
3981 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
3982 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
3983 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
3984 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
3985 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
3986 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
3987 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3988 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3989 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3990 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3991 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3992 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3993 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3994 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3995 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3996 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3997 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3998 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3999 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4000 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4001 };
4002 static const CostKindTblEntry AVX1CostTbl[] = {
4003 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4004 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4005 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4006 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4007 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4008 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4009 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4010 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4011 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4012 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4013 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4014 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4015 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4016 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4017 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4018 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4019 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4020 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4021 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4022 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4023 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4024 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4025 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4026 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4027 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4028 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4029 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4030 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4031 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4032 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4033 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4034 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4035 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4036 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4037 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4038 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4039 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4040 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4041 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4042 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4043 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4044 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4045 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4046 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4047 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4048 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4049 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4050 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4051 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4052 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4053 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4054 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4055 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4056 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4057 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4058 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4059 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4060 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4061 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4062 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4063 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4064 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4065 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4066 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4067 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4068 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4069 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4070 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4071 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4072 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4073 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4074 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4075 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4076 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4077 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4079 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4080 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4081 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4083 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4084 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4085 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4086 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4088 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4089 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4090 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4091 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4092 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4093 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4094 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4095 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4096 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4097 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4101 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4103 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4104 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4105 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4106 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4107 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4108 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4109 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4110 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4111 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4112 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4113 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4114 };
4115 static const CostKindTblEntry GFNICostTbl[] = {
4116 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4117 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4118 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4119 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4120 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4121 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4122 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4123 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4124 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4125 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4126 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4127 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4128 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4129 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4130 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4131 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4132 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4133 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4134 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4135 };
4136 static const CostKindTblEntry GLMCostTbl[] = {
4137 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4138 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4139 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4140 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4141 };
4142 static const CostKindTblEntry SLMCostTbl[] = {
4143 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4144 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4145 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4146 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4147 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4148 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4149 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4150 };
4151 static const CostKindTblEntry SSE42CostTbl[] = {
4152 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4153 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4154 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4155 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4156 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4157 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4158 };
4159 static const CostKindTblEntry SSE41CostTbl[] = {
4160 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4161 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4162 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4163 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4164 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4165 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4166 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4167 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4168 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4169 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4170 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4171 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4172 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4173 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4174 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4175 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4176 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4177 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4178 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4179 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4180 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4181 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4182 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4183 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4184 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4185 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4186 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4187 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4188 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4189 };
4190 static const CostKindTblEntry SSSE3CostTbl[] = {
4191 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4192 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4193 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4194 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4195 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4196 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4197 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4198 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4199 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4200 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4201 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4202 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4203 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4204 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4205 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4206 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4207 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4208 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4209 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4210 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4211 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4212 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4213 };
4214 static const CostKindTblEntry SSE2CostTbl[] = {
4215 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4216 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4217 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4218 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4219 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4220 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4221 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4222 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4223 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4224 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4225 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4226 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4227 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4228 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4229 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4230 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4231 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4232 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4233 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4234 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4235 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4236 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4237 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4238 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4239 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4240 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4241 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4242 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4243 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4244 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4245 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4246 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4247 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4248 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4249 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4250 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4251 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4252 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4253 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4254 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4255 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4256 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4257 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4258 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4259 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4260 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4261 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4262 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4263 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4264 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4265 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4266 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4267 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4268 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4269 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4270 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4271 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4272 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4273 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4274 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4275 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4276 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4277 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4278 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4279 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4280 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4281 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4282 };
4283 static const CostKindTblEntry SSE1CostTbl[] = {
4284 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4285 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4286 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4287 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4288 };
4289 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4290 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4291 };
4292 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4293 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4294 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4295 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4296 };
4297 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4298 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4299 };
4300 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4301 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4302 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4303 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4304 };
4305 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4306 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4307 };
4308 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4309 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4310 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4311 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4312 };
4313 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4314 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4315 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4316 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4317 { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4318 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4319 { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
4320 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4321 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4322 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4323 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4324 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4325 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4326 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4327 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4328 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4329 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4330 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4331 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4332 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4333 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4334 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4335 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4336 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4337 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4338 };
4339 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4340 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4341 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4342 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4343 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4344 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4345 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4346 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4347 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4348 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4349 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4350 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4351 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4352 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4353 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4354 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4355 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4356 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4357 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4358 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4359 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4360 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4361 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4362 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4363 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4364 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4365 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4366 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4367 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4368 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4369 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4370 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4371 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4372 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4373 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4374 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4375 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4376 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4377 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4378 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4379 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4380 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4381 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4382 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4383 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4384 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4385 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4386 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4387 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4388 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4389 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4390 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4391 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4392 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4393 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4394 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4395 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4396 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4397 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4398 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4399 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4400 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4401 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4402 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4403 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4404 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4405 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4406 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4407 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4408 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4409 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4410 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4411 };
4412
4413 Type *RetTy = ICA.getReturnType();
4414 Type *OpTy = RetTy;
4415 Intrinsic::ID IID = ICA.getID();
4416 unsigned ISD = ISD::DELETED_NODE;
4417 switch (IID) {
4418 default:
4419 break;
4420 case Intrinsic::abs:
4421 ISD = ISD::ABS;
4422 break;
4423 case Intrinsic::bitreverse:
4424 ISD = ISD::BITREVERSE;
4425 break;
4426 case Intrinsic::bswap:
4427 ISD = ISD::BSWAP;
4428 break;
4429 case Intrinsic::ctlz:
4430 ISD = ISD::CTLZ;
4431 break;
4432 case Intrinsic::ctpop:
4433 ISD = ISD::CTPOP;
4434 break;
4435 case Intrinsic::cttz:
4436 ISD = ISD::CTTZ;
4437 break;
4438 case Intrinsic::fshl:
4439 ISD = ISD::FSHL;
4440 if (!ICA.isTypeBasedOnly()) {
4441 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
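      // fshl(X, X, Z) is equivalent to rotl(X, Z): when both funnel inputs
      // are the same value, cost this as a rotate instead.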
4442 if (Args[0] == Args[1]) {
4443 ISD = ISD::ROTL;
4444 // Handle uniform constant rotation amounts.
4445 // TODO: Handle funnel-shift cases.
4446 const APInt *Amt;
4447        if (Args[2] &&
4448            PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4449          ISD = X86ISD::VROTLI;
4450 }
4451 }
4452 break;
4453 case Intrinsic::fshr:
4454    // FSHR has the same costs, so don't duplicate.
4455 ISD = ISD::FSHL;
4456 if (!ICA.isTypeBasedOnly()) {
4457 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
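      // Likewise, fshr(X, X, Z) is equivalent to rotr(X, Z).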
4458 if (Args[0] == Args[1]) {
4459 ISD = ISD::ROTR;
4460 // Handle uniform constant rotation amount.
4461 // TODO: Handle funnel-shift cases.
4462 const APInt *Amt;
4462        if (Args[2] &&
4463            PatternMatch::match(Args[2],
4464                                PatternMatch::m_APIntAllowPoison(Amt)))
4465          ISD = X86ISD::VROTLI;
4466 }
4467 }
4468 break;
4469 case Intrinsic::lrint:
4470 case Intrinsic::llrint:
4471    // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4472    // have the same costs as the CVTTP2SI (fptosi) instructions.
4473 if (!ICA.isTypeBasedOnly()) {
4474 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4475      return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4476                              TTI::CastContextHint::None, CostKind);
4477 }
4478 break;
4479 case Intrinsic::maxnum:
4480 case Intrinsic::minnum:
4481    // FMINNUM has the same costs, so don't duplicate.
4482 ISD = ISD::FMAXNUM;
4483 break;
4484 case Intrinsic::sadd_sat:
4485 ISD = ISD::SADDSAT;
4486 break;
4487 case Intrinsic::smax:
4488 ISD = ISD::SMAX;
4489 break;
4490 case Intrinsic::smin:
4491 ISD = ISD::SMIN;
4492 break;
4493 case Intrinsic::ssub_sat:
4494 ISD = ISD::SSUBSAT;
4495 break;
4496 case Intrinsic::uadd_sat:
4497 ISD = ISD::UADDSAT;
4498 break;
4499 case Intrinsic::umax:
4500 ISD = ISD::UMAX;
4501 break;
4502 case Intrinsic::umin:
4503 ISD = ISD::UMIN;
4504 break;
4505 case Intrinsic::usub_sat:
4506 ISD = ISD::USUBSAT;
4507 break;
4508 case Intrinsic::sqrt:
4509 ISD = ISD::FSQRT;
4510 break;
4511 case Intrinsic::sadd_with_overflow:
4512 case Intrinsic::ssub_with_overflow:
4513    // SSUBO has the same costs, so don't duplicate.
4514 ISD = ISD::SADDO;
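    // The *.with.overflow intrinsics return a {value, overflow-bit} struct;
    // cost the arithmetic on the scalar value type (contained type 0).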
4515 OpTy = RetTy->getContainedType(0);
4516 break;
4517 case Intrinsic::uadd_with_overflow:
4518 case Intrinsic::usub_with_overflow:
4519    // USUBO has the same costs, so don't duplicate.
4520 ISD = ISD::UADDO;
4521 OpTy = RetTy->getContainedType(0);
4522 break;
4523 case Intrinsic::smul_with_overflow:
4524 ISD = ISD::SMULO;
4525 OpTy = RetTy->getContainedType(0);
4526 break;
4527 case Intrinsic::umul_with_overflow:
4528 ISD = ISD::UMULO;
4529 OpTy = RetTy->getContainedType(0);
4530 break;
4531 }
4532
4533 if (ISD != ISD::DELETED_NODE) {
4534 auto adjustTableCost = [&](int ISD, unsigned Cost,
4535                               std::pair<InstructionCost, MVT> LT,
4536                               FastMathFlags FMF) {
4537 InstructionCost LegalizationCost = LT.first;
4538 MVT MTy = LT.second;
4539
4540      // If there are no NaNs to deal with, then these are reduced to a
4541 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4542 // assume is used in the non-fast case.
4543 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4544 if (FMF.noNaNs())
4545 return LegalizationCost * 1;
4546 }
4547
4548 // For cases where some ops can be folded into a load/store, assume free.
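      // e.g. with MOVBE, bswap(load X) or store(bswap(X)) folds into a single
      // movbe, so the bswap itself is effectively free.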
4549 if (MTy.isScalarInteger()) {
4550 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4551 if (const Instruction *II = ICA.getInst()) {
4552 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4553 return TTI::TCC_Free;
4554 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4555 if (LI->hasOneUse())
4556 return TTI::TCC_Free;
4557 }
4558 }
4559 }
4560 }
4561
4562 return LegalizationCost * (int)Cost;
4563 };
4564
4565 // Legalize the type.
4566 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4567 MVT MTy = LT.second;
4568
4569 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
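    // An all-ones second operand (e.g. cttz(X, i1 true)) marks a zero input
    // as poison, so the cheaper BSF/BSR-based *_ZERO_UNDEF entries apply.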
4570 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4571 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4572 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4573 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4574 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4575        if (Cst->isAllOnesValue())
4576          ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4577 }
4578
4579 // FSQRT is a single instruction.
4580 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4581 return LT.first;
4582
4583 if (ST->useGLMDivSqrtCosts())
4584 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4585 if (auto KindCost = Entry->Cost[CostKind])
4586 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4587
4588 if (ST->useSLMArithCosts())
4589 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4590 if (auto KindCost = Entry->Cost[CostKind])
4591 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4592
4593 if (ST->hasVBMI2())
4594 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4595 if (auto KindCost = Entry->Cost[CostKind])
4596 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4597
4598 if (ST->hasBITALG())
4599 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4600 if (auto KindCost = Entry->Cost[CostKind])
4601 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4602
4603 if (ST->hasVPOPCNTDQ())
4604 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4605 if (auto KindCost = Entry->Cost[CostKind])
4606 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4607
4608 if (ST->hasGFNI())
4609 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4610 if (auto KindCost = Entry->Cost[CostKind])
4611 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4612
4613 if (ST->hasCDI())
4614 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4615 if (auto KindCost = Entry->Cost[CostKind])
4616 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4617
4618 if (ST->hasBWI())
4619 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4620 if (auto KindCost = Entry->Cost[CostKind])
4621 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4622
4623 if (ST->hasAVX512())
4624 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4625 if (auto KindCost = Entry->Cost[CostKind])
4626 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4627
4628 if (ST->hasXOP())
4629 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4630 if (auto KindCost = Entry->Cost[CostKind])
4631 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4632
4633 if (ST->hasAVX2())
4634 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4635 if (auto KindCost = Entry->Cost[CostKind])
4636 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4637
4638 if (ST->hasAVX())
4639 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4640 if (auto KindCost = Entry->Cost[CostKind])
4641 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4642
4643 if (ST->hasSSE42())
4644 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4645 if (auto KindCost = Entry->Cost[CostKind])
4646 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4647
4648 if (ST->hasSSE41())
4649 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4650 if (auto KindCost = Entry->Cost[CostKind])
4651 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4652
4653 if (ST->hasSSSE3())
4654 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4655 if (auto KindCost = Entry->Cost[CostKind])
4656 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4657
4658 if (ST->hasSSE2())
4659 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4660 if (auto KindCost = Entry->Cost[CostKind])
4661 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4662
4663 if (ST->hasSSE1())
4664 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4665 if (auto KindCost = Entry->Cost[CostKind])
4666 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4667
4668 if (ST->hasBMI()) {
4669 if (ST->is64Bit())
4670 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4671 if (auto KindCost = Entry->Cost[CostKind])
4672 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4673
4674 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4675 if (auto KindCost = Entry->Cost[CostKind])
4676 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4677 }
4678
4679 if (ST->hasLZCNT()) {
4680 if (ST->is64Bit())
4681 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4682 if (auto KindCost = Entry->Cost[CostKind])
4683 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4684
4685 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4686 if (auto KindCost = Entry->Cost[CostKind])
4687 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688 }
4689
4690 if (ST->hasPOPCNT()) {
4691 if (ST->is64Bit())
4692 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4693 if (auto KindCost = Entry->Cost[CostKind])
4694 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4695
4696 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4697 if (auto KindCost = Entry->Cost[CostKind])
4698 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4699 }
4700
4701 if (ST->is64Bit())
4702 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4703 if (auto KindCost = Entry->Cost[CostKind])
4704 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4705
4706 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4707 if (auto KindCost = Entry->Cost[CostKind])
4708 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4709 }
4710
4711  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4712}
4713
4714InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4715                                               TTI::TargetCostKind CostKind,
4716                                               unsigned Index, Value *Op0,
4717 Value *Op1) {
4718 static const CostTblEntry SLMCostTbl[] = {
4719 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4720 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4721 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4722 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4723 };
4724
4725 assert(Val->isVectorTy() && "This must be a vector type");
4726 Type *ScalarType = Val->getScalarType();
4727 InstructionCost RegisterFileMoveCost = 0;
4728
4729 // Non-immediate extraction/insertion can be handled as a sequence of
4730 // aliased loads+stores via the stack.
4731 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4732 Opcode == Instruction::InsertElement)) {
4733 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4734 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4735
4736 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4737 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4738 Align VecAlign = DL.getPrefTypeAlign(Val);
4739 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4740
4741 // Extract - store vector to stack, load scalar.
4742 if (Opcode == Instruction::ExtractElement) {
4743 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4744 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4745 CostKind);
4746 }
4747 // Insert - store vector to stack, store scalar, load vector.
4748 if (Opcode == Instruction::InsertElement) {
4749 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4750 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4751 CostKind) +
4752 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4753 }
4754 }
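    // Illustrative note: with unit-cost memory ops, the model above prices a
    // variable-index extract at 2 (vector store + scalar reload) and a
    // variable-index insert at 3 (vector store + scalar store + vector
    // reload); the actual numbers come from getMemoryOpCost.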
4755
4756 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4757 Opcode == Instruction::InsertElement)) {
4758    // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4759 if (Opcode == Instruction::ExtractElement &&
4760 ScalarType->getScalarSizeInBits() == 1 &&
4761 cast<FixedVectorType>(Val)->getNumElements() > 1)
4762 return 1;
4763
4764 // Legalize the type.
4765 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4766
4767 // This type is legalized to a scalar type.
4768 if (!LT.second.isVector())
4769 return TTI::TCC_Free;
4770
4771 // The type may be split. Normalize the index to the new type.
4772 unsigned SizeInBits = LT.second.getSizeInBits();
4773 unsigned NumElts = LT.second.getVectorNumElements();
4774 unsigned SubNumElts = NumElts;
4775 Index = Index % NumElts;
4776
4777 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4778 // For inserts, we also need to insert the subvector back.
4779 if (SizeInBits > 128) {
4780 assert((SizeInBits % 128) == 0 && "Illegal vector");
4781 unsigned NumSubVecs = SizeInBits / 128;
4782 SubNumElts = NumElts / NumSubVecs;
4783 if (SubNumElts <= Index) {
4784 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4785 Index %= SubNumElts;
4786 }
4787 }
4788
4789 MVT MScalarTy = LT.second.getScalarType();
4790 auto IsCheapPInsrPExtrInsertPS = [&]() {
4791 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4792 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4793 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4794 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4795 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4796 Opcode == Instruction::InsertElement);
4797 };
4798
4799 if (Index == 0) {
4800 // Floating point scalars are already located in index #0.
4801 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4802 // true for all.
4803 if (ScalarType->isFloatingPointTy() &&
4804 (Opcode != Instruction::InsertElement || !Op0 ||
4805 isa<UndefValue>(Op0)))
4806 return RegisterFileMoveCost;
4807
4808 if (Opcode == Instruction::InsertElement &&
4809 isa_and_nonnull<UndefValue>(Op0)) {
4810 // Consider the gather cost to be cheap.
4811 if (isa_and_nonnull<LoadInst>(Op1))
4812 return RegisterFileMoveCost;
4813 if (!IsCheapPInsrPExtrInsertPS()) {
4814 // mov constant-to-GPR + movd/movq GPR -> XMM.
4815 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4816 return 2 + RegisterFileMoveCost;
4817 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4818 return 1 + RegisterFileMoveCost;
4819 }
4820 }
4821
4822 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4823 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4824 return 1 + RegisterFileMoveCost;
4825 }
4826
4827 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4828 assert(ISD && "Unexpected vector opcode");
4829 if (ST->useSLMArithCosts())
4830 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4831 return Entry->Cost + RegisterFileMoveCost;
4832
4833 // Consider cheap cases.
4834 if (IsCheapPInsrPExtrInsertPS())
4835 return 1 + RegisterFileMoveCost;
4836
4837 // For extractions we just need to shuffle the element to index 0, which
4838 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4839  // the elements to their destination. In both cases we must handle the
4840 // subvector move(s).
4841 // If the vector type is already less than 128-bits then don't reduce it.
4842 // TODO: Under what circumstances should we shuffle using the full width?
4843 InstructionCost ShuffleCost = 1;
4844 if (Opcode == Instruction::InsertElement) {
4845 auto *SubTy = cast<VectorType>(Val);
4846 EVT VT = TLI->getValueType(DL, Val);
4847 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4848 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4849 ShuffleCost =
4850 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
4851 }
4852 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4853 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4854 }
4855
4856 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4857 RegisterFileMoveCost;
4858}
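// A minimal, self-contained sketch of the >128-bit index normalization
// performed in getVectorInstrCost above. Illustrative only: the helper below
// is not an LLVM API. E.g. element 5 of a v8i32 in a 256-bit register becomes
// element 1 of the upper 128-bit lane, plus one extra register-file move.
static unsigned exampleNormalizeSubVectorIndex(unsigned Index, unsigned NumElts,
                                               unsigned SizeInBits,
                                               unsigned &RegisterFileMoves) {
  if (SizeInBits > 128) {
    unsigned NumSubVecs = SizeInBits / 128;     // number of 128-bit lanes
    unsigned SubNumElts = NumElts / NumSubVecs; // elements per lane
    if (Index >= SubNumElts) {
      RegisterFileMoves += 1; // extracti128 to reach the upper lane
      Index %= SubNumElts;    // position within that lane
    }
  }
  return Index;
}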
4859
4860InstructionCost X86TTIImpl::getScalarizationOverhead(
4861    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4862    TTI::TargetCostKind CostKind) {
4863 assert(DemandedElts.getBitWidth() ==
4864 cast<FixedVectorType>(Ty)->getNumElements() &&
4865 "Vector size mismatch");
4866
4867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4868 MVT MScalarTy = LT.second.getScalarType();
4869 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4870  InstructionCost Cost = 0;
4871
4872 constexpr unsigned LaneBitWidth = 128;
4873 assert((LegalVectorBitWidth < LaneBitWidth ||
4874 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4875 "Illegal vector");
4876
4877 const int NumLegalVectors = *LT.first.getValue();
4878 assert(NumLegalVectors >= 0 && "Negative cost!");
4879
4880  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4881 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4882 if (Insert) {
4883 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4884 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4885 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4886 // For types we can insert directly, insertion into 128-bit sub vectors is
4887 // cheap, followed by a cheap chain of concatenations.
4888 if (LegalVectorBitWidth <= LaneBitWidth) {
4889 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4890 /*Extract*/ false, CostKind);
4891 } else {
4892 // In each 128-lane, if at least one index is demanded but not all
4893 // indices are demanded and this 128-lane is not the first 128-lane of
4894        // the legalized vector, then this 128-lane needs an extracti128; if in
4895        // each 128-lane there is at least one demanded index, this 128-lane
4896        // needs an inserti128.
4897
4898 // The following cases will help you build a better understanding:
4899 // Assume we insert several elements into a v8i32 vector in avx2,
4900        // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4901 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4902 // inserti128.
4903 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
4904 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4905 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4906 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4907 unsigned NumLegalElts =
4908 LT.second.getVectorNumElements() * NumLegalVectors;
4909 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4910 "Vector has been legalized to smaller element count");
4911 assert((NumLegalElts % NumLanesTotal) == 0 &&
4912 "Unexpected elts per lane");
4913 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4914
4915 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4916 auto *LaneTy =
4917 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4918
4919 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4920 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4921 NumEltsPerLane, NumEltsPerLane * I);
4922 if (LaneEltMask.isZero())
4923 continue;
4924 // FIXME: we don't need to extract if all non-demanded elements
4925 // are legalization-inserted padding.
4926 if (!LaneEltMask.isAllOnes())
4927            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
4928                                   I * NumEltsPerLane, LaneTy);
4929 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4930 /*Extract*/ false, CostKind);
4931 }
4932
4933 APInt AffectedLanes =
4934 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4935 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4936 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4937 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4938 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4939 unsigned I = NumLegalLanes * LegalVec + Lane;
4940 // No need to insert unaffected lane; or lane 0 of each legal vector
4941 // iff ALL lanes of that vector were affected and will be inserted.
4942 if (!AffectedLanes[I] ||
4943 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4944 continue;
4945          Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
4946                                 I * NumEltsPerLane, LaneTy);
4947 }
4948 }
4949 }
4950 } else if (LT.second.isVector()) {
4951 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4952 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4953 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4954 // considered cheap.
4955 if (Ty->isIntOrIntVectorTy())
4956 Cost += DemandedElts.popcount();
4957
4958 // Get the smaller of the legalized or original pow2-extended number of
4959 // vector elements, which represents the number of unpacks we'll end up
4960 // performing.
4961 unsigned NumElts = LT.second.getVectorNumElements();
4962 unsigned Pow2Elts =
4963 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4964 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4965 }
4966 }
4967
4968 if (Extract) {
4969 // vXi1 can be efficiently extracted with MOVMSK.
4970 // TODO: AVX512 predicate mask handling.
4971 // NOTE: This doesn't work well for roundtrip scalarization.
4972 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4973 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4974 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4975 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4976 return MOVMSKCost;
4977 }
4978
4979 if (LT.second.isVector()) {
4980 unsigned NumLegalElts =
4981 LT.second.getVectorNumElements() * NumLegalVectors;
4982 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4983 "Vector has been legalized to smaller element count");
4984
4985 // If we're extracting elements from a 128-bit subvector lane,
4986 // we only need to extract each lane once, not for every element.
4987 if (LegalVectorBitWidth > LaneBitWidth) {
4988 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4989 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4990 assert((NumLegalElts % NumLanesTotal) == 0 &&
4991 "Unexpected elts per lane");
4992 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4993
4994 // Add cost for each demanded 128-bit subvector extraction.
4995 // Luckily this is a lot easier than for insertion.
4996 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4997 auto *LaneTy =
4998 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4999
5000 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5001 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5002 NumEltsPerLane, I * NumEltsPerLane);
5003 if (LaneEltMask.isZero())
5004 continue;
5005          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5006                                 I * NumEltsPerLane, LaneTy);
5007          Cost += BaseT::getScalarizationOverhead(
5008              LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5009 }
5010
5011 return Cost;
5012 }
5013 }
5014
5015 // Fallback to default extraction.
5016 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5017 Extract, CostKind);
5018 }
5019
5020 return Cost;
5021}
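// Hedged sketch of the vXi1 extract special case above: all mask bits can be
// moved to a GPR with one MOVMSK per 16 (SSE2) or 32 (AVX2) elements. The
// helper is illustrative only, not an LLVM API.
static unsigned exampleMovmskCost(unsigned NumElts, bool HasAVX2) {
  unsigned MaxElts = HasAVX2 ? 32 : 16;     // elements covered per MOVMSK
  return (NumElts + MaxElts - 1) / MaxElts; // ceil(NumElts / MaxElts)
}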
5022
5023InstructionCost
5024X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5025                                      int VF, const APInt &DemandedDstElts,
5026                                      TTI::TargetCostKind CostKind) {
5027  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5028 // We don't differentiate element types here, only element bit width.
5029 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5030
5031 auto bailout = [&]() {
5032 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5033 DemandedDstElts, CostKind);
5034 };
5035
5036 // For now, only deal with AVX512 cases.
5037 if (!ST->hasAVX512())
5038 return bailout();
5039
5040 // Do we have a native shuffle for this element type, or should we promote?
5041 unsigned PromEltTyBits = EltTyBits;
5042 switch (EltTyBits) {
5043 case 32:
5044 case 64:
5045 break; // AVX512F.
5046 case 16:
5047 if (!ST->hasBWI())
5048 PromEltTyBits = 32; // promote to i32, AVX512F.
5049 break; // AVX512BW
5050 case 8:
5051 if (!ST->hasVBMI())
5052 PromEltTyBits = 32; // promote to i32, AVX512F.
5053 break; // AVX512VBMI
5054 case 1:
5055 // There is no support for shuffling i1 elements. We *must* promote.
5056 if (ST->hasBWI()) {
5057 if (ST->hasVBMI())
5058 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5059 else
5060 PromEltTyBits = 16; // promote to i16, AVX512BW.
5061 break;
5062 }
5063 PromEltTyBits = 32; // promote to i32, AVX512F.
5064 break;
5065 default:
5066 return bailout();
5067 }
5068 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5069
5070 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5071 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5072
5073 int NumDstElements = VF * ReplicationFactor;
5074 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5075 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5076
5077 // Legalize the types.
5078 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5079 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5080 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5081 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5082 // They should have legalized into vector types.
5083 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5084 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5085 return bailout();
5086
5087 if (PromEltTyBits != EltTyBits) {
5088 // If we have to perform the shuffle with wider elt type than our data type,
5089 // then we will first need to anyext (we don't care about the new bits)
5090 // the source elements, and then truncate Dst elements.
5091 InstructionCost PromotionCost;
5092 PromotionCost += getCastInstrCost(
5093 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5094        TargetTransformInfo::CastContextHint::None, CostKind);
5095    PromotionCost +=
5096        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5097                         /*Src=*/PromDstVecTy,
5098                         TargetTransformInfo::CastContextHint::None, CostKind);
5099 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5100 ReplicationFactor, VF,
5101 DemandedDstElts, CostKind);
5102 }
5103
5104 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5105 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5106 "We expect that the legalization doesn't affect the element width, "
5107 "doesn't coalesce/split elements.");
5108
5109 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5110 unsigned NumDstVectors =
5111 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5112
5113 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5114
5115 // Not all the produced Dst elements may be demanded. In our case,
5116 // given that a single Dst vector is formed by a single shuffle,
5117 // if all elements that will form a single Dst vector aren't demanded,
5118 // then we won't need to do that shuffle, so adjust the cost accordingly.
5119 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5120 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5121 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5122
5123 InstructionCost SingleShuffleCost = getShuffleCost(
5124 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
5125 /*Index=*/0, /*SubTp=*/nullptr);
5126 return NumDstVectorsDemanded * SingleShuffleCost;
5127}
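// Illustrative restatement of the promotion rules above (assumes an AVX512
// baseline; the helper is not an LLVM API): pick the narrowest element width
// that has a native variable cross-lane shuffle.
static unsigned examplePromotedShuffleEltBits(unsigned EltBits, bool HasBWI,
                                              bool HasVBMI) {
  switch (EltBits) {
  case 32:
  case 64:
    return EltBits;                             // AVX512F handles these.
  case 16:
    return HasBWI ? 16u : 32u;                  // AVX512BW, else promote.
  case 8:
    return HasVBMI ? 8u : 32u;                  // AVX512VBMI, else promote.
  case 1:                                       // i1 must always promote.
    return HasVBMI ? 8u : (HasBWI ? 16u : 32u);
  default:
    return 0;                                   // bail out to the base impl.
  }
}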
5128
5129InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5130                                            MaybeAlign Alignment,
5131                                            unsigned AddressSpace,
5132                                            TTI::TargetCostKind CostKind,
5133                                            TTI::OperandValueInfo OpInfo,
5134                                            const Instruction *I) {
5135  // TODO: Handle other cost kinds.
5136  if (CostKind != TTI::TCK_RecipThroughput) {
5137    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5138 // Store instruction with index and scale costs 2 Uops.
5139 // Check the preceding GEP to identify non-const indices.
5140 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5141 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5142 return TTI::TCC_Basic * 2;
5143 }
5144 }
5145 return TTI::TCC_Basic;
5146 }
5147
5148 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5149 "Invalid Opcode");
5150 // Type legalization can't handle structs
5151 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5152 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5153 CostKind, OpInfo, I);
5154
5155 // Legalize the type.
5156 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5157
5158 auto *VTy = dyn_cast<FixedVectorType>(Src);
5159
5160  InstructionCost Cost = 0;
5161
5162 // Add a cost for constant load to vector.
5163 if (Opcode == Instruction::Store && OpInfo.isConstant())
5164 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5165 /*AddressSpace=*/0, CostKind, OpInfo);
5166
5167 // Handle the simple case of non-vectors.
5168 // NOTE: this assumes that legalization never creates vector from scalars!
5169 if (!VTy || !LT.second.isVector()) {
5170 // Each load/store unit costs 1.
5171 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5172 }
5173
5174 bool IsLoad = Opcode == Instruction::Load;
5175
5176 Type *EltTy = VTy->getElementType();
5177
5178 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5179
5180 // Source of truth: how many elements were there in the original IR vector?
5181 const unsigned SrcNumElt = VTy->getNumElements();
5182
5183 // How far have we gotten?
5184 int NumEltRemaining = SrcNumElt;
5185 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5186 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5187
5188 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5189
5190 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5191 const unsigned XMMBits = 128;
5192 if (XMMBits % EltTyBits != 0)
5193 // Vector size must be a multiple of the element size. I.e. no padding.
5194 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5195 CostKind, OpInfo, I);
5196 const int NumEltPerXMM = XMMBits / EltTyBits;
5197
5198 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5199
5200 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5201 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5202 // How many elements would a single op deal with at once?
5203 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5204 // Vector size must be a multiple of the element size. I.e. no padding.
5205 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5206 CostKind, OpInfo, I);
5207 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5208
5209 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5210 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5211 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5212 "Unless we haven't halved the op size yet, "
5213 "we have less than two op's sized units of work left.");
5214
5215 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5216 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5217 : XMMVecTy;
5218
5219 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5220 "After halving sizes, the vector elt count is no longer a multiple "
5221 "of number of elements per operation?");
5222 auto *CoalescedVecTy =
5223 CurrNumEltPerOp == 1
5224 ? CurrVecTy
5225            : FixedVectorType::get(
5226                  IntegerType::get(Src->getContext(),
5227 EltTyBits * CurrNumEltPerOp),
5228 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5229 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5230 DL.getTypeSizeInBits(CurrVecTy) &&
5231           "coalescing elements doesn't change vector width.");
5232
5233 while (NumEltRemaining > 0) {
5234      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5235
5236 // Can we use this vector size, as per the remaining element count?
5237 // Iff the vector is naturally aligned, we can do a wide load regardless.
5238 if (NumEltRemaining < CurrNumEltPerOp &&
5239 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5240 CurrOpSizeBytes != 1)
5241        break; // Try a smaller vector size.
5242
5243 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5244 // as a proxy for a double-pumped AVX memory interface such as on
5245 // Sandybridge.
5246 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5247 // will be scalarized.
5248 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5249 Cost += 2;
5250 else if (CurrOpSizeBytes < 4)
5251 Cost += 2;
5252 else
5253 Cost += 1;
5254
5255      // If we're loading a uniform value, then we don't need to split the
5256      // load; a single (widest) vector load can be reused by all splits.
5257 if (IsLoad && OpInfo.isUniform())
5258 return Cost;
5259
5260 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5261
5262 // If we have fully processed the previous reg, we need to replenish it.
5263 if (SubVecEltsLeft == 0) {
5264 SubVecEltsLeft += CurrVecTy->getNumElements();
5265 // And that's free only for the 0'th subvector of a legalized vector.
5266 if (!Is0thSubVec)
5267          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5268                                        : TTI::ShuffleKind::SK_ExtractSubvector,
5269                                 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5270 }
5271
5272 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5273 // for smaller widths (32/16/8) we have to insert/extract them separately.
5274 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5275 // but let's pretend that it is also true for 16/8 bit wide ops...)
5276 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5277 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5278 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5279 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5280 APInt DemandedElts =
5281 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5282 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5283 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5284 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5285 !IsLoad, CostKind);
5286 }
5287
5288 SubVecEltsLeft -= CurrNumEltPerOp;
5289 NumEltRemaining -= CurrNumEltPerOp;
5290 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5291 }
5292 }
5293
5294 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5295
5296 return Cost;
5297}
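// A simplified, self-contained sketch of the op-size halving loop above
// (assumptions: unit cost for ops of 4+ bytes, 2 for narrower ops; the
// subvector insert/extract and slow-unaligned-32B adjustments are omitted;
// not an LLVM API). E.g. a v12i32 access with 32-byte ops legal costs 2:
// one YMM op for 8 elements plus one XMM op for the remaining 4.
static unsigned exampleSplitMemOpCost(unsigned NumElts, unsigned EltBits,
                                      unsigned MaxOpBytes) {
  unsigned Cost = 0;
  int Remaining = static_cast<int>(NumElts);
  for (unsigned OpBytes = MaxOpBytes; Remaining > 0; OpBytes /= 2) {
    if (OpBytes == 0 || (8 * OpBytes) % EltBits != 0)
      return Cost; // mirror the bailout above: no padding allowed
    int EltsPerOp = static_cast<int>((8 * OpBytes) / EltBits);
    while (Remaining >= EltsPerOp) { // emit ops of the current width
      Cost += (OpBytes < 4) ? 2u : 1u;
      Remaining -= EltsPerOp;
    }
  }
  return Cost;
}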
5298
5299InstructionCost
5300X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5301                                  unsigned AddressSpace,
5302                                  TTI::TargetCostKind CostKind) {
5303  bool IsLoad = (Instruction::Load == Opcode);
5304 bool IsStore = (Instruction::Store == Opcode);
5305
5306 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5307 if (!SrcVTy)
5308 // To calculate scalar take the regular cost, without mask
5309 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5310
5311 unsigned NumElem = SrcVTy->getNumElements();
5312 auto *MaskTy =
5313 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5314 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5315 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5316 // Scalarization
5317 APInt DemandedElts = APInt::getAllOnes(NumElem);
5318    InstructionCost MaskSplitCost = getScalarizationOverhead(
5319        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5320    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5321        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5322        CmpInst::BAD_ICMP_PREDICATE, CostKind);
5323    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5324    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5325    InstructionCost ValueSplitCost = getScalarizationOverhead(
5326        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5327 InstructionCost MemopCost =
5328 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5329 Alignment, AddressSpace, CostKind);
5330 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5331 }
5332
5333 // Legalize the type.
5334 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5335 auto VT = TLI->getValueType(DL, SrcVTy);
5336  InstructionCost Cost = 0;
5337  MVT Ty = LT.second;
5338 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5339 // APX masked load/store for scalar is cheap.
5340 return Cost + LT.first;
5341
5342 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5343 LT.second.getVectorNumElements() == NumElem)
5344 // Promotion requires extend/truncate for data and a shuffle for mask.
5345 Cost +=
5346        getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
5347                       nullptr) +
5348        getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
5349
5350  else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5351    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5352                                           Ty.getVectorNumElements());
5353    // Expanding requires filling the mask with zeroes.
5354    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
5355                           MaskTy);
5356 }
5357
5358 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5359 if (!ST->hasAVX512())
5360 return Cost + LT.first * (IsLoad ? 2 : 8);
5361
5362 // AVX-512 masked load/store is cheaper
5363 return Cost + LT.first;
5364}
5365
5366InstructionCost
5367X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5368                                 const Value *Base,
5369 const TTI::PointersChainInfo &Info,
5370 Type *AccessTy, TTI::TargetCostKind CostKind) {
5371 if (Info.isSameBase() && Info.isKnownStride()) {
5372 // If all the pointers have known stride all the differences are translated
5373 // into constants. X86 memory addressing allows encoding it into
5374 // displacement. So we just need to take the base GEP cost.
5375 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5376 SmallVector<const Value *> Indices(BaseGEP->indices());
5377 return getGEPCost(BaseGEP->getSourceElementType(),
5378 BaseGEP->getPointerOperand(), Indices, nullptr,
5379 CostKind);
5380 }
5381 return TTI::TCC_Free;
5382 }
5383 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5384}
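// For illustration: when every pointer in the chain is the same base plus a
// constant offset, each access can typically fold its offset into the x86
// addressing mode (e.g. `movl 16(%rdi), %eax`), which is why only the base
// GEP is charged above.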
5385
5386InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5387                                                      ScalarEvolution *SE,
5388 const SCEV *Ptr) {
5389 // Address computations in vectorized code with non-consecutive addresses will
5390 // likely result in more instructions compared to scalar code where the
5391 // computation can more often be merged into the index mode. The resulting
5392 // extra micro-ops can significantly decrease throughput.
5393 const unsigned NumVectorInstToHideOverhead = 10;
5394
5395 // Cost modeling of Strided Access Computation is hidden by the indexing
5396  // modes of X86 regardless of the stride value. We don't believe that there
5397  // is a difference between constant strided access in general and a constant
5398  // stride value which is less than or equal to 64.
5399 // Even in the case of (loop invariant) stride whose value is not known at
5400 // compile time, the address computation will not incur more than one extra
5401 // ADD instruction.
5402 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5403 // TODO: AVX2 is the current cut-off because we don't have correct
5404 // interleaving costs for prior ISA's.
5405    if (!BaseT::isStridedAccess(Ptr))
5406      return NumVectorInstToHideOverhead;
5407    if (!BaseT::getConstantStrideStep(SE, Ptr))
5408      return 1;
5409 }
5410
5411 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5412}
5413
5414InstructionCost
5415X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5416                                       std::optional<FastMathFlags> FMF,
5417                                       TTI::TargetCostKind CostKind) {
5418  if (TTI::requiresOrderedReduction(FMF))
5419    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5420
5421  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5422  // throughput and use that as the cost.
5423
5424 static const CostTblEntry SLMCostTbl[] = {
5425 { ISD::FADD, MVT::v2f64, 3 },
5426 { ISD::ADD, MVT::v2i64, 5 },
5427 };
5428
5429 static const CostTblEntry SSE2CostTbl[] = {
5430 { ISD::FADD, MVT::v2f64, 2 },
5431 { ISD::FADD, MVT::v2f32, 2 },
5432 { ISD::FADD, MVT::v4f32, 4 },
5433 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5434 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5435 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5436 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5437 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5438 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5439 { ISD::ADD, MVT::v2i8, 2 },
5440 { ISD::ADD, MVT::v4i8, 2 },
5441 { ISD::ADD, MVT::v8i8, 2 },
5442 { ISD::ADD, MVT::v16i8, 3 },
5443 };
5444
5445 static const CostTblEntry AVX1CostTbl[] = {
5446 { ISD::FADD, MVT::v4f64, 3 },
5447 { ISD::FADD, MVT::v4f32, 3 },
5448 { ISD::FADD, MVT::v8f32, 4 },
5449 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5450 { ISD::ADD, MVT::v4i64, 3 },
5451 { ISD::ADD, MVT::v8i32, 5 },
5452 { ISD::ADD, MVT::v16i16, 5 },
5453 { ISD::ADD, MVT::v32i8, 4 },
5454 };
5455
5456 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5457 assert(ISD && "Invalid opcode");
5458
5459 // Before legalizing the type, give a chance to look up illegal narrow types
5460 // in the table.
5461 // FIXME: Is there a better way to do this?
5462 EVT VT = TLI->getValueType(DL, ValTy);
5463 if (VT.isSimple()) {
5464 MVT MTy = VT.getSimpleVT();
5465 if (ST->useSLMArithCosts())
5466 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5467 return Entry->Cost;
5468
5469 if (ST->hasAVX())
5470 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5471 return Entry->Cost;
5472
5473 if (ST->hasSSE2())
5474 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5475 return Entry->Cost;
5476 }
5477
5478 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5479
5480 MVT MTy = LT.second;
5481
5482 auto *ValVTy = cast<FixedVectorType>(ValTy);
5483
5484 // Special case: vXi8 mul reductions are performed as vXi16.
5485 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5486 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5487 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5488 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5489                          TargetTransformInfo::CastContextHint::None,
5490                          CostKind) +
5491 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5492 }
5493
5494 InstructionCost ArithmeticCost = 0;
5495 if (LT.first != 1 && MTy.isVector() &&
5496 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5497 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5498 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5499 MTy.getVectorNumElements());
5500 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5501 ArithmeticCost *= LT.first - 1;
5502 }
5503
5504 if (ST->useSLMArithCosts())
5505 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5506 return ArithmeticCost + Entry->Cost;
5507
5508 if (ST->hasAVX())
5509 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5510 return ArithmeticCost + Entry->Cost;
5511
5512 if (ST->hasSSE2())
5513 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5514 return ArithmeticCost + Entry->Cost;
5515
5516 // FIXME: These assume a naive kshift+binop lowering, which is probably
5517 // conservative in most cases.
5518 static const CostTblEntry AVX512BoolReduction[] = {
5519 { ISD::AND, MVT::v2i1, 3 },
5520 { ISD::AND, MVT::v4i1, 5 },
5521 { ISD::AND, MVT::v8i1, 7 },
5522 { ISD::AND, MVT::v16i1, 9 },
5523 { ISD::AND, MVT::v32i1, 11 },
5524 { ISD::AND, MVT::v64i1, 13 },
5525 { ISD::OR, MVT::v2i1, 3 },
5526 { ISD::OR, MVT::v4i1, 5 },
5527 { ISD::OR, MVT::v8i1, 7 },
5528 { ISD::OR, MVT::v16i1, 9 },
5529 { ISD::OR, MVT::v32i1, 11 },
5530 { ISD::OR, MVT::v64i1, 13 },
5531 };
5532
5533 static const CostTblEntry AVX2BoolReduction[] = {
5534 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5535 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5536 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5537 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5538 };
5539
5540 static const CostTblEntry AVX1BoolReduction[] = {
5541 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5542 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5543 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5544 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5545 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5546 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5547 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5548 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5549 };
5550
5551 static const CostTblEntry SSE2BoolReduction[] = {
5552 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5553 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5554 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5555 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5556 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5557 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5558 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5559 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5560 };
5561
5562 // Handle bool allof/anyof patterns.
5563 if (ValVTy->getElementType()->isIntegerTy(1)) {
5564 InstructionCost ArithmeticCost = 0;
5565 if (LT.first != 1 && MTy.isVector() &&
5566 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5567 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5568 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5569 MTy.getVectorNumElements());
5570 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5571 ArithmeticCost *= LT.first - 1;
5572 }
5573
5574 if (ST->hasAVX512())
5575 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5576 return ArithmeticCost + Entry->Cost;
5577 if (ST->hasAVX2())
5578 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5579 return ArithmeticCost + Entry->Cost;
5580 if (ST->hasAVX())
5581 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5582 return ArithmeticCost + Entry->Cost;
5583 if (ST->hasSSE2())
5584 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5585 return ArithmeticCost + Entry->Cost;
5586
5587 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5588 }
5589
5590 unsigned NumVecElts = ValVTy->getNumElements();
5591 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5592
5593 // Special case power of 2 reductions where the scalar type isn't changed
5594 // by type legalization.
5595 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5596 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5597
5598 InstructionCost ReductionCost = 0;
5599
5600 auto *Ty = ValVTy;
5601 if (LT.first != 1 && MTy.isVector() &&
5602 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5603 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5604 Ty = FixedVectorType::get(ValVTy->getElementType(),
5605 MTy.getVectorNumElements());
5606 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5607 ReductionCost *= LT.first - 1;
5608 NumVecElts = MTy.getVectorNumElements();
5609 }
5610
5611 // Now handle reduction with the legal type, taking into account size changes
5612 // at each level.
5613 while (NumVecElts > 1) {
5614 // Determine the size of the remaining vector we need to reduce.
5615 unsigned Size = NumVecElts * ScalarSize;
5616 NumVecElts /= 2;
5617 // If we're reducing from 256/512 bits, use an extract_subvector.
5618 if (Size > 128) {
5619 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5620 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
5621 CostKind, NumVecElts, SubTy);
5622 Ty = SubTy;
5623 } else if (Size == 128) {
5624 // Reducing from 128 bits is a permute of v2f64/v2i64.
5625 FixedVectorType *ShufTy;
5626 if (ValVTy->isFloatingPointTy())
5627 ShufTy =
5628 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5629 else
5630 ShufTy =
5631 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5632 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5633 CostKind, 0, nullptr);
5634 } else if (Size == 64) {
5635 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5636 FixedVectorType *ShufTy;
5637 if (ValVTy->isFloatingPointTy())
5638 ShufTy =
5639 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5640 else
5641 ShufTy =
5642 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5643 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5644 CostKind, 0, nullptr);
5645 } else {
5646 // Reducing from smaller size is a shift by immediate.
5647 auto *ShiftTy = FixedVectorType::get(
5648 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5649 ReductionCost += getArithmeticInstrCost(
5650 Instruction::LShr, ShiftTy, CostKind,
5651          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5652          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5653    }
5654
5655 // Add the arithmetic op for this level.
5656 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5657 }
5658
5659 // Add the final extract element to the cost.
5660 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5661 CostKind, 0, nullptr, nullptr);
5662}
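// Sketch of the halving ladder above (illustrative only, not an LLVM API):
// a power-of-two reduction over a legal type needs one shuffle (or shift)
// plus one arithmetic op per level, i.e. roughly 2*log2(NumElts) ops before
// the final element extract.
static unsigned exampleReductionLadderOps(unsigned NumElts) {
  unsigned Ops = 0;
  while (NumElts > 1) {
    NumElts /= 2;
    Ops += 2; // one shuffle/shift + one binop per level
  }
  return Ops;
}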
5663
5664InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5665                                          TTI::TargetCostKind CostKind,
5666                                          FastMathFlags FMF) {
5667 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5668 return getIntrinsicInstrCost(ICA, CostKind);
5669}
5670
5671InstructionCost
5672X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5673                                   FastMathFlags FMF,
5674                                   TTI::TargetCostKind CostKind) {
5675  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5676
5677 MVT MTy = LT.second;
5678
5679 int ISD;
5680 if (ValTy->isIntOrIntVectorTy()) {
5681 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5682 : ISD::SMIN;
5683 } else {
5684 assert(ValTy->isFPOrFPVectorTy() &&
5685           "Expected floating point or integer vector type.");
5686 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5687 ? ISD::FMINNUM
5688 : ISD::FMINIMUM;
5689 }
5690
5691  // We use the Intel Architecture Code Analyzer (IACA) to measure the
5692  // throughput and use that as the cost.
5693
5694 static const CostTblEntry SSE2CostTbl[] = {
5695 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5696 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5697 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5698 };
5699
5700 static const CostTblEntry SSE41CostTbl[] = {
5701 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5702 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5703 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5704 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5705 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5706 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5707 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5708 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5709 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5710 {ISD::SMIN, MVT::v16i8, 6},
5711 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5712 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5713 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5714 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5715 };
5716
5717 static const CostTblEntry AVX1CostTbl[] = {
5718 {ISD::SMIN, MVT::v16i16, 6},
5719 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5720 {ISD::SMIN, MVT::v32i8, 8},
5721 {ISD::UMIN, MVT::v32i8, 8},
5722 };
5723
5724 static const CostTblEntry AVX512BWCostTbl[] = {
5725 {ISD::SMIN, MVT::v32i16, 8},
5726 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5727 {ISD::SMIN, MVT::v64i8, 10},
5728 {ISD::UMIN, MVT::v64i8, 10},
5729 };
5730
5731 // Before legalizing the type, give a chance to look up illegal narrow types
5732 // in the table.
5733 // FIXME: Is there a better way to do this?
5734 EVT VT = TLI->getValueType(DL, ValTy);
5735 if (VT.isSimple()) {
5736 MVT MTy = VT.getSimpleVT();
5737 if (ST->hasBWI())
5738 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5739 return Entry->Cost;
5740
5741 if (ST->hasAVX())
5742 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5743 return Entry->Cost;
5744
5745 if (ST->hasSSE41())
5746 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5747 return Entry->Cost;
5748
5749 if (ST->hasSSE2())
5750 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5751 return Entry->Cost;
5752 }
5753
5754 auto *ValVTy = cast<FixedVectorType>(ValTy);
5755 unsigned NumVecElts = ValVTy->getNumElements();
5756
5757 auto *Ty = ValVTy;
5758 InstructionCost MinMaxCost = 0;
5759 if (LT.first != 1 && MTy.isVector() &&
5760 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5761    // Type needs to be split. We need LT.first - 1 operations.
5762 Ty = FixedVectorType::get(ValVTy->getElementType(),
5763 MTy.getVectorNumElements());
5764 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5765 MinMaxCost *= LT.first - 1;
5766 NumVecElts = MTy.getVectorNumElements();
5767 }
5768
5769 if (ST->hasBWI())
5770 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5771 return MinMaxCost + Entry->Cost;
5772
5773 if (ST->hasAVX())
5774 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5775 return MinMaxCost + Entry->Cost;
5776
5777 if (ST->hasSSE41())
5778 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5779 return MinMaxCost + Entry->Cost;
5780
5781 if (ST->hasSSE2())
5782 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5783 return MinMaxCost + Entry->Cost;
5784
5785 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5786
5787 // Special case power of 2 reductions where the scalar type isn't changed
5788 // by type legalization.
5789 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5790 ScalarSize != MTy.getScalarSizeInBits())
5791 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5792
5793 // Now handle reduction with the legal type, taking into account size changes
5794 // at each level.
5795 while (NumVecElts > 1) {
5796 // Determine the size of the remaining vector we need to reduce.
5797 unsigned Size = NumVecElts * ScalarSize;
5798 NumVecElts /= 2;
5799 // If we're reducing from 256/512 bits, use an extract_subvector.
5800 if (Size > 128) {
5801 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5802 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
5803 NumVecElts, SubTy);
5804 Ty = SubTy;
5805 } else if (Size == 128) {
5806 // Reducing from 128 bits is a permute of v2f64/v2i64.
5807 VectorType *ShufTy;
5808 if (ValTy->isFloatingPointTy())
5809 ShufTy =
5810          FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5811    else
5812 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5813 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5814 CostKind, 0, nullptr);
5815 } else if (Size == 64) {
5816 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5817 FixedVectorType *ShufTy;
5818 if (ValTy->isFloatingPointTy())
5819 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5820 else
5821 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5822 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
5823 CostKind, 0, nullptr);
5824 } else {
5825 // Reducing from smaller size is a shift by immediate.
5826 auto *ShiftTy = FixedVectorType::get(
5827 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5828 MinMaxCost += getArithmeticInstrCost(
5829 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5830          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5831          {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5832    }
5833
5834 // Add the arithmetic op for this level.
5835 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5836 }
5837
5838 // Add the final extract element to the cost.
5839 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5840 CostKind, 0, nullptr, nullptr);
5841}
5842
5843/// Calculate the cost of materializing a 64-bit value. This helper
5844/// method might only calculate a fraction of a larger immediate. Therefore it
5845/// is valid to return a cost of ZERO.
5846InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5847  if (Val == 0)
5848 return TTI::TCC_Free;
5849
5850 if (isInt<32>(Val))
5851 return TTI::TCC_Basic;
5852
5853 return 2 * TTI::TCC_Basic;
5854}
5855
5856InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5857                                          TTI::TargetCostKind CostKind) {
5858  assert(Ty->isIntegerTy());
5859
5860 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5861 if (BitSize == 0)
5862 return ~0U;
5863
5864  // Never hoist constants larger than 128 bits, because this might lead to
5865 // incorrect code generation or assertions in codegen.
5866  // FIXME: Create a cost model for types larger than i128 once the codegen
5867 // issues have been fixed.
5868 if (BitSize > 128)
5869 return TTI::TCC_Free;
5870
5871 if (Imm == 0)
5872 return TTI::TCC_Free;
5873
5874 // Sign-extend all constants to a multiple of 64-bit.
5875 APInt ImmVal = Imm;
5876 if (BitSize % 64 != 0)
5877 ImmVal = Imm.sext(alignTo(BitSize, 64));
5878
5879 // Split the constant into 64-bit chunks and calculate the cost for each
5880 // chunk.
5881  InstructionCost Cost = 0;
5882  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5883 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5884 int64_t Val = Tmp.getSExtValue();
5885 Cost += getIntImmCost(Val);
5886 }
5887 // We need at least one instruction to materialize the constant.
5888 return std::max<InstructionCost>(1, Cost);
5889}
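// Standalone sketch of the 64-bit chunking above (illustrative, not an LLVM
// API): a chunk is free if zero, one instruction if it fits a sign-extended
// imm32, and two (movabs-class) otherwise; the total cost has a floor of one
// instruction.
static int exampleImmChunkCost(long long Chunk) {
  if (Chunk == 0)
    return 0; // TCC_Free
  if (Chunk >= -2147483648LL && Chunk <= 2147483647LL)
    return 1; // TCC_Basic: fits in a sign-extended imm32
  return 2;   // 2 * TCC_Basic: needs a movabs-style materialization
}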
5890
5891InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5892                                              const APInt &Imm, Type *Ty,
5893                                              TTI::TargetCostKind CostKind,
5894                                              Instruction *Inst) {
5895 assert(Ty->isIntegerTy());
5896
5897 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5898 unsigned ImmBitWidth = Imm.getBitWidth();
5899
5900 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5901 // here, so that constant hoisting will ignore this constant.
5902 if (BitSize == 0)
5903 return TTI::TCC_Free;
5904
5905 unsigned ImmIdx = ~0U;
5906 switch (Opcode) {
5907 default:
5908 return TTI::TCC_Free;
5909 case Instruction::GetElementPtr:
5910 // Always hoist the base address of a GetElementPtr. This prevents the
5911 // creation of new constants for every base constant that gets constant
5912 // folded with the offset.
5913 if (Idx == 0)
5914 return 2 * TTI::TCC_Basic;
5915 return TTI::TCC_Free;
5916 case Instruction::Store:
5917 ImmIdx = 0;
5918 break;
5919 case Instruction::ICmp:
5920 // This is an imperfect hack to prevent constant hoisting of
5921 // compares that might be trying to check if a 64-bit value fits in
5922 // 32-bits. The backend can optimize these cases using a right shift by 32.
5923  // Ideally we would check the compare predicate here. There are also other
5924  // similar immediates the backend can use shifts for.
5925 if (Idx == 1 && ImmBitWidth == 64) {
5926 uint64_t ImmVal = Imm.getZExtValue();
5927 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5928 return TTI::TCC_Free;
5929 }
5930 ImmIdx = 1;
5931 break;
5932 case Instruction::And:
5933 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5934 // by using a 32-bit operation with implicit zero extension. Detect such
5935 // immediates here as the normal path expects bit 31 to be sign extended.
5936 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
5937 return TTI::TCC_Free;
5938 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
5939 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
5940 Imm.isMask())
5941 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
5942 ImmIdx = 1;
5943 break;
5944 case Instruction::Add:
5945 case Instruction::Sub:
5946 // For add/sub, we can use the opposite instruction for INT32_MIN.
5947 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
5948 return TTI::TCC_Free;
5949 ImmIdx = 1;
5950 break;
5951 case Instruction::UDiv:
5952 case Instruction::SDiv:
5953 case Instruction::URem:
5954 case Instruction::SRem:
5955 // Division by constant is typically expanded later into a different
5956 // instruction sequence. This completely changes the constants.
5957 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5958 return TTI::TCC_Free;
5959 case Instruction::Mul:
5960 case Instruction::Or:
5961 case Instruction::Xor:
5962 ImmIdx = 1;
5963 break;
5964 // Always return TCC_Free for the shift value of a shift instruction.
5965 case Instruction::Shl:
5966 case Instruction::LShr:
5967 case Instruction::AShr:
5968 if (Idx == 1)
5969 return TTI::TCC_Free;
5970 break;
5971 case Instruction::Trunc:
5972 case Instruction::ZExt:
5973 case Instruction::SExt:
5974 case Instruction::IntToPtr:
5975 case Instruction::PtrToInt:
5976 case Instruction::BitCast:
5977 case Instruction::PHI:
5978 case Instruction::Call:
5979 case Instruction::Select:
5980 case Instruction::Ret:
5981 case Instruction::Load:
5982 break;
5983 }
5984
5985 if (Idx == ImmIdx) {
5986 uint64_t NumConstants = divideCeil(BitSize, 64);
5987    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5988    return (Cost <= NumConstants * TTI::TCC_Basic)
5989 ? static_cast<int>(TTI::TCC_Free)
5990 : Cost;
5991 }
5992
5993 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5994}
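// For illustration: `and i64 %x, 4294967295` is reported as free above
// because the backend can usually lower it to a 32-bit move with implicit
// zero extension (e.g. `movl %eax, %eax`) instead of materializing the mask.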
5995
5996InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5997                                                const APInt &Imm, Type *Ty,
5998                                                TTI::TargetCostKind CostKind) {
5999  assert(Ty->isIntegerTy());
6000
6001 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6002 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6003 // here, so that constant hoisting will ignore this constant.
6004 if (BitSize == 0)
6005 return TTI::TCC_Free;
6006
6007 switch (IID) {
6008 default:
6009 return TTI::TCC_Free;
6010 case Intrinsic::sadd_with_overflow:
6011 case Intrinsic::uadd_with_overflow:
6012 case Intrinsic::ssub_with_overflow:
6013 case Intrinsic::usub_with_overflow:
6014 case Intrinsic::smul_with_overflow:
6015 case Intrinsic::umul_with_overflow:
6016 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6017 return TTI::TCC_Free;
6018 break;
6019 case Intrinsic::experimental_stackmap:
6020 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6021 return TTI::TCC_Free;
6022 break;
6023 case Intrinsic::experimental_patchpoint_void:
6024 case Intrinsic::experimental_patchpoint:
6025 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6026 return TTI::TCC_Free;
6027 break;
6028 }
6029 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6030}
6031
6032InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6033                                           TTI::TargetCostKind CostKind,
6034                                           const Instruction *I) {
6035  if (CostKind != TTI::TCK_RecipThroughput)
6036    return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6037 // Branches are assumed to be predicted.
6038 return TTI::TCC_Free;
6039}
6040
6041int X86TTIImpl::getGatherOverhead() const {
6042 // Some CPUs have more overhead for gather. The specified overhead is relative
6043 // to the Load operation. "2" is the number provided by Intel architects. This
6044 // parameter is used for cost estimation of Gather Op and comparison with
6045 // other alternatives.
6046  // TODO: Remove the explicit hasAVX512()? That would mean we would only
6047  // enable gather with a -march.
6048 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6049 return 2;
6050
6051 return 1024;
6052}
6053
6054int X86TTIImpl::getScatterOverhead() const {
6055 if (ST->hasAVX512())
6056 return 2;
6057
6058 return 1024;
6059}
6060
6061// Return an average cost of a Gather / Scatter instruction, maybe improved later.
6062InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6063                                            TTI::TargetCostKind CostKind,
6064                                            Type *SrcVTy, const Value *Ptr,
6065 Align Alignment,
6066 unsigned AddressSpace) {
6067
6068 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6069 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6070
6071 // Try to reduce index size from 64 bit (default for GEP)
6072 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6073 // operation will use 16 x 64 indices which do not fit in a zmm and needs
6074 // to split. Also check that the base pointer is the same for all lanes,
6075 // and that there's at most one variable index.
6076 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6077 unsigned IndexSize = DL.getPointerSizeInBits();
6078 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6079 if (IndexSize < 64 || !GEP)
6080 return IndexSize;
6081
6082 unsigned NumOfVarIndices = 0;
6083 const Value *Ptrs = GEP->getPointerOperand();
6084 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6085 return IndexSize;
6086 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6087 if (isa<Constant>(GEP->getOperand(I)))
6088 continue;
6089 Type *IndxTy = GEP->getOperand(I)->getType();
6090 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6091 IndxTy = IndexVTy->getElementType();
6092 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6093 !isa<SExtInst>(GEP->getOperand(I))) ||
6094 ++NumOfVarIndices > 1)
6095 return IndexSize; // 64
6096 }
6097 return (unsigned)32;
6098 };
6099
6100 // Trying to reduce IndexSize to 32 bits for vector 16.
6101 // By default the IndexSize is equal to pointer size.
6102 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6103 ? getIndexSizeInBits(Ptr, DL)
6104                           : DL.getPointerSizeInBits();
6105
6106 auto *IndexVTy = FixedVectorType::get(
6107 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6108 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6109 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6110 InstructionCost::CostType SplitFactor =
6111 *std::max(IdxsLT.first, SrcLT.first).getValue();
6112 if (SplitFactor > 1) {
6113 // Handle splitting of vector of pointers
6114 auto *SplitSrcTy =
6115 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6116 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6117 Alignment, AddressSpace);
6118 }
6119
6120 // If we didn't split, this will be a single gather/scatter instruction.
6121  if (CostKind == TTI::TCK_CodeSize)
6122    return 1;
6123
6124 // The gather / scatter cost is given by Intel architects. It is a rough
6125  // number since we are looking at one instruction at a time.
6126 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6127 : getScatterOverhead();
6128 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6129 MaybeAlign(Alignment), AddressSpace,
6130 CostKind);
6131}
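// Simplified restatement of the unsplit case above (illustrative only, not
// an LLVM API): the vector cost is a fixed per-instruction overhead plus one
// scalar memory-op cost per lane, e.g. 2 + 8 * 1 = 10 for a VF=8 gather on a
// fast-gather subtarget.
static unsigned exampleGatherScatterCost(unsigned VF, unsigned ScalarMemCost,
                                         unsigned GSOverhead) {
  return GSOverhead + VF * ScalarMemCost;
}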
6132
6133/// Calculate the cost of Gather / Scatter operation
6134InstructionCost X86TTIImpl::getGatherScatterOpCost(
6135    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6136    Align Alignment, TTI::TargetCostKind CostKind,
6137    const Instruction *I = nullptr) {
6138 if ((Opcode == Instruction::Load &&
6139 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6140 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6141 Align(Alignment)))) ||
6142 (Opcode == Instruction::Store &&
6143 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6144 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6145 Align(Alignment)))))
6146 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6147 Alignment, CostKind, I);
6148
6149 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6150 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6151 if (!PtrTy && Ptr->getType()->isVectorTy())
6152 PtrTy = dyn_cast<PointerType>(
6153 cast<VectorType>(Ptr->getType())->getElementType());
6154 assert(PtrTy && "Unexpected type for Ptr argument");
6155 unsigned AddressSpace = PtrTy->getAddressSpace();
6156 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6157 AddressSpace);
6158}
6159
6160bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6161                               const TargetTransformInfo::LSRCost &C2) {
6162 // X86 specific here are "instruction number 1st priority".
6163 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6164 C1.NumIVMuls, C1.NumBaseAdds,
6165 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6166 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6167 C2.NumIVMuls, C2.NumBaseAdds,
6168 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6169}
6170
6171bool X86TTIImpl::canMacroFuseCmp() {
6172  return ST->hasMacroFusion() || ST->hasBranchFusion();
6173}
6174
6175bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6176 Type *ScalarTy = DataTy->getScalarType();
6177
6178 // The backend can't handle a single element vector w/o CFCMOV.
6179 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6180 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6181
6182 if (!ST->hasAVX())
6183 return false;
6184
6185 if (ScalarTy->isPointerTy())
6186 return true;
6187
6188 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6189 return true;
6190
6191 if (ScalarTy->isHalfTy() && ST->hasBWI())
6192 return true;
6193
6194 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6195 return true;
6196
6197 if (!ScalarTy->isIntegerTy())
6198 return false;
6199
6200 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6201 return IntWidth == 32 || IntWidth == 64 ||
6202 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6203}
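// Editor's note: what these rules admit, sketched for an AVX2-only subtarget
// (no AVX512BW): masked loads of <8 x float> or <4 x i64> are legal, while
// <8 x i16> is not, because i8/i16 elements additionally require AVX512BW in
// the integer-width check above.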
6204
6205bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6206 return isLegalMaskedLoad(DataType, Alignment);
6207}
6208
6209bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6210 unsigned DataSize = DL.getTypeStoreSize(DataType);
6211 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6212 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6213 // (the equivalent stores only require AVX).
6214 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6215 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6216
6217 return false;
6218}
6219
6220bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6221 unsigned DataSize = DL.getTypeStoreSize(DataType);
6222
6223 // SSE4A supports nontemporal stores of float and double at arbitrary
6224 // alignment.
6225 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6226 return true;
6227
6228 // Besides the SSE4A subtarget exception above, only aligned stores are
6229 // available nontemporally on any other subtarget. And only stores with a
6230 // size of 4..32 bytes (powers of 2 only) are permitted.
6231 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6232 !isPowerOf2_32(DataSize))
6233 return false;
6234
6235 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6236 // loads require AVX2).
6237 if (DataSize == 32)
6238 return ST->hasAVX();
6239 if (DataSize == 16)
6240 return ST->hasSSE1();
6241 return true;
6242}
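// Editor's note: examples of the rules above: a 32-byte store aligned to 32
// needs AVX (vmovntps ymm), a 16-byte store aligned to 16 needs SSE1
// (movntps), an aligned 8-byte store falls through to 'return true', and a
// scalar float/double store at arbitrary alignment is only accepted on SSE4A
// (movntss/movntsd).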
6243
6244bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6245 ElementCount NumElements) const {
6246 // movddup
6247 return ST->hasSSE3() && !NumElements.isScalable() &&
6248 NumElements.getFixedValue() == 2 &&
6249 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6250}
6251
6252bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6253 if (!isa<VectorType>(DataTy))
6254 return false;
6255
6256 if (!ST->hasAVX512())
6257 return false;
6258
6259 // The backend can't handle a single element vector.
6260 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6261 return false;
6262
6263 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6264
6265 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6266 return true;
6267
6268 if (!ScalarTy->isIntegerTy())
6269 return false;
6270
6271 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6272 return IntWidth == 32 || IntWidth == 64 ||
6273 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6274}
6275
6276bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6277 return isLegalMaskedExpandLoad(DataTy, Alignment);
6278}
6279
6280bool X86TTIImpl::supportsGather() const {
6281 // Some CPUs have better gather performance than others.
6282 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6283 // enable gather with a -march.
6284 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6285}
6286
6287bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6288 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6289 // A vector-4 gather/scatter instruction does not exist on KNL. We could
6290 // extend it to 8 elements, but zeroing the upper bits of the mask vector
6291 // would add more instructions. Right now we give vector-4 the scalar cost
6292 // on KNL. TODO: check whether the gather/scatter instruction is better in
6293 // the VariableMask case.
6294 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6295 return NumElts == 1 ||
6296 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6297}
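// Editor's note: concrete cases of the checks above: a <2 x float> gather is
// forced to scalarize on any AVX512 subtarget, and a <4 x float> gather as
// well on KNL, which has AVX512F but not AVX512VL.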
6298
6299bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6300 Type *ScalarTy = DataTy->getScalarType();
6301 if (ScalarTy->isPointerTy())
6302 return true;
6303
6304 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6305 return true;
6306
6307 if (!ScalarTy->isIntegerTy())
6308 return false;
6309
6310 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6311 return IntWidth == 32 || IntWidth == 64;
6312}
6313
6314bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6315 if (!supportsGather() || !ST->preferGather())
6316 return false;
6317 return isLegalMaskedGatherScatter(DataTy, Alignment);
6318}
6319
6320bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6321 unsigned Opcode1,
6322 const SmallBitVector &OpcodeMask) const {
6323 // ADDSUBPS 4xf32 SSE3
6324 // VADDSUBPS 4xf32 AVX
6325 // VADDSUBPS 8xf32 AVX
6326 // ADDSUBPD 2xf64 SSE3
6327 // VADDSUBPD 2xf64 AVX
6328 // VADDSUBPD 4xf64 AVX
6329
6330 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6331 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6332 if (!isPowerOf2_32(NumElements))
6333 return false;
6334 // Check the opcode pattern. We apply the mask on the opcode arguments and
6335 // then check if it is what we expect.
6336 for (int Lane : seq<int>(0, NumElements)) {
6337 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6338 // We expect FSub for even lanes and FAdd for odd lanes.
6339 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6340 return false;
6341 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6342 return false;
6343 }
6344 // Now check that the pattern is supported by the target ISA.
6345 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6346 if (ElemTy->isFloatTy())
6347 return ST->hasSSE3() && NumElements % 4 == 0;
6348 if (ElemTy->isDoubleTy())
6349 return ST->hasSSE3() && NumElements % 2 == 0;
6350 return false;
6351}
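// Editor's note: e.g. VecTy = <4 x float>, Opcode0 = FSub, Opcode1 = FAdd,
// OpcodeMask = 0b1010 (lanes 1 and 3 set): even lanes are FSub and odd lanes
// FAdd, which is exactly the (V)ADDSUBPS lane pattern, so this returns true
// on SSE3-capable subtargets.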
6352
6353bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6354 // AVX2 doesn't support scatter
6355 if (!ST->hasAVX512() || !ST->preferScatter())
6356 return false;
6357 return isLegalMaskedGatherScatter(DataType, Alignment);
6358}
6359
6360bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6361 EVT VT = TLI->getValueType(DL, DataType);
6362 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6363}
6364
6365bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6366 // FDIV is always expensive, even if it has a very low uop count.
6367 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6368 if (I->getOpcode() == Instruction::FDiv)
6369 return true;
6370
6371 return BaseT::isExpensiveToSpeculativelyExecute(I);
6372}
6373
6374bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6375 return false;
6376}
6377
6378bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6379 const Function *Callee) const {
6380 const TargetMachine &TM = getTLI()->getTargetMachine();
6381
6382 // Work this as a subsetting of subtarget features.
6383 const FeatureBitset &CallerBits =
6384 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6385 const FeatureBitset &CalleeBits =
6386 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6387
6388 // Check whether features are the same (apart from the ignore list).
6389 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6390 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6391 if (RealCallerBits == RealCalleeBits)
6392 return true;
6393
6394 // If the features are a subset, we need to additionally check for calls
6395 // that may become ABI-incompatible as a result of inlining.
6396 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6397 return false;
6398
6399 for (const Instruction &I : instructions(Callee)) {
6400 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6401 // Having more target features is fine for inline ASM.
6402 if (CB->isInlineAsm())
6403 continue;
6404
6405 SmallVector<Type *, 8> Types;
6406 for (Value *Arg : CB->args())
6407 Types.push_back(Arg->getType());
6408 if (!CB->getType()->isVoidTy())
6409 Types.push_back(CB->getType());
6410
6411 // Simple types are always ABI compatible.
6412 auto IsSimpleTy = [](Type *Ty) {
6413 return !Ty->isVectorTy() && !Ty->isAggregateType();
6414 };
6415 if (all_of(Types, IsSimpleTy))
6416 continue;
6417
6418 if (Function *NestedCallee = CB->getCalledFunction()) {
6419 // Assume that intrinsics are always ABI compatible.
6420 if (NestedCallee->isIntrinsic())
6421 continue;
6422
6423 // Do a precise compatibility check.
6424 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6425 return false;
6426 } else {
6427 // We don't know the target features of the callee,
6428 // assume it is incompatible.
6429 return false;
6430 }
6431 }
6432 }
6433 return true;
6434}
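// Editor's note: an illustration with assumed attribute strings: a caller
// built with target-features=+avx2 may inline a callee built with +sse4.2
// (the callee's features are a subset), but not vice versa; and even the
// subset case is rejected if the callee calls an unknown function with
// vector arguments, per the ABI loop above.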
6435
6436bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6437 const Function *Callee,
6438 const ArrayRef<Type *> &Types) const {
6439 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6440 return false;
6441
6442 // If we get here, we know the target features match. If one function
6443 // considers 512-bit vectors legal and the other does not, consider them
6444 // incompatible.
6445 const TargetMachine &TM = getTLI()->getTargetMachine();
6446
6447 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6448 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6449 return true;
6450
6451 // Consider the arguments compatible if they aren't vectors or aggregates.
6452 // FIXME: Look at the size of vectors.
6453 // FIXME: Look at the element types of aggregates to see if there are vectors.
6454 return llvm::none_of(Types,
6455 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6456}
6457
6458TTI::MemCmpExpansionOptions
6459X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6460 TTI::MemCmpExpansionOptions Options;
6461 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6462 Options.NumLoadsPerBlock = 2;
6463 // All GPR and vector loads can be unaligned.
6464 Options.AllowOverlappingLoads = true;
6465 if (IsZeroCmp) {
6466 // Only enable vector loads for equality comparison. Right now the vector
6467 // version is not as fast for three-way compare (see #33329).
6468 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6469 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6470 Options.LoadSizes.push_back(64);
6471 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6472 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6473 }
6474 if (ST->is64Bit()) {
6475 Options.LoadSizes.push_back(8);
6476 }
6477 Options.LoadSizes.push_back(4);
6478 Options.LoadSizes.push_back(2);
6479 Options.LoadSizes.push_back(1);
6480 return Options;
6481}
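// Editor's note: e.g. with AVX2 and a preferred vector width of 256,
// LoadSizes is {32, 16, 8, 4, 2, 1} on 64-bit targets, so an equality
// memcmp of 31 bytes can be expanded into two overlapping 16-byte loads per
// buffer (offsets 0 and 15), which AllowOverlappingLoads permits.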
6482
6483bool X86TTIImpl::prefersVectorizedAddressing() const {
6484 return supportsGather();
6485}
6486
6487bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6488 return false;
6489}
6490
6491bool X86TTIImpl::enableInterleavedAccessVectorization() {
6492 // TODO: We expect this to be beneficial regardless of arch,
6493 // but there are currently some unexplained performance artifacts on Atom.
6494 // As a temporary solution, disable on Atom.
6495 return !(ST->isAtom());
6496}
6497
6498// Get an estimate for interleaved load/store operations and strided loads.
6499// \p Indices contains indices for strided load.
6500// \p Factor - the factor of interleaving.
6501// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6502InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6503 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6504 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6505 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6506 // VecTy for interleave memop is <VF*Factor x Elt>.
6507 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6508 // VecTy = <12 x i32>.
6509
6510 // Calculate the number of memory operations (NumOfMemOps) required
6511 // to load/store the VecTy.
6512 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6513 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6514 unsigned LegalVTSize = LegalVT.getStoreSize();
6515 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6516
6517 // Get the cost of one memory operation.
6518 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6519 LegalVT.getVectorNumElements());
6520 InstructionCost MemOpCost;
6521 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6522 if (UseMaskedMemOp)
6523 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6524 AddressSpace, CostKind);
6525 else
6526 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6527 AddressSpace, CostKind);
6529 unsigned VF = VecTy->getNumElements() / Factor;
6530 MVT VT =
6531 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6532
6533 InstructionCost MaskCost;
6534 if (UseMaskedMemOp) {
6535 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6536 for (unsigned Index : Indices) {
6537 assert(Index < Factor && "Invalid index for interleaved memory op");
6538 for (unsigned Elm = 0; Elm < VF; Elm++)
6539 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6540 }
6541
6542 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6543
6544 MaskCost = getReplicationShuffleCost(
6545 I1Type, Factor, VF,
6546 UseMaskForGaps ? DemandedLoadStoreElts
6547 : APInt::getAllOnes(VecTy->getNumElements()),
6548 CostKind);
6549
6550 // The Gaps mask is invariant and created outside the loop, therefore the
6551 // cost of creating it is not accounted for here. However, if we have both
6552 // a MaskForGaps and some other mask that guards the execution of the
6553 // memory access, we need to account for the cost of And-ing the two masks
6554 // inside the loop.
6555 if (UseMaskForGaps) {
6556 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6557 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6558 }
6559 }
6560
6561 if (Opcode == Instruction::Load) {
6562 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6563 // contain the cost of the optimized shuffle sequence that the
6564 // X86InterleavedAccess pass will generate.
6565 // The cost of the loads and stores is computed separately from the table.
6566
6567 // X86InterleavedAccess supports only the following interleaved-access groups.
6568 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6569 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6570 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6571 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
6572 };
6573
6574 if (const auto *Entry =
6575 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6576 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6577 // If an entry does not exist, fall back to the default implementation.
6578
6579 // The kind of shuffle depends on the number of loaded values.
6580 // If we load the entire data in one register, we can use a 1-src shuffle.
6581 // Otherwise, we'll merge 2 sources in each operation.
6582 TTI::ShuffleKind ShuffleKind =
6583 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6584
6585 InstructionCost ShuffleCost =
6586 getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
6587
6588 unsigned NumOfLoadsInInterleaveGrp =
6589 Indices.size() ? Indices.size() : Factor;
6590 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6591 VecTy->getNumElements() / Factor);
6592 InstructionCost NumOfResults =
6593 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6594
6595 // About half of the loads may be folded into shuffles when we have only
6596 // one result. If we have more than one result, or the loads are masked,
6597 // we do not fold loads at all.
6598 unsigned NumOfUnfoldedLoads =
6599 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6600
6601 // Get the number of shuffle operations per result.
6602 unsigned NumOfShufflesPerResult =
6603 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6604
6605 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6606 // When we have more than one destination, we need additional instructions
6607 // to keep sources.
6608 InstructionCost NumOfMoves = 0;
6609 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6610 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6611
6612 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6613 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6614 NumOfMoves;
6615
6616 return Cost;
6617 }
6618
6619 // Store.
6620 assert(Opcode == Instruction::Store &&
6621 "Expected Store Instruction at this point");
6622 // X86InterleavedAccess supports only the following interleaved-access groups.
6623 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6624 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6625 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6626 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6627
6628 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6629 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6630 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6631 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6632 };
6633
6634 if (const auto *Entry =
6635 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6636 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6637 // If an entry does not exist, fall back to the default implementation.
6638
6639 // There are no strided stores at the moment, and a store can't be folded
6640 // into a shuffle.
6641 unsigned NumOfSources = Factor; // The number of values to be merged.
6642 InstructionCost ShuffleCost = getShuffleCost(
6643 TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
6644 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6645
6646 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6647 // We need additional instructions to keep sources.
6648 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6649 InstructionCost Cost =
6650 MaskCost +
6651 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6652 NumOfMoves;
6653 return Cost;
6654}
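// Editor's note: a worked load example for the AVX512 path above: for
// VecTy = <48 x i8> and Factor = 3 we get VF = 16 and VT = v16i8, so
// AVX512InterleavedLoadTbl hits {3, MVT::v16i8, 12} and the returned cost is
// MaskCost + NumOfMemOps * MemOpCost + 12; without a table entry the generic
// shuffle-counting formula after the lookup is used instead.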
6655
6656InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6657 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6658 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6659 bool UseMaskForCond, bool UseMaskForGaps) {
6660 auto *VecTy = cast<FixedVectorType>(BaseTy);
6661
6662 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6663 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6664 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6665 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6666 return true;
6667 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6668 return ST->hasBWI();
6669 if (EltTy->isBFloatTy())
6670 return ST->hasBF16();
6671 return false;
6672 };
6673 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6674 return getInterleavedMemoryOpCostAVX512(
6675 Opcode, VecTy, Factor, Indices, Alignment,
6676 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6677
6678 if (UseMaskForCond || UseMaskForGaps)
6679 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6680 Alignment, AddressSpace, CostKind,
6681 UseMaskForCond, UseMaskForGaps);
6682
6683 // Get estimation for interleaved load/store operations for SSE-AVX2.
6684 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6685 // computing the cost using a generic formula as a function of generic
6686 // shuffles. We therefore use a lookup table instead, filled according to
6687 // the instruction sequences that codegen currently generates.
6688
6689 // VecTy for interleave memop is <VF*Factor x Elt>.
6690 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6691 // VecTy = <12 x i32>.
6692 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6693
6694 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6695 // the VF=2, while v2i128 is an unsupported MVT vector type
6696 // (see MachineValueType.h::getVectorVT()).
6697 if (!LegalVT.isVector())
6698 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6699 Alignment, AddressSpace, CostKind);
6700
6701 unsigned VF = VecTy->getNumElements() / Factor;
6702 Type *ScalarTy = VecTy->getElementType();
6703 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6704 if (!ScalarTy->isIntegerTy())
6705 ScalarTy =
6706 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6707
6708 // Get the cost of all the memory operations.
6709 // FIXME: discount dead loads.
6710 InstructionCost MemOpCosts = getMemoryOpCost(
6711 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6712
6713 auto *VT = FixedVectorType::get(ScalarTy, VF);
6714 EVT ETy = TLI->getValueType(DL, VT);
6715 if (!ETy.isSimple())
6716 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6717 Alignment, AddressSpace, CostKind);
6718
6719 // TODO: Complete for other data-types and strides.
6720 // Each combination of Stride, element bit width and VF results in a different
6721 // sequence; the cost tables are therefore accessed with:
6722 // Factor (stride) and VectorType=VFxiN.
6723 // The Cost accounts only for the shuffle sequence;
6724 // The cost of the loads/stores is accounted for separately.
6725 //
6726 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6727 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6728 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6729 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6730 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6731 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6732
6733 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6734 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6735 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6736
6737 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6738 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6739 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6740
6741 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6742 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6743 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6744 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6745
6746 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6747 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6748 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6749 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6750 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6751
6752 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6753 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6754 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6755 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6756 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6757
6758 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6759 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6760 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6761 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6762 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6763
6764 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6765 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6766 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6767 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6768
6769 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6770 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6771 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6772 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6773 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6774
6775 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6776 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6777 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6778 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6779 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6780
6781 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6782 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6783 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6784 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6785 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6786
6787 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6788 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6789 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6790 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6791
6792 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6793 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6794 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6795 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6796 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6797
6798 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6799 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6800 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6801 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6802 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6803
6804 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6805 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6806 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6807 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6808
6809 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6810 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6811 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6812
6813 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6814 };
6815
6816 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6817 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6818 };
6819
6820 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6821 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6822 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6823
6824 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6825 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6826
6827 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6828 };
6829
6830 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6831 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6832 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6833
6834 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6835 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6836 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6837
6838 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6839 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6840 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6841 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6842
6843 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6844 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6845 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6846 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6847 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6848
6849 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6850 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6851 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6852 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6853 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6854
6855 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6856 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6857 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6858 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6859 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6860
6861 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6862 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6863 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6864 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6865 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6866
6867 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6868 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6869 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6870 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6871
6872 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6873 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6874 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6875 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6876 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6877
6878 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6879 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6880 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6881 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6882 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6883
6884 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6885 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6886 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6887 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6888 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6889
6890 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6891 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6892 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6893 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6894
6895 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6896 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6897 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6898 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6899 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6900
6901 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6902 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6903 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6904 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6905 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6906
6907 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6908 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6909 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6910 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6911
6912 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6913 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6914 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6915 };
6916
6917 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6918 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6919 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6920 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6921
6922 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6923 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6924
6925 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6926 };
6927
6928 if (Opcode == Instruction::Load) {
6929 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6930 MemOpCosts](const CostTblEntry *Entry) {
6931 // NOTE: this is just an approximation!
6932 // It can over- or under-estimate the cost!
6933 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6934 };
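// Editor's note (hypothetical numbers): with Factor = 3 but only
// NumMembers = 2 members actually used, a table entry of cost 11 is
// discounted to divideCeil(2 * 11, 3) == 8 on top of MemOpCosts,
// approximating that the unused member's shuffles are dead.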
6935
6936 if (ST->hasAVX2())
6937 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6938 ETy.getSimpleVT()))
6939 return GetDiscountedCost(Entry);
6940
6941 if (ST->hasSSSE3())
6942 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6943 ETy.getSimpleVT()))
6944 return GetDiscountedCost(Entry);
6945
6946 if (ST->hasSSE2())
6947 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6948 ETy.getSimpleVT()))
6949 return GetDiscountedCost(Entry);
6950 } else {
6951 assert(Opcode == Instruction::Store &&
6952 "Expected Store Instruction at this point");
6953 assert((!Indices.size() || Indices.size() == Factor) &&
6954 "Interleaved store only supports fully-interleaved groups.");
6955 if (ST->hasAVX2())
6956 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6957 ETy.getSimpleVT()))
6958 return MemOpCosts + Entry->Cost;
6959
6960 if (ST->hasSSE2())
6961 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6962 ETy.getSimpleVT()))
6963 return MemOpCosts + Entry->Cost;
6964 }
6965
6966 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6967 Alignment, AddressSpace, CostKind,
6968 UseMaskForCond, UseMaskForGaps);
6969}
6970
6971InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6972 StackOffset BaseOffset,
6973 bool HasBaseReg, int64_t Scale,
6974 unsigned AddrSpace) const {
6975 // Scaling factors are not free at all.
6976 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6977 // will take 2 allocations in the out-of-order engine instead of 1
6978 // for plain addressing mode, i.e. inst (reg1).
6979 // E.g.,
6980 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6981 // Requires two allocations (one for the load, one for the computation)
6982 // whereas:
6983 // vaddps (%rsi), %ymm0, %ymm1
6984 // Requires just 1 allocation, i.e., freeing allocations for other operations
6985 // and having fewer micro-operations to execute.
6986 //
6987 // For some X86 architectures, this is even worse because for instance for
6988 // stores, the complex addressing mode forces the instruction to use the
6989 // "load" ports instead of the dedicated "store" port.
6990 // E.g., on Haswell:
6991 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6992 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6993 TargetLoweringBase::AddrMode AM;
6994 AM.BaseGV = BaseGV;
6995 AM.BaseOffs = BaseOffset.getFixed();
6996 AM.HasBaseReg = HasBaseReg;
6997 AM.Scale = Scale;
6998 AM.ScalableOffset = BaseOffset.getScalable();
6999 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7000 // Scale represents reg2 * scale, thus account for 1
7001 // as soon as we use a second register.
7002 return AM.Scale != 0;
7003 return -1;
7004}
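// Editor's note: the return contract above, spelled out: a legal addressing
// mode costs 0 when no scaled register is used (AM.Scale == 0) and 1 when
// one is, while an illegal mode returns a negative (invalid) cost.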
7005
7006InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7007 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7008 return 14;
7009}
7010
7011bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7012 unsigned Bits = Ty->getScalarSizeInBits();
7013
7014 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7015 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7016 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7017 return false;
7018
7019 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7020 // shifts just as cheap as scalar ones.
7021 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7022 return false;
7023
7024 // AVX512BW has shifts such as vpsllvw.
7025 if (ST->hasBWI() && Bits == 16)
7026 return false;
7027
7028 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7029 // fully general vector.
7030 return true;
7031}
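// Editor's note: concretely, a plain AVX2 subtarget returns false for
// v8i32/v4i64 (vpsllvd/vpsllvq make variable shifts cheap) but true for
// v16i16, where a uniform amount lets codegen use the cheaper
// shift-by-scalar form (e.g. vpsllw with an xmm/imm count).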
7032
7033unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7034 Type *ScalarValTy) const {
7035 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7036 return 4;
7037 }
7038 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7039}
7040
7041bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7042 SmallVectorImpl<Use *> &Ops) const {
7043 using namespace llvm::PatternMatch;
7044
7045 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7046 if (!VTy)
7047 return false;
7048
7049 if (I->getOpcode() == Instruction::Mul &&
7050 VTy->getElementType()->isIntegerTy(64)) {
7051 for (auto &Op : I->operands()) {
7052 // Make sure we are not already sinking this operand
7053 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7054 continue;
7055
7056 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7057 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7058 if (ST->hasSSE41() &&
7059 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7060 m_SpecificInt(32)))) {
7061 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7062 Ops.push_back(&Op);
7063 } else if (ST->hasSSE2() &&
7064 match(Op.get(),
7065 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7066 Ops.push_back(&Op);
7067 }
7068 }
7069
7070 return !Ops.empty();
7071 }
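// Editor's note: the sext_inreg pattern matched above, as hypothetical IR
// that lets SDAG select PMULDQ:
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>
//   %m = mul <2 x i64> %a, %b
// Sinking the shl/ashr next to the mul keeps the pattern visible during
// instruction selection.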
7072
7073 // A uniform shift amount in a vector shift or funnel shift may be much
7074 // cheaper than a generic variable vector shift, so make that pattern visible
7075 // to SDAG by sinking the shuffle instruction next to the shift.
7076 int ShiftAmountOpNum = -1;
7077 if (I->isShift())
7078 ShiftAmountOpNum = 1;
7079 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7080 if (II->getIntrinsicID() == Intrinsic::fshl ||
7081 II->getIntrinsicID() == Intrinsic::fshr)
7082 ShiftAmountOpNum = 2;
7083 }
7084
7085 if (ShiftAmountOpNum == -1)
7086 return false;
7087
7088 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7089 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7090 isVectorShiftByScalarCheap(I->getType())) {
7091 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7092 return true;
7093 }
7094
7095 return false;
7096}
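// Editor's note: a hypothetical input for the shuffle-sinking case above:
//   %amt = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
//   %shl = shl <4 x i32> %x, %amt
// When isVectorShiftByScalarCheap holds for the type, sinking %amt next to
// the shift lets SDAG select a shift-by-scalar instead of a variable shift.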
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
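Clients reach these overrides through the generic TargetTransformInfo facade; a hedged sketch (the helper name addCost is hypothetical):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

InstructionCost addCost(const TargetTransformInfo &TTI, Type *VecTy) {
  // On X86 this dispatches to X86TTIImpl::getArithmeticInstrCost.
  return TTI.getArithmeticInstrCost(Instruction::Add, VecTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}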
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
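A small sketch of fixed vs. scalable TypeSize queries:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

void typeSizeQueries() {
  TypeSize Fixed = TypeSize::getFixed(128);
  TypeSize Scalable = TypeSize::getScalable(128); // 128 * vscale bits
  bool S = Scalable.isScalable(); // true
  // getFixedValue() asserts !isScalable(); use getKnownMinValue() otherwise.
  uint64_t F = Fixed.getFixedValue();         // 128
  uint64_t Min = Scalable.getKnownMinValue(); // 128, the known minimum
  (void)S; (void)F; (void)Min;
}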
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
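A sketch of APIntOps::ScaleBitMask, assuming widening splats each bit and narrowing ORs neighboring bits (ANDs them when MatchAllBits is true):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void scaleBitMask() {
  APInt Mask(4, 0b1010);
  // Widening: each source bit covers 2 result bits -> 0b11001100.
  APInt Wide = APIntOps::ScaleBitMask(Mask, 8);
  // Narrowing: neighboring bits are merged back -> 0b1010.
  APInt Narrow = APIntOps::ScaleBitMask(Wide, 4);
  (void)Narrow;
}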
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
RESULT, BOOL = [SU]MULO(LHS, RHS) - Same as [SU]ADDO above, but for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
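A sketch combining the matchers above: recognize a one-use (X << C) with a constant splat shift amount (poison lanes tolerated); the helper name is illustrative:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool isOneUseShlByConst(Value *V, Value *&X, const APInt *&C) {
  return match(V, m_OneUse(m_Shl(m_Value(X), m_APIntAllowPoison(C))));
}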
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
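A hedged sketch of the lookup idiom this file builds on; the table contents are illustrative, not the real X86 numbers:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static const CostTblEntry ExampleCostTbl[] = {
    {ISD::ADD, MVT::v8i32, 1},
    {ISD::SHL, MVT::v8i32, 2},
};

unsigned lookupExample() {
  // Returns the matching entry, or nullptr when (ISD, MVT) is absent.
  if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISD::ADD, MVT::v8i32))
    return Entry->Cost; // 1
  return ~0U;
}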
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
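A short sketch of enumerate (with all_of for contrast) over a shuffle mask:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

bool maskIsIdentity(ArrayRef<int> Mask) {
  // enumerate yields (index, element) pairs; all_of folds a predicate.
  return all_of(enumerate(Mask),
                [](auto P) { return P.value() == (int)P.index(); });
}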
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition: MathExtras.h:555
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
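Worked values for the arithmetic helpers above:

#include "llvm/Support/MathExtras.h"
using namespace llvm;

static_assert(alignDown(19u, 8u) == 16u); // largest multiple of 8 <= 19
static_assert(divideCeil(19u, 8u) == 3u); // ceil(19 / 8)
static_assert(isPowerOf2_32(64u));        // 64 is a power of two
// PowerOf2Ceil(19) == 32: the next power of two at or above 19.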
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is large enough to store Size bytes.
Definition: Alignment.h:155
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
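The conversion-table analogue, keyed by opcode plus destination and source MVT; entries are illustrative only:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    // ISD opcode, Dst, Src, Cost
    {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
    {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
};

unsigned convertCostExample() {
  if (const auto *Entry = ConvertCostTableLookup(
          ExampleConvTbl, ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16))
    return Entry->Cost; // 1
  return ~0U;
}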
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
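A sketch of both splat helpers (declared in llvm/Analysis/VectorUtils.h):

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void splatQueries(Value *V) {
  // Non-null when V is a splat, e.g. a splat shufflevector or constant.
  Value *Splat = getSplatValue(V);
  // All non-negative elements agree -> that index; otherwise -1.
  int Idx = getSplatIndex({2, 2, 2, 2}); // 2
  (void)Splat; (void)Idx;
}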
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
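A short EVT sketch; MVT converts implicitly to EVT:

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

void evtQueries(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 8); // v8f32
  if (VT.isSimple()) {
    MVT Simple = VT.getSimpleVT(); // MVT::v8f32
    (void)Simple;
  }
  TypeSize Bits = VT.getSizeInBits(); // fixed, 256 bits
  EVT Elt = VT.getScalarType();       // f32
  (void)Bits; (void)Elt;
}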
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
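A compact sketch of the alignment helpers above:

#include "llvm/Support/Alignment.h"
using namespace llvm;

void alignmentQueries() {
  Align A(16);
  MaybeAlign MA;                    // undefined alignment
  Align One = MA.valueOrOne();      // Align(1)
  uint64_t Sz = alignTo(100, A);    // 112, next multiple of 16
  Align C = commonAlignment(A, 24); // Align(8): offset 24 only preserves 8
  (void)One; (void)Sz; (void)C;
}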
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55