LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
103
105X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
106 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
107 // TODO: Currently the __builtin_popcount() implementation using SSE3
108 // instructions is inefficient. Once the problem is fixed, we should
109 // call ST->hasSSE3() instead of ST->hasPOPCNT().
110 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
111}
112
113std::optional<unsigned> X86TTIImpl::getCacheSize(
115 switch (Level) {
117 // - Penryn
118 // - Nehalem
119 // - Westmere
120 // - Sandy Bridge
121 // - Ivy Bridge
122 // - Haswell
123 // - Broadwell
124 // - Skylake
125 // - Kabylake
126 return 32 * 1024; // 32 KiB
128 // - Penryn
129 // - Nehalem
130 // - Westmere
131 // - Sandy Bridge
132 // - Ivy Bridge
133 // - Haswell
134 // - Broadwell
135 // - Skylake
136 // - Kabylake
137 return 256 * 1024; // 256 KiB
138 }
139
140 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
141}
142
143std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
145 // - Penryn
146 // - Nehalem
147 // - Westmere
148 // - Sandy Bridge
149 // - Ivy Bridge
150 // - Haswell
151 // - Broadwell
152 // - Skylake
153 // - Kabylake
154 switch (Level) {
156 [[fallthrough]];
158 return 8;
159 }
160
161 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
162}
163
165
// NOTE(review): the enclosing function signature (doxygen lines 164-166) was
// lost in extraction; this appears to be the tail of X86TTIImpl's
// register-class-for-type query -- confirm exact name/signature upstream.
// Mapping: vector requests -> VectorClass; scalar floating-point types ->
// ScalarFPClass; everything else (including a null Ty) -> GPRClass.
 167 return Vector ? VectorClass
 168 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
 169 : GPRClass;
 170}
171
172unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
173 if (ClassID == VectorClass && !ST->hasSSE1())
174 return 0;
175
176 if (!ST->is64Bit())
177 return 8;
178
179 if ((ClassID == GPRClass && ST->hasEGPR()) ||
180 (ClassID != GPRClass && ST->hasAVX512()))
181 return 32;
182
183 return 16;
184}
185
// NOTE(review): the function signature line (doxygen 186) was lost in
// extraction; presumably bool X86TTIImpl::hasConditionalLoadStoreForType(...)
// -- confirm against upstream.
// Without the CF (conditional-faulting, APX CFCMOV) feature there is no
// conditional load/store support at all.
 187 if (!ST->hasCF())
 188 return false;
// A null type is a generic capability query: CF is present, so say yes.
 189 if (!Ty)
 190 return true;
 191 // Conditional faulting is supported by CFCMOV, which only accepts
 192 // 16/32/64-bit operands.
 193 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
 194 // profitable.
// Accept scalar integers, or single-element fixed vectors (treated as
// scalars); anything else is rejected.
 195 auto *VTy = dyn_cast<FixedVectorType>(Ty);
 196 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
 197 return false;
// NOTE(review): a 1-element vector of non-integer would reach this
// cast<IntegerType> -- presumably callers never pass that; verify.
 198 auto *ScalarTy = Ty->getScalarType();
 199 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
 200 default:
 201 return false;
// CFCMOV operates on 16/32/64-bit operands only (no 8-bit form).
 202 case 16:
 203 case 32:
 204 case 64:
 205 return true;
 206 }
 207}
208
211 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
212 switch (K) {
214 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
216 if (ST->hasAVX512() && PreferVectorWidth >= 512)
217 return TypeSize::getFixed(512);
218 if (ST->hasAVX() && PreferVectorWidth >= 256)
219 return TypeSize::getFixed(256);
220 if (ST->hasSSE1() && PreferVectorWidth >= 128)
221 return TypeSize::getFixed(128);
222 return TypeSize::getFixed(0);
224 return TypeSize::getScalable(0);
225 }
226
227 llvm_unreachable("Unsupported register kind");
228}
229
234
236 // If the loop will not be vectorized, don't interleave the loop.
237 // Let regular unroll to unroll the loop, which saves the overflow
238 // check and memory check cost.
239 if (VF.isScalar())
240 return 1;
241
242 if (ST->isAtom())
243 return 1;
244
245 // Sandybridge and Haswell have multiple execution ports and pipelined
246 // vector units.
247 if (ST->hasAVX())
248 return 4;
249
250 return 2;
251}
252
254 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
256 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
257
258 // vXi8 multiplications are always promoted to vXi16.
259 // Sub-128-bit types can be extended/packed more efficiently.
260 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
261 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
262 Type *WideVecTy =
264 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
266 CostKind) +
267 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
269 CostKind) +
270 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
271 }
272
273 // Legalize the type.
274 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
275
276 int ISD = TLI->InstructionOpcodeToISD(Opcode);
277 assert(ISD && "Invalid opcode");
278
279 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
280 (LT.second.getScalarType() == MVT::i32 ||
281 LT.second.getScalarType() == MVT::i64)) {
282 // Check if the operands can be represented as a smaller datatype.
283 bool Op1Signed = false, Op2Signed = false;
284 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
285 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
286 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
287 bool SignedMode = Op1Signed || Op2Signed;
288
289 // If both vXi32 are representable as i15 and at least one is constant,
290 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
291 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
292 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
293 LT.second.getScalarType() == MVT::i32) {
294 bool Op1Constant =
295 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
296 bool Op2Constant =
297 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
298 bool Op1Sext = isa<SExtInst>(Args[0]) &&
299 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
300 bool Op2Sext = isa<SExtInst>(Args[1]) &&
301 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
302
303 bool IsZeroExtended = !Op1Signed || !Op2Signed;
304 bool IsConstant = Op1Constant || Op2Constant;
305 bool IsSext = Op1Sext || Op2Sext;
306 if (IsConstant || IsZeroExtended || IsSext)
307 LT.second =
308 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
309 }
310
311 // Check if the vXi32 operands can be shrunk into a smaller datatype.
312 // This should match the codegen from reduceVMULWidth.
313 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
314 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
315 if (OpMinSize <= 7)
316 return LT.first * 3; // pmullw/sext
317 if (!SignedMode && OpMinSize <= 8)
318 return LT.first * 3; // pmullw/zext
319 if (OpMinSize <= 15)
320 return LT.first * 5; // pmullw/pmulhw/pshuf
321 if (!SignedMode && OpMinSize <= 16)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 }
324
325 // If both vXi64 are representable as (unsigned) i32, then we can perform
326 // the multiple with a single PMULUDQ instruction.
327 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
328 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
330 }
331
332 // Vector multiply by pow2 will be simplified to shifts.
333 // Vector multiply by -pow2 will be simplified to shifts/negates.
334 if (ISD == ISD::MUL && Op2Info.isConstant() &&
335 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
337 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
338 Op1Info.getNoProps(), Op2Info.getNoProps());
339 if (Op2Info.isNegatedPowerOf2())
340 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
341 return Cost;
342 }
343
344 // On X86, vector signed division by constants power-of-two are
345 // normally expanded to the sequence SRA + SRL + ADD + SRA.
346 // The OperandValue properties may not be the same as that of the previous
347 // operation; conservatively assume OP_None.
348 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
349 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
351 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
352 Op1Info.getNoProps(), Op2Info.getNoProps());
353 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357
358 if (ISD == ISD::SREM) {
359 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
360 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
361 Op2Info.getNoProps());
362 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 }
365
366 return Cost;
367 }
368
369 // Vector unsigned division/remainder will be simplified to shifts/masks.
370 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
371 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
372 if (ISD == ISD::UDIV)
373 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
374 Op1Info.getNoProps(), Op2Info.getNoProps());
375 // UREM
376 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
377 Op1Info.getNoProps(), Op2Info.getNoProps());
378 }
379
380 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
381 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
382 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
383 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 };
391
392 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
393 if (const auto *Entry =
394 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
395 if (auto KindCost = Entry->Cost[CostKind])
396 return LT.first * *KindCost;
397
398 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
399 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
400 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
401 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
402 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
403 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
404 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
405 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
406 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
407 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
408
409 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
410 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
411 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
412 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
413 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
415 };
416
417 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
418 if (const auto *Entry =
419 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
420 if (auto KindCost = Entry->Cost[CostKind])
421 return LT.first * *KindCost;
422
423 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
424 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
425 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
426 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
427
428 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
429 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
430 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
431
432 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
433 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
434 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
435 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
436 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
437 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
438
439 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
440 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
441 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
442 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
443 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
444 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
445 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
446
447 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
448 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
449 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
450 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
451 };
452
453 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
454 if (const auto *Entry =
455 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
456 if (auto KindCost = Entry->Cost[CostKind])
457 return LT.first * *KindCost;
458
459 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
460 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
461 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
462 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
463 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
464 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
465 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
466
467 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
468 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
469 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
470 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
471 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
472 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
473
474 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
475 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
476 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
477 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
478 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
479 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
480
481 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
482 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
483 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
484 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
485 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
486 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
487
488 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
489 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
490 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
491 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
492 };
493
494 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
495 if (const auto *Entry =
496 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
497 if (auto KindCost = Entry->Cost[CostKind])
498 return LT.first * *KindCost;
499
500 static const CostKindTblEntry AVXUniformConstCostTable[] = {
501 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
502 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
503 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
504 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
505 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
506 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
507
508 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
509 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
510 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
511 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
512 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
513 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
514
515 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
516 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
517 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
518 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
519 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
520 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
521
522 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
523 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
524 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
525 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
526 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
527 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
528
529 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
530 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
531 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
532 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
533 };
534
535 // XOP has faster vXi8 shifts.
536 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
537 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
538 if (const auto *Entry =
539 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
540 if (auto KindCost = Entry->Cost[CostKind])
541 return LT.first * *KindCost;
542
543 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
544 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
545 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
546 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
547
548 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
549 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
550 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
551
552 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
553 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
554 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
555
556 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
557 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
558 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
559
560 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
561 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
562 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
563 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
564 };
565
566 // XOP has faster vXi8 shifts.
567 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
568 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
569 if (const auto *Entry =
570 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
571 if (auto KindCost = Entry->Cost[CostKind])
572 return LT.first * *KindCost;
573
574 static const CostKindTblEntry AVX512BWConstCostTable[] = {
575 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
576 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
577 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579
580 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
581 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
582 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
583 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
584 };
585
586 if (Op2Info.isConstant() && ST->hasBWI())
587 if (const auto *Entry =
588 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
589 if (auto KindCost = Entry->Cost[CostKind])
590 return LT.first * *KindCost;
591
592 static const CostKindTblEntry AVX512ConstCostTable[] = {
593 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
594 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
595 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597
598 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
599 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
600 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
601 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
602
603 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
604 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
605 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
606 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
607 };
608
609 if (Op2Info.isConstant() && ST->hasAVX512())
610 if (const auto *Entry =
611 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
612 if (auto KindCost = Entry->Cost[CostKind])
613 return LT.first * *KindCost;
614
615 static const CostKindTblEntry AVX2ConstCostTable[] = {
616 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
617 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
618 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620
621 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
622 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
624 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
627 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
628 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
629 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
630 };
631
632 if (Op2Info.isConstant() && ST->hasAVX2())
633 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
634 if (auto KindCost = Entry->Cost[CostKind])
635 return LT.first * *KindCost;
636
637 static const CostKindTblEntry AVXConstCostTable[] = {
638 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
639 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
640 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642
643 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
644 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
645 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
646 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
647
648 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
649 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
650 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
651 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
652 };
653
654 if (Op2Info.isConstant() && ST->hasAVX())
655 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
656 if (auto KindCost = Entry->Cost[CostKind])
657 return LT.first * *KindCost;
658
659 static const CostKindTblEntry SSE41ConstCostTable[] = {
660 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
661 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
662 };
663
664 if (Op2Info.isConstant() && ST->hasSSE41())
665 if (const auto *Entry =
666 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
667 if (auto KindCost = Entry->Cost[CostKind])
668 return LT.first * *KindCost;
669
670 static const CostKindTblEntry SSE2ConstCostTable[] = {
671 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
672 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
673 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675
676 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
677 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
678 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
679 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
680
681 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
682 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
683 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
684 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
685 };
686
687 if (Op2Info.isConstant() && ST->hasSSE2())
688 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
689 if (auto KindCost = Entry->Cost[CostKind])
690 return LT.first * *KindCost;
691
692 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
693 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
694 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
696 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
697 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
698 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
699 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
700 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
701 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
702
703 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
704 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
705 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
706 };
707
708 if (ST->hasBWI() && Op2Info.isUniform())
709 if (const auto *Entry =
710 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
711 if (auto KindCost = Entry->Cost[CostKind])
712 return LT.first * *KindCost;
713
714 static const CostKindTblEntry AVX512UniformCostTable[] = {
715 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
716 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
717 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
718
719 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
720 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
721 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
722
723 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
724 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
725 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
726 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
727 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
728 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
729 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
730 };
731
732 if (ST->hasAVX512() && Op2Info.isUniform())
733 if (const auto *Entry =
734 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
735 if (auto KindCost = Entry->Cost[CostKind])
736 return LT.first * *KindCost;
737
738 static const CostKindTblEntry AVX2UniformCostTable[] = {
739 // Uniform splats are cheaper for the following instructions.
740 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
741 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
742 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
743 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
744 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
745 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
746
747 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
748 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
749 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
750 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
751 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
752 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
753
754 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
755 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
756 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
757 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
758 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
759 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
760
761 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
762 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
763 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
764 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
765 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
766 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
767 };
768
769 if (ST->hasAVX2() && Op2Info.isUniform())
770 if (const auto *Entry =
771 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
772 if (auto KindCost = Entry->Cost[CostKind])
773 return LT.first * *KindCost;
774
775 static const CostKindTblEntry AVXUniformCostTable[] = {
776 { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
777 { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
778 { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
779 { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
780 { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
781 { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
782
783 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
784 { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
785 { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
786 { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
787 { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
788 { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
789
790 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
791 { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
792 { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
793 { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
794 { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
795 { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
796
797 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
798 { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
799 { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
800 { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
801 { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
802 { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
803 };
804
805 // XOP has faster vXi8 shifts.
806 if (ST->hasAVX() && Op2Info.isUniform() &&
807 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
808 if (const auto *Entry =
809 CostTableLookup(AVXUniformCostTable, ISD, LT.second))
810 if (auto KindCost = Entry->Cost[CostKind])
811 return LT.first * *KindCost;
812
813 static const CostKindTblEntry SSE2UniformCostTable[] = {
814 // Uniform splats are cheaper for the following instructions.
815 { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
816 { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
817 { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
818
819 { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
820 { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
821 { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
822
823 { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
824 { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
825 { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
826
827 { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
828 { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
829 { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
830 };
831
832 if (ST->hasSSE2() && Op2Info.isUniform() &&
833 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
834 if (const auto *Entry =
835 CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
836 if (auto KindCost = Entry->Cost[CostKind])
837 return LT.first * *KindCost;
838
839 static const CostKindTblEntry AVX512DQCostTable[] = {
840 { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
841 { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
842 { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
843 };
844
845 // Look for AVX512DQ lowering tricks for custom cases.
846 if (ST->hasDQI())
847 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
848 if (auto KindCost = Entry->Cost[CostKind])
849 return LT.first * *KindCost;
850
851 static const CostKindTblEntry AVX512BWCostTable[] = {
852 { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
853 { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
854 { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
855 { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
856 { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
857 { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
858 { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
859 { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
860 { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
861
862 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
863 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
864 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
865 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
866 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
867 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
868 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
869 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
870 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
871
872 { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
873 { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
874
875 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
876 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
877 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
878 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
879
880 { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
881 { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
882
883 { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
884 { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
885 { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
886 { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
887
888 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
889 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
890 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
891 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
892 };
893
894 // Look for AVX512BW lowering tricks for custom cases.
895 if (ST->hasBWI())
896 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
897 if (auto KindCost = Entry->Cost[CostKind])
898 return LT.first * *KindCost;
899
900 static const CostKindTblEntry AVX512CostTable[] = {
901 { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
902 { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
903 { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
904
905 { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
906 { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
907 { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
908
909 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
910 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
911 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
912 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
913 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
914 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
915 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
916 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
917 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
918
919 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
920 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
921 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
922 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
923 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
924 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
925 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
926 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
927 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
928
929 { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
930 { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
931
932 { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
933 { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
934
935 { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
936 { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
937 { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
938 { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
939
940 { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
941 { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
942 { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
943 { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
944
945 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
946 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
947 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
948 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
949
950 { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
951 { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
952 { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
953 { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
954 { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
955
956 { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
957
958 { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
959 { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
960 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
961 { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
962 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
963 { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
964 { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
965 { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
966 { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
967
968 { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
969 { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
970 { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
971 { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
972
973 { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
974 { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
975 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
976 { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
977 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
978 { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
979 { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
980 { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
981 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
982
983 { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
984 { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
985 { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
986 { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
987 };
988
989 if (ST->hasAVX512())
990 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
991 if (auto KindCost = Entry->Cost[CostKind])
992 return LT.first * *KindCost;
993
994 static const CostKindTblEntry AVX2ShiftCostTable[] = {
995 // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
996 // customize them to detect the cases where shift amount is a scalar one.
997 { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
998 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
999 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
1000 { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
1001 { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
1002 { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
1003 { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
1004 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
1005 { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
1006 { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
1007 };
1008
1009 if (ST->hasAVX512()) {
1010 if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
1011 // On AVX512, a packed v32i16 shift left by a constant build_vector
1012 // is lowered into a vector multiply (vpmullw).
1013 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1014 Op1Info.getNoProps(), Op2Info.getNoProps());
1015 }
1016
1017 // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
1018 if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
1019 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
1020 Op2Info.isConstant())
1021 // On AVX2, a packed v16i16 shift left by a constant build_vector
1022 // is lowered into a vector multiply (vpmullw).
1023 return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
1024 Op1Info.getNoProps(), Op2Info.getNoProps());
1025
1026 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
1027 if (auto KindCost = Entry->Cost[CostKind])
1028 return LT.first * *KindCost;
1029 }
1030
1031 static const CostKindTblEntry XOPShiftCostTable[] = {
1032 // 128bit shifts take 1cy, but right shifts require negation beforehand.
1033 { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
1034 { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
1035 { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
1036 { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
1037 { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
1038 { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
1039 { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
1040 { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
1041 { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
1042 { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
1043 { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
1044 { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
1045 // 256bit shifts require splitting if AVX2 didn't catch them above.
1046 { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
1047 { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
1048 { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
1049 { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
1050 { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
1051 { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
1052 { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
1053 { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
1054 { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
1055 { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
1056 { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
1057 { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
1058 };
1059
1060 // Look for XOP lowering tricks.
1061 if (ST->hasXOP()) {
1062 // If the right shift is constant then we'll fold the negation so
1063 // it's as cheap as a left shift.
1064 int ShiftISD = ISD;
1065 if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
1066 ShiftISD = ISD::SHL;
1067 if (const auto *Entry =
1068 CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
1069 if (auto KindCost = Entry->Cost[CostKind])
1070 return LT.first * *KindCost;
1071 }
1072
1073 if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
1074 MVT VT = LT.second;
1075 // Vector shift left by non uniform constant can be lowered
1076 // into vector multiply.
1077 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
1078 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
1079 ISD = ISD::MUL;
1080 }
1081
1082 static const CostKindTblEntry GLMCostTable[] = {
1083 { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
1084 { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
1085 { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
1086 { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
1087 };
1088
1089 if (ST->useGLMDivSqrtCosts())
1090 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
1091 if (auto KindCost = Entry->Cost[CostKind])
1092 return LT.first * *KindCost;
1093
1094 static const CostKindTblEntry SLMCostTable[] = {
1095 { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
1096 { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
1097 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
1098 { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
1099 { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
1100 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
1101 { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
1102 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
1103 { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
1104 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
1105 { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
1106 { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
1107 // v2i64/v4i64 mul is custom lowered as a series of long:
1108 // multiplies(3), shifts(3) and adds(2)
1109 // slm muldq version throughput is 2 and addq throughput 4
1110 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
1111 // 3X4 (addq throughput) = 17
1112 { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
1113 // slm addq\subq throughput is 4
1114 { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
1115 { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
1116 };
1117
1118 if (ST->useSLMArithCosts())
1119 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
1120 if (auto KindCost = Entry->Cost[CostKind])
1121 return LT.first * *KindCost;
1122
1123 static const CostKindTblEntry AVX2CostTable[] = {
1124 { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
1125 { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
1126 { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
1127 { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1128
1129 { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
1130 { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
1131 { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
1132 { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
1133
1134 { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
1135 { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
1136 { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
1137 { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
1138 { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
1139 { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
1140
1141 { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
1142 { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
1143 { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
1144 { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
1145 { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
1146 { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
1147 { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
1148 { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
1149
1150 { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
1151 { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
1152 { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
1153 { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
1154 { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
1155 { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
1156 { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
1157
1158 { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
1159
1160 { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
1161 { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
1162
1163 { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
1164 { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
1165 { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
1166 { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
1167 { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
1168 { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
1169
1170 { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
1171 { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
1172 { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
1173 { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
1174 { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
1175 { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
1176
1177 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
1178 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
1179 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
1180 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
1181 { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
1182 { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
1183
1184 { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
1185 { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
1186 { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
1187 { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
1188 { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
1189 { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
1190 };
1191
1192 // Look for AVX2 lowering tricks for custom cases.
1193 if (ST->hasAVX2())
1194 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
1195 if (auto KindCost = Entry->Cost[CostKind])
1196 return LT.first * *KindCost;
1197
1198 static const CostKindTblEntry AVX1CostTable[] = {
1199 // We don't have to scalarize unsupported ops. We can issue two half-sized
1200 // operations and we only need to extract the upper YMM half.
1201 // Two ops + 1 extract + 1 insert = 4.
1202 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1203 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1204 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1205 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1206 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1207 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1208
1209 { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
1210
1211 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1212 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1213 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1214 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1215
1216 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1217 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1218 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1219 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1220
1221 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1222 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1223 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1224 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1225
1226 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1227 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1228 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1229 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1230 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1231 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1232 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1233 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1234 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1235 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1236
1237 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1238 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1239 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1240 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1241 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1242 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1243 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1244 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1245
1246 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1247 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1248 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1249 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1250 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1251 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1252 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1253 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1254
1255 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1256 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1257 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1258 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1259 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1260 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1261 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1262 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1263
1264 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1265 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1266
1267 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1271 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1272 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1273
1274 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1275 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1276 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1277 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1278 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1279 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1280
1281 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1282 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1283 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1284 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1285 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1286 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1287
1288 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1289 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1290 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1291 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1292 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1293 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1294 };
1295
1296 if (ST->hasAVX())
1297 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1298 if (auto KindCost = Entry->Cost[CostKind])
1299 return LT.first * *KindCost;
1300
1301 static const CostKindTblEntry SSE42CostTable[] = {
1302 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316
1317 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1319 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1321
1322 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1323 };
1324
1325 if (ST->hasSSE42())
1326 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1327 if (auto KindCost = Entry->Cost[CostKind])
1328 return LT.first * *KindCost;
1329
1330 static const CostKindTblEntry SSE41CostTable[] = {
1331 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1332 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1333 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1334
1335 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1336 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1337 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1338 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1339
1340 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1341 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1342 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1343 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1344
1345 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1346 };
1347
1348 if (ST->hasSSE41())
1349 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSSE3CostTable[] = {
1354 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1355 };
1356
1357 if (ST->hasSSSE3())
1358 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1359 if (auto KindCost = Entry->Cost[CostKind])
1360 return LT.first * *KindCost;
1361
1362 static const CostKindTblEntry SSE2CostTable[] = {
1363 // We don't correctly identify costs of casts because they are marked as
1364 // custom.
1365 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1366 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1367 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1368 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1369
1370 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1371 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1372 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1373 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1374
1375 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1376 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1377 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1378 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1379
1380 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1382 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1384
1385 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1387 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1389
1390 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1392 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1394
1395 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1396 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1397
1398 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1399 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1400 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1401 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1402
1403 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1404
1405 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418
1419 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1421 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422
1423 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1425 };
1426
1427 if (ST->hasSSE2())
1428 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1429 if (auto KindCost = Entry->Cost[CostKind])
1430 return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
1490 // in the process we will often end up having to spilling regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyways so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1516 break;
1517 }
1518 }
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1532}
1533
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1538 int Index, VectorType *SubTp,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553 // If all args are constant than this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1570 Kind = TTI::SK_PermuteTwoSrc;
1571
1572 if (Kind == TTI::SK_Broadcast) {
1573 // For Broadcasts we are splatting the first element from the first input
1574 // register, so only need to reference that input and all the output
1575 // registers are the same.
1576 LT.first = 1;
1577
1578 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1579 using namespace PatternMatch;
1580 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1581 (ST->hasAVX2() ||
1582 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1583 return TTI::TCC_Free;
1584 }
1585
1586 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1587 // permutation.
1588 // Attempt to detect a shuffle mask with a single defined element.
1589 bool IsInLaneShuffle = false;
1590 bool IsSingleElementMask = false;
1591 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1592 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1593 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1594 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1595 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1596 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1597 if ((Mask.size() % NumLanes) == 0) {
1598 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1599 return P.value() == PoisonMaskElem ||
1600 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1601 (P.index() / NumEltsPerLane);
1602 });
1603 IsSingleElementMask =
1604 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1605 return M == PoisonMaskElem;
1606 }));
1607 }
1608 }
1609
1610 // Treat <X x bfloat> shuffles as <X x half>.
1611 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1612 LT.second = LT.second.changeVectorElementType(MVT::f16);
1613
1614 // Subvector extractions are free if they start at the beginning of a
1615 // vector and cheap if the subvectors are aligned.
1616 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1617 int NumElts = LT.second.getVectorNumElements();
1618 if ((Index % NumElts) == 0)
1619 return TTI::TCC_Free;
1620 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1621 if (SubLT.second.isVector()) {
1622 int NumSubElts = SubLT.second.getVectorNumElements();
1623 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1624 return SubLT.first;
1625 // Handle some cases for widening legalization. For now we only handle
1626 // cases where the original subvector was naturally aligned and evenly
1627 // fit in its legalized subvector type.
1628 // FIXME: Remove some of the alignment restrictions.
1629 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1630 // vectors.
1631 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1632 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1633 (NumSubElts % OrigSubElts) == 0 &&
1634 LT.second.getVectorElementType() ==
1635 SubLT.second.getVectorElementType() &&
1636 LT.second.getVectorElementType().getSizeInBits() ==
1637 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1638 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1639 "Unexpected number of elements!");
1640 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1641 LT.second.getVectorNumElements());
1642 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1643 SubLT.second.getVectorNumElements());
1644 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1645 InstructionCost ExtractCost =
1647 ExtractIndex, SubTy);
1648
1649 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1650 // if we have SSSE3 we can use pshufb.
1651 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1652 return ExtractCost + 1; // pshufd or pshufb
1653
1654 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1655 "Unexpected vector size");
1656
1657 return ExtractCost + 2; // worst case pshufhw + pshufd
1658 }
1659 }
1660 // If the extract subvector is not optimal, treat it as single op shuffle.
1662 }
1663
1664 // Subvector insertions are cheap if the subvectors are aligned.
1665 // Note that in general, the insertion starting at the beginning of a vector
1666 // isn't free, because we need to preserve the rest of the wide vector,
1667 // but if the destination vector legalizes to the same width as the subvector
1668 // then the insertion will simplify to a (free) register copy.
1669 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1670 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1671 int NumElts = DstLT.second.getVectorNumElements();
1672 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1673 if (SubLT.second.isVector()) {
1674 int NumSubElts = SubLT.second.getVectorNumElements();
1675 bool MatchingTypes =
1676 NumElts == NumSubElts &&
1677 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1678 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1679 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1680 }
1681
1682 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1683 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1684 // v1f32 (legalised to f32) into a v4f32.
1685 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1686 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1687 return 1;
1688
1689 // If the insertion is the lowest subvector then it will be blended
1690 // otherwise treat it like a 2-op shuffle.
1691 Kind =
1692 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1693 }
1694
1695 // Handle some common (illegal) sub-vector types as they are often very cheap
1696 // to shuffle even on targets without PSHUFB.
1697 EVT VT = TLI->getValueType(DL, SrcTy);
1698 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1699 !ST->hasSSSE3()) {
1700 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1701 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1702 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1703 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1704 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1705 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1706
1707 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1708 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1709 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1710 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1711
1712 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1713 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1714 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1715 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1716
1717 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1719 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1720 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1721 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1722
1723 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1725 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1726 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1727 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1728 };
1729
1730 if (ST->hasSSE2())
1731 if (const auto *Entry =
1732 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1733 if (auto KindCost = Entry->Cost[CostKind])
1734 return LT.first * *KindCost;
1735 }
1736
1737 // We are going to permute multiple sources and the result will be in multiple
1738 // destinations. Providing an accurate cost only for splits where the element
1739 // type remains the same.
1740 if (LT.first != 1) {
1741 MVT LegalVT = LT.second;
1742 if (LegalVT.isVector() &&
1743 LegalVT.getVectorElementType().getSizeInBits() ==
1744 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1745 LegalVT.getVectorNumElements() <
1746 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1747 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1748 unsigned LegalVTSize = LegalVT.getStoreSize();
1749 // Number of source vectors after legalization:
1750 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1751 // Number of destination vectors after legalization:
1752 InstructionCost NumOfDests = LT.first;
1753
1754 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1755 LegalVT.getVectorNumElements());
1756
1757 if (!Mask.empty() && NumOfDests.isValid()) {
1758 // Try to perform better estimation of the permutation.
1759 // 1. Split the source/destination vectors into real registers.
1760 // 2. Do the mask analysis to identify which real registers are
1761 // permuted. If more than 1 source registers are used for the
1762 // destination register building, the cost for this destination register
1763 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1764 // source register is used, build mask and calculate the cost as a cost
1765 // of PermuteSingleSrc.
1766 // Also, for the single register permute we try to identify if the
1767 // destination register is just a copy of the source register or the
1768 // copy of the previous destination register (the cost is
1769 // TTI::TCC_Basic). If the source register is just reused, the cost for
1770 // this operation is TTI::TCC_Free.
1771 NumOfDests =
1773 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1774 .first;
1775 unsigned E = NumOfDests.getValue();
1776 unsigned NormalizedVF =
1777 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1778 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1779 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1780 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1781 copy(Mask, NormalizedMask.begin());
1782 unsigned PrevSrcReg = 0;
1783 ArrayRef<int> PrevRegMask;
1786 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1787 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1788 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1789 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1790 // Check if the previous register can be just copied to the next
1791 // one.
1792 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1793 PrevRegMask != RegMask)
1794 Cost +=
1796 SingleOpTy, RegMask, CostKind, 0, nullptr);
1797 else
1798 // Just a copy of previous destination register.
1800 return;
1801 }
1802 if (SrcReg != DestReg &&
1803 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1804 // Just a copy of the source register.
1806 }
1807 PrevSrcReg = SrcReg;
1808 PrevRegMask = RegMask;
1809 },
1810 [this, SingleOpTy, CostKind,
1811 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1812 unsigned /*Unused*/, bool /*Unused*/) {
1814 SingleOpTy, RegMask, CostKind, 0, nullptr);
1815 });
1816 return Cost;
1817 }
1818
1819 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1820 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1821 SingleOpTy, {}, CostKind, 0,
1822 nullptr);
1823 }
1824
1825 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1826 SubTp);
1827 }
1828
1829 // If we're just moving a single element around (probably as an alternative to
1830 // extracting it), we can assume this is cheap.
1831 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1832 return TTI::TCC_Basic;
1833
1834 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1835 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1837 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1838 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1839 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1840 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1841 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1842 };
1843
1844 if (ST->hasVBMI())
1845 if (const auto *Entry =
1846 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1847 if (auto KindCost = Entry->Cost[CostKind])
1848 return LT.first * *KindCost;
1849
1850 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1851 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1852 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1853 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1854
1855 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1857 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1858 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1859 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1860
1861 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1863 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1864 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1865 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1866
1867 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1869 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1870 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1871 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1872
1873 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1874 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1875
1876 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1877 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1878 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1879 };
1880
1881 if (ST->hasBWI())
1882 if (const auto *Entry =
1883 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1884 if (auto KindCost = Entry->Cost[CostKind])
1885 return LT.first * *KindCost;
1886
1887 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1888 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1889 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1890 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1891 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1892 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1893 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1894 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1895 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1896 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1898 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1899 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1900 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1901 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1902
1903 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1904 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1905 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1906 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1907 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1908 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1909 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1910
1911 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1917 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1918 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1919 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1920 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1921 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1922
1923 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1924 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1925 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1926 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1927 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1929 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1930 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1932 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1933 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1934 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1935 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1936
1937 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1938 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1939 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1940 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1941 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1942 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1943 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1944 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1945 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1946 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1947 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1948 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1949
1950 // FIXME: This just applies the type legalization cost rules above
1951 // assuming these completely split.
1952 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1953 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1954 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1955 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1956 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1957 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1958
1959 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1960 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1961 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1962 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1963 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1964 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1965 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1966 };
1967
1968 if (ST->hasAVX512())
1969 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1970 if (auto KindCost = Entry->Cost[CostKind])
1971 return LT.first * *KindCost;
1972
1973 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1974 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1975 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1976 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1977
1978 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
1979 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
1980
1981 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1982 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1983 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1984 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1985 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1986 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1987 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1988 };
1989
1990 if (IsInLaneShuffle && ST->hasAVX2())
1991 if (const auto *Entry =
1992 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1993 if (auto KindCost = Entry->Cost[CostKind])
1994 return LT.first * *KindCost;
1995
1996 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1997 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1998 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1999 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2000 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2001 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2002 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2003 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2004 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2005 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2006 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2007
2008 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2009 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2010 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2011 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2012 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2013 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2014 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2015
2016 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2017 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2018 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2019
2020 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2021 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2022 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2023 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2024 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2025
2026 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2027 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2028 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2029 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2030 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2031 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2032 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2033
2034 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2035 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2036 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2037 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2038 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2039 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2040 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2041 };
2042
2043 if (ST->hasAVX2())
2044 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2045 if (auto KindCost = Entry->Cost[CostKind])
2046 return LT.first * *KindCost;
2047
2048 static const CostKindTblEntry XOPShuffleTbl[] = {
2049 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2050 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2051 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2052 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2053 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2054 // + vinsertf128
2055 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2056 // + vinsertf128
2057
2058 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2059 // + vinsertf128
2060
2061 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2062 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2063 // + vinsertf128
2064 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2065 };
2066
2067 if (ST->hasXOP())
2068 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2069 if (auto KindCost = Entry->Cost[CostKind])
2070 return LT.first * *KindCost;
2071
2072 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2073 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2074 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2075 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2076 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2077
2078 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2079 // + vpor + vinsertf128
2080 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2081 // + vpor + vinsertf128
2082 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2083 // + vpor + vinsertf128
2084
2085 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2086 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2087
2088 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2089 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2090 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2091 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2092 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2093 // + 2*vpor + vinsertf128
2094 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2095 // + 2*vpor + vinsertf128
2096 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2097 // + 2*vpor + vinsertf128
2098 };
2099
2100 if (IsInLaneShuffle && ST->hasAVX())
2101 if (const auto *Entry =
2102 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2103 if (auto KindCost = Entry->Cost[CostKind])
2104 return LT.first * *KindCost;
2105
2106 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2107 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2108 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2109 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2110 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2111 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2112 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2113 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2114
2115 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2116 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2117 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2118 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2119 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2120 // + vinsertf128
2121 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2122 // + vinsertf128
2123 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2124 // + vinsertf128
2125
2126 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2127 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2128 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2129 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2130 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2131 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2132 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2133
2134 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2135 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2136 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2137 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2138 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2139 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2140 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2141
2142 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2143 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2144 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2145 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2146 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2147 // + 2*por + vinsertf128
2148 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2149 // + 2*por + vinsertf128
2150 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2151 // + 2*por + vinsertf128
2152
2153 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2154 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2155 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2156 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2157 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2158 // + 4*por + vinsertf128
2159 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2160 // + 4*por + vinsertf128
2161 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2162 // + 4*por + vinsertf128
2163 };
2164
2165 if (ST->hasAVX())
2166 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2167 if (auto KindCost = Entry->Cost[CostKind])
2168 return LT.first * *KindCost;
2169
2170 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2171 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2172 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2173 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2174 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2175 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2176 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2177 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2178 };
2179
2180 if (ST->hasSSE41())
2181 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2182 if (auto KindCost = Entry->Cost[CostKind])
2183 return LT.first * *KindCost;
2184
2185 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2186 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2187 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2188 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2189
2190 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2191 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2192 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2193
2194 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2195 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2196 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2197
2198 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2199 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2200 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2201 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2202 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2203
2204 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2205 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2206 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2207
2208 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2209 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2210 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2211 };
2212
2213 if (ST->hasSSSE3())
2214 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2215 if (auto KindCost = Entry->Cost[CostKind])
2216 return LT.first * *KindCost;
2217
2218 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2219 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2220 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2221 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2222 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2223 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2224 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2225
2226 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2227 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2228 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2229 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2230 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2231 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2232 // + 2*pshufd + 2*unpck + packus
2233
2234 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2235 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2236 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2237 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2238 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2239 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2240
2241 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2242 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2243 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2244 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2245 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2246 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2247
2248 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2249 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2250 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2251 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2252 // + pshufd/unpck
2253 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2254 // + pshufd/unpck
2255 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2256 // + 2*pshufd + 2*unpck + 2*packus
2257
2258 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2259 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2260 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2261 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2262 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2263 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2264 };
2265
2266 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2267 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2268 };
2269
2270 if (ST->hasSSE2()) {
2271 bool IsLoad =
2272 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2273 if (ST->hasSSE3() && IsLoad)
2274 if (const auto *Entry =
2275 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2276 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2277 LT.second.getVectorElementCount()) &&
2278 "Table entry missing from isLegalBroadcastLoad()");
2279 return LT.first * Entry->Cost;
2280 }
2281
2282 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2283 if (auto KindCost = Entry->Cost[CostKind])
2284 return LT.first * *KindCost;
2285 }
2286
2287 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2288 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2289 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2290 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2291 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2292 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2293 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2294 };
2295
2296 if (ST->hasSSE1()) {
2297 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2298 // SHUFPS: both pairs must come from the same source register.
2299 auto MatchSHUFPS = [](int X, int Y) {
2300 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2301 };
2302 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2303 return 1;
2304 }
2305 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2306 if (auto KindCost = Entry->Cost[CostKind])
2307 return LT.first * *KindCost;
2308 }
2309
2310 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2311 SubTp);
2312}
2313
2315 Type *Src,
2318 const Instruction *I) const {
2319 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2320 assert(ISD && "Invalid opcode");
2321
2322 // The cost tables include both specific, custom (non-legal) src/dst type
2323 // conversions and generic, legalized types. We test for customs first, before
2324 // falling back to legalization.
2325 // FIXME: Need a better design of the cost table to handle non-simple types of
2326 // potential massive combinations (elem_num x src_type x dst_type).
2327 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2328 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2329 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2330
2331 // Mask sign extend has an instruction.
2332 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2342 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2343 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2344 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2346 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2348 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2349
2350 // Mask zero extend is a sext + shift.
2351 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2360 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2361 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2362 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2363 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2364 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2365 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2366 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2367 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2368
2369 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2378 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2379 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2380 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2381 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2382 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2383 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2384 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2385 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2386
2387 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2388 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2389 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2390 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2391 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2392 };
2393
2394 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2395 // Mask sign extend has an instruction.
2396 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2397 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2398 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2399 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2400 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2401 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2402 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2403 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2404
2405 // Mask zero extend is a sext + shift.
2406 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2407 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2408 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2409 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2410 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2411 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2412 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2413 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2414
2415 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2417 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2418 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2419 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2420 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2421 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2422 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2423
2424 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2425 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2426
2427 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2428 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2429
2430 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2431 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2432
2433 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2434 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2435 };
2436
2437 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2438 // 256-bit wide vectors.
2439
2440 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2441 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2442 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2443 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2444 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2445 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2446 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2447 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2448
2449 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2453 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2454 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2456 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2457 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2458 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2459 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2460 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2461 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2462 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2463 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2464 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2465 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2466 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2467 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2468 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2469 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2470 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2471 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2472 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2473 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2474 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2475 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2476 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2477 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2478 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2479 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2480 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2481 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2482 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2483
2484 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2485 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2486 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2487
2488 // Sign extend is zmm vpternlogd+vptruncdb.
2489 // Zero extend is zmm broadcast load+vptruncdw.
2490 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2491 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2492 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2493 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2494 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2498
2499 // Sign extend is zmm vpternlogd+vptruncdw.
2500 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2501 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2502 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2503 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2504 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2505 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2506 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2507 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2508 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2509
2510 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2511 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2512 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2513 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2514 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2517 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2518 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2519 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2520
2521 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2522 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2525
2526 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2527 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2528 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2529 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2530 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2531 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2536
2537 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2538 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2539
2540 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2541 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2542 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2543 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2544 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2545 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2546 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2547 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2548
2549 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2550 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2551 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2552 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2553 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2554 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2555 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2556 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2557 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2558 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2559
2560 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2562 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2563 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2564 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2565 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2566 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2567 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2568 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2569 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2570 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2571
2572 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2573 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2574 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2575 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2576 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2577 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2578 };
2579
2580 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2581 // Mask sign extend has an instruction.
2582 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2590 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2591 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2592 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2593 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2594 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2595 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2596 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2597 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2598 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2599
2600 // Mask zero extend is a sext + shift.
2601 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2612 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2613 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2614 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2615 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2616 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2617 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2618
2619 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2627 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2628 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2629 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2630 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2631 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2632 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2633 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2634 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2635 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2636
2637 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2638 };
2639
2640 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2641 // Mask sign extend has an instruction.
2642 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2643 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2644 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2645 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2646 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2647 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2649 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2650
2651 // Mask zero extend is a sext + shift.
2652 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2653 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2655 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2657 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2659 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2660
2661 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2662 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2663 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2664 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2665 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2666 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2667 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2668 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2669
2670 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2671 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2672 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2673 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2674
2675 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2676 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2677 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2678 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2679
2680 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2681 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2682 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2683 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2684
2685 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2686 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2687 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2688 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2689 };
2690
2691 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2692 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2693 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2694 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2695 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2696 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2697 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2698 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2699 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2700 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2701 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2702 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2703 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2704 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2705 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2706 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2707 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2708 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2709 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2710
2711 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2712 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2713 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2714 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2715 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2716 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2717 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2718 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2719 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2721
2722 // sign extend is vpcmpeq+maskedmove+vpmovdw
2723 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2724 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2725 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2726 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2727 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2728 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2729 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2730 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2731 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2732
2733 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2734 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2735 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2736 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2737 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2738 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2739 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2740 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2741
2742 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2743 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2744 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2745 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2746
2747 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2748 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2749 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2750 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2751 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2752 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2753 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2754 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2755 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2756 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2757 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2758 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2759
2760 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2761 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2762 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2763 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2764
2765 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2766 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2767 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2769 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2771 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2772 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2773 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2774 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2775 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2776 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2777 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2778
2779 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2780 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2781 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2782
2783 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2784 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2785 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2786 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2787 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2788 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2789 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2790 };
2791
2792 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2793 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2799
2800 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2801 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2802 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2803 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2804 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2805 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2806 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2807 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2808 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2809 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2810 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2811 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2812 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2813 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2814
2815 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2816
2817 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2820 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2822 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2823 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2824 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2825 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2826 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2827 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2828 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2829
2830 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2831 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2832
2833 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2835 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2836 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2837
2838 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2839 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2840 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2841 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2842 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2843 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2844 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2845 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2846
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2850 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2851 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2852 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2853 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2854
2855 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2864 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2865 };
2866
2867 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2868 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2869 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2870 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2871 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2872 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2873 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2874
2875 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2876 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2877 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2878 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2879 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2880 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2881 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2882 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2883 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2884 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2885 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2886 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2887
2888 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2889 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2890 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2891 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2892 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2893
2894 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2895 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2896 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2897 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2898 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2899 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2900 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2901 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2902
2903 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2906 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2907 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2908 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2909 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2910 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2911 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2912 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2913 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2914 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2915
2916 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2920 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2921 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2922 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2924 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2925 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2926 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2927 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2928 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2929 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2930 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2931 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2932 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2933
2934 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2935 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2936 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2937 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2938 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2939 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2940 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2941 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2944 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2945
2946 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2947 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2948 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2950 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2951 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2952 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2953 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2954 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2955 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2956 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2957 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2958 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2959
2960 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2961 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2962 };
2963
2964 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2965 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2966 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2967 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2968 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2969 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2970 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2971 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2972 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2973 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2974 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2975 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2976 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2977
2978 // These truncates end up widening elements.
2979 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2980 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2981 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2982
2983 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2984 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2985 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2986
2987 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2991 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2992 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2993 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2994 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2995 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2996 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2997 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2998
2999 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
3005 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
3006 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3007 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3008 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3009 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3010 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3011 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3012 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3013
3014 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3016 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3017 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3018 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3019 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3020 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3021 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3022 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3023 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3024
3025 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3028 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3029 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3030 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3031 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3032 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3033 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3034 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3035 };
3036
3037 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3038 // These are somewhat magic numbers justified by comparing the
3039 // output of llvm-mca for our various supported scheduler models
3040 // and basing it off the worst case scenario.
3041 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3046 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3047 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3048 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3049 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3050 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3051 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3052 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3053
3054 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3060 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3061 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3062 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3063 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3064 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3065 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3066 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3067
3068 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3071 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3072 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3073 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3074 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3075 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3076 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3077 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3078
3079 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3082 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3083 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3084 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3085 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3086 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3087 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3088 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3089
3090 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3091 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3092 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3093 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3094 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3095 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3096 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3097 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3098 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3099 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3100 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3101 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3102
3103 // These truncates are really widening elements.
3104 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3105 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3106 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3107 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3108 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3109 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3110
3111 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3112 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3113 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3114 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3115 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3116 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3117 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3118 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3119 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3120 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3121 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3122 };
3123
3124 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3125 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3126 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3127 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3128 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3129 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3130 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3131 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3132 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3133 };
3134
3135 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3136 EVT SrcTy = TLI->getValueType(DL, Src);
3137 EVT DstTy = TLI->getValueType(DL, Dst);
3138
3139 // The function getSimpleVT only handles simple value types.
3140 if (SrcTy.isSimple() && DstTy.isSimple()) {
3141 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3142 MVT SimpleDstTy = DstTy.getSimpleVT();
3143
3144 if (ST->useAVX512Regs()) {
3145 if (ST->hasBWI())
3146 if (const auto *Entry = ConvertCostTableLookup(
3147 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3148 if (auto KindCost = Entry->Cost[CostKind])
3149 return *KindCost;
3150
3151 if (ST->hasDQI())
3152 if (const auto *Entry = ConvertCostTableLookup(
3153 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3154 if (auto KindCost = Entry->Cost[CostKind])
3155 return *KindCost;
3156
3157 if (ST->hasAVX512())
3158 if (const auto *Entry = ConvertCostTableLookup(
3159 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3160 if (auto KindCost = Entry->Cost[CostKind])
3161 return *KindCost;
3162 }
3163
3164 if (ST->hasBWI())
3165 if (const auto *Entry = ConvertCostTableLookup(
3166 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3167 if (auto KindCost = Entry->Cost[CostKind])
3168 return *KindCost;
3169
3170 if (ST->hasDQI())
3171 if (const auto *Entry = ConvertCostTableLookup(
3172 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3173 if (auto KindCost = Entry->Cost[CostKind])
3174 return *KindCost;
3175
3176 if (ST->hasAVX512())
3177 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3178 SimpleDstTy, SimpleSrcTy))
3179 if (auto KindCost = Entry->Cost[CostKind])
3180 return *KindCost;
3181
3182 if (ST->hasAVX2()) {
3183 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3184 SimpleDstTy, SimpleSrcTy))
3185 if (auto KindCost = Entry->Cost[CostKind])
3186 return *KindCost;
3187 }
3188
3189 if (ST->hasAVX()) {
3190 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3191 SimpleDstTy, SimpleSrcTy))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return *KindCost;
3194 }
3195
3196 if (ST->hasF16C()) {
3197 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3198 SimpleDstTy, SimpleSrcTy))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return *KindCost;
3201 }
3202
3203 if (ST->hasSSE41()) {
3204 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3205 SimpleDstTy, SimpleSrcTy))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return *KindCost;
3208 }
3209
3210 if (ST->hasSSE2()) {
3211 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3212 SimpleDstTy, SimpleSrcTy))
3213 if (auto KindCost = Entry->Cost[CostKind])
3214 return *KindCost;
3215 }
3216
3217 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3218 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3219 // fp16 conversions not covered by any table entries require a libcall.
3220 // Return a large (arbitrary) number to model this.
3221 return InstructionCost(64);
3222 }
3223 }
3224
3225 // Fall back to legalized types.
3226 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3227 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3228
3229 // If we're truncating to the same legalized type - just assume its free.
3230 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3231 return TTI::TCC_Free;
3232
3233 if (ST->useAVX512Regs()) {
3234 if (ST->hasBWI())
3235 if (const auto *Entry = ConvertCostTableLookup(
3236 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3237 if (auto KindCost = Entry->Cost[CostKind])
3238 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3239
3240 if (ST->hasDQI())
3241 if (const auto *Entry = ConvertCostTableLookup(
3242 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3243 if (auto KindCost = Entry->Cost[CostKind])
3244 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3245
3246 if (ST->hasAVX512())
3247 if (const auto *Entry = ConvertCostTableLookup(
3248 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3251 }
3252
3253 if (ST->hasBWI())
3254 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3255 LTDest.second, LTSrc.second))
3256 if (auto KindCost = Entry->Cost[CostKind])
3257 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3258
3259 if (ST->hasDQI())
3260 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3261 LTDest.second, LTSrc.second))
3262 if (auto KindCost = Entry->Cost[CostKind])
3263 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3264
3265 if (ST->hasAVX512())
3266 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3267 LTDest.second, LTSrc.second))
3268 if (auto KindCost = Entry->Cost[CostKind])
3269 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3270
3271 if (ST->hasAVX2())
3272 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3273 LTDest.second, LTSrc.second))
3274 if (auto KindCost = Entry->Cost[CostKind])
3275 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3276
3277 if (ST->hasAVX())
3278 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3279 LTDest.second, LTSrc.second))
3280 if (auto KindCost = Entry->Cost[CostKind])
3281 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3282
3283 if (ST->hasF16C()) {
3284 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3285 LTDest.second, LTSrc.second))
3286 if (auto KindCost = Entry->Cost[CostKind])
3287 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3288 }
3289
3290 if (ST->hasSSE41())
3291 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3292 LTDest.second, LTSrc.second))
3293 if (auto KindCost = Entry->Cost[CostKind])
3294 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3295
3296 if (ST->hasSSE2())
3297 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3298 LTDest.second, LTSrc.second))
3299 if (auto KindCost = Entry->Cost[CostKind])
3300 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3301
3302 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3303 // sitofp.
3304 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3305 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3306 Type *ExtSrc = Src->getWithNewBitWidth(32);
3307 unsigned ExtOpc =
3308 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3309
3310 // For scalar loads the extend would be free.
3311 InstructionCost ExtCost = 0;
3312 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3313 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3314
3315 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3317 }
3318
3319 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3320 // i32.
3321 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3322 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3323 Type *TruncDst = Dst->getWithNewBitWidth(32);
3324 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3325 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3327 }
3328
3329 // TODO: Allow non-throughput costs that aren't binary.
3330 auto AdjustCost = [&CostKind](InstructionCost Cost,
3333 return Cost == 0 ? 0 : N;
3334 return Cost * N;
3335 };
3336 return AdjustCost(
3337 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3338}
3339
3341 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3343 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3344 // Early out if this type isn't scalar/vector integer/float.
3345 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3346 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3347 Op1Info, Op2Info, I);
3348
3349 // Legalize the type.
3350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3351
3352 MVT MTy = LT.second;
3353
3354 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3355 assert(ISD && "Invalid opcode");
3356
3357 InstructionCost ExtraCost = 0;
3358 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3359 // Some vector comparison predicates cost extra instructions.
3360 // TODO: Adjust ExtraCost based on CostKind?
3361 // TODO: Should we invert this and assume worst case cmp costs
3362 // and reduce for particular predicates?
3363 if (MTy.isVector() &&
3364 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3365 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3366 ST->hasBWI())) {
3367 // Fallback to I if a specific predicate wasn't specified.
3368 CmpInst::Predicate Pred = VecPred;
3369 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3371 Pred = cast<CmpInst>(I)->getPredicate();
3372
3373 bool CmpWithConstant = false;
3374 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3375 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3376
3377 switch (Pred) {
3379 // xor(cmpeq(x,y),-1)
3380 ExtraCost = CmpWithConstant ? 0 : 1;
3381 break;
3384 // xor(cmpgt(x,y),-1)
3385 ExtraCost = CmpWithConstant ? 0 : 1;
3386 break;
3389 // cmpgt(xor(x,signbit),xor(y,signbit))
3390 // xor(cmpeq(pmaxu(x,y),x),-1)
3391 ExtraCost = CmpWithConstant ? 1 : 2;
3392 break;
3395 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3396 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3397 // cmpeq(psubus(x,y),0)
3398 // cmpeq(pminu(x,y),x)
3399 ExtraCost = 1;
3400 } else {
3401 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3402 ExtraCost = CmpWithConstant ? 2 : 3;
3403 }
3404 break;
3407 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3408 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3409 if (CondTy && !ST->hasAVX())
3410 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3412 Op1Info, Op2Info) +
3413 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3415 Op1Info, Op2Info) +
3416 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3417
3418 break;
3421 // Assume worst case scenario and add the maximum extra cost.
3422 ExtraCost = 3;
3423 break;
3424 default:
3425 break;
3426 }
3427 }
3428 }
3429
3430 static const CostKindTblEntry SLMCostTbl[] = {
3431 // slm pcmpeq/pcmpgt throughput is 2
3432 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3433 // slm pblendvb/blendvpd/blendvps throughput is 4
3434 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3435 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3436 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3437 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3438 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3439 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3440 };
3441
3442 static const CostKindTblEntry AVX512BWCostTbl[] = {
3443 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3444 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3445 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3446 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3447
3448 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3449 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3450 };
3451
3452 static const CostKindTblEntry AVX512CostTbl[] = {
3453 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3454 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3455 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3456 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3457
3458 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3459 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3460 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3461 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3462 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3463 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3464 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3465
3466 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3467 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3468 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3469 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3470 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3471 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3472 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3473 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3474 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3475 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3476 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3477 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3478 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3479 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3480
3481 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3482 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3483 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3484 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3485 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3486 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3487 };
3488
3489 static const CostKindTblEntry AVX2CostTbl[] = {
3490 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3491 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3492 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3493 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3494 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3495 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3496
3497 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3498 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3499 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3500 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3501
3502 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3503 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3504 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3505 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3506 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3507 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3508 };
3509
3510 static const CostKindTblEntry XOPCostTbl[] = {
3511 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3512 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3513 };
3514
3515 static const CostKindTblEntry AVX1CostTbl[] = {
3516 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3517 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3518 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3519 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3520 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3521 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3522
3523 // AVX1 does not support 8-wide integer compare.
3524 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3525 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3526 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3527 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3528
3529 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3530 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3531 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3532 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3533 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3534 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3535 };
3536
3537 static const CostKindTblEntry SSE42CostTbl[] = {
3538 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3539 };
3540
3541 static const CostKindTblEntry SSE41CostTbl[] = {
3542 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3543 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3544
3545 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3546 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3547 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3548 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3549 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3550 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3551 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3552 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3553 };
3554
3555 static const CostKindTblEntry SSE2CostTbl[] = {
3556 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3557 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3558
3559 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3560 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3561 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3562 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3563
3564 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3565 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3566 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3567 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3568 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3569 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3570 };
3571
3572 static const CostKindTblEntry SSE1CostTbl[] = {
3573 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3574 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3575
3576 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3577 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3578 };
3579
3580 if (ST->useSLMArithCosts())
3581 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3582 if (auto KindCost = Entry->Cost[CostKind])
3583 return LT.first * (ExtraCost + *KindCost);
3584
3585 if (ST->hasBWI())
3586 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3587 if (auto KindCost = Entry->Cost[CostKind])
3588 return LT.first * (ExtraCost + *KindCost);
3589
3590 if (ST->hasAVX512())
3591 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3592 if (auto KindCost = Entry->Cost[CostKind])
3593 return LT.first * (ExtraCost + *KindCost);
3594
3595 if (ST->hasAVX2())
3596 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3597 if (auto KindCost = Entry->Cost[CostKind])
3598 return LT.first * (ExtraCost + *KindCost);
3599
3600 if (ST->hasXOP())
3601 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3602 if (auto KindCost = Entry->Cost[CostKind])
3603 return LT.first * (ExtraCost + *KindCost);
3604
3605 if (ST->hasAVX())
3606 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3607 if (auto KindCost = Entry->Cost[CostKind])
3608 return LT.first * (ExtraCost + *KindCost);
3609
3610 if (ST->hasSSE42())
3611 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3612 if (auto KindCost = Entry->Cost[CostKind])
3613 return LT.first * (ExtraCost + *KindCost);
3614
3615 if (ST->hasSSE41())
3616 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3617 if (auto KindCost = Entry->Cost[CostKind])
3618 return LT.first * (ExtraCost + *KindCost);
3619
3620 if (ST->hasSSE2())
3621 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3622 if (auto KindCost = Entry->Cost[CostKind])
3623 return LT.first * (ExtraCost + *KindCost);
3624
3625 if (ST->hasSSE1())
3626 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3627 if (auto KindCost = Entry->Cost[CostKind])
3628 return LT.first * (ExtraCost + *KindCost);
3629
3630 // Assume a 3cy latency for fp select ops.
3631 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3632 if (ValTy->getScalarType()->isFloatingPointTy())
3633 return 3;
3634
3635 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3636 Op1Info, Op2Info, I);
3637}
3638
3640
3644 // Costs should match the codegen from:
3645 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3646 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3647 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3648 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3649 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3650
3651 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3652 // specialized in these tables yet.
3653 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3654 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3655 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3656 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3657 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3658 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3659 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3660 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3661 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3662 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3663 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3664 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3665 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3666 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3667 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3668 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3669 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3670 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3671 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3672 };
3673 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3674 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3675 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3676 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3677 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3678 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3679 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3680 };
3681 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3682 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3683 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3684 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3685 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3686 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3687 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3688 };
3689 static const CostKindTblEntry AVX512CDCostTbl[] = {
3690 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3691 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3692 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3693 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3694 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3695 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3696 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3697 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3698 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3699 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3700 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3701 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3702
3703 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3704 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3705 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3706 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3707 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3708 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3709 };
3710 static const CostKindTblEntry AVX512BWCostTbl[] = {
3711 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3712 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3713 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3714 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3715 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3716 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3717 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3718 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3719 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3720 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3721 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3722 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3723 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3724 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3725 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3726 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3727 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3728 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3729 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3730 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3731 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3732 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3733 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3734 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3735 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3736 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3737 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3738 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3739 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3740 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3741 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3742 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3743 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3744 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3745 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3746 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3747 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3748 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3749 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3750 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3751 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3752 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3753 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3754 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3755 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3756 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3757 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3758 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3759 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3760 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3761 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3762 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3763 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3764 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3765 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3766 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3767 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3768 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3769 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3770 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3771 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3772 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3773 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3774 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3775 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3776 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3777 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3778 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3779 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3780 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3781 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3782 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3783 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3784 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3785 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3786 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3787 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3788 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3789 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3790 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3791 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3792 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3793 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3794 };
3795 static const CostKindTblEntry AVX512CostTbl[] = {
3796 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3797 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3798 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3799 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3800 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3801 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3802 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3803 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3804 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3805 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3806 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3807 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3808 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3809 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3810 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3811 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3812 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3813 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3814 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3815 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3816 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3817 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3818 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3819 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3820 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3821 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3822 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3823 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3824 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3825 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3826 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3827 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3828 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3829 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3830 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3831 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3832 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3833 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3834 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3835 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3836 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3837 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3838 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3839 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3840 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3841 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3842 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3843 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3844 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3845 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3846 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3847 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3848 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3849 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3850 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3851 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3852 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3853 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3854 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3855 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3856 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3857 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3858 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3859 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3860 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3861 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3862 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3863 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3864 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3865 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3866 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3867 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3868 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3869 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3870 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3871 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3872 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3873 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3874 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3875 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3876 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3877 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3878 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3879 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3880 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3881 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3882 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3883 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3884 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3885 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3886 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3887 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3888 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3889 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3890 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3891 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3892 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3893 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3894 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3895 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3896 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3897 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3898 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3899 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3900 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3901 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3902 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3903 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3904 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3905 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3906 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3907 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3908 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3909 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3910 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3911 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3912 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3913 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3914 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3915 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3916 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3917 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3918 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3919 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3920 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3921 };
// AMD XOP cost table (Bulldozer-family CPUs). Each entry maps an ISD/X86ISD
// opcode + legal MVT to four costs, presumably {recip-throughput, latency,
// code-size, size-and-latency} per the CostKindTblEntry layout -- TODO confirm
// against the CostKindTblEntry declaration.
3922 static const CostKindTblEntry XOPCostTbl[] = {
3923 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3924 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3925 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3926 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3927 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3928 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3929 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3930 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3931 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3932 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3933 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3934 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3935 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3936 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3937 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3938 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3939 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3940 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3941 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3942 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3943 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
// ROTR needs the extra negate of the rotate amount (see VPROT comment above),
// hence the higher code-size/size-latency costs than ROTL.
3944 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3945 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3946 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3947 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3948 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3949 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3950 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3951 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
// Rotate by uniform constant amount (immediate form).
3952 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3953 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3954 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3955 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3956 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3957 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3958 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3959 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3960 };
// AVX2 cost table (Haswell / Ryzen class, per the file header convention of
// costing against the first CPU with the feature). Four costs per entry,
// presumably {recip-throughput, latency, code-size, size-and-latency} --
// TODO confirm against the CostKindTblEntry declaration. 256-bit integer ops
// are native here (unlike AVX1), so no 2x128-bit split penalties.
3961 static const CostKindTblEntry AVX2CostTbl[] = {
3962 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3963 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3964 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3965 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3966 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3967 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3968 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3969 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3970 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3971 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3972 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3973 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3974 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3975 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3976 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3977 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3978 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3979 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3980 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3981 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3982 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3983 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3984 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3985 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3986 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3987 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3988 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3989 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3990 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3991 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3992 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3993 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3994 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3995 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3996 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3997 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3998 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3999 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4000 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4001 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4002 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4003 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4004 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4005 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4006 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4007 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4008 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4009 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4010 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4011 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4012 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4013 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4014 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4015 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4016 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4017 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4018 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4019 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4020 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4021 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4022 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4023 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4024 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4025 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4026 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4027 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4028 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4029 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4030 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4031 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4032 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4033 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4034 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4035 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4036 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4037 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4038 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4039 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4040 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4041 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4042 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4043 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4044 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4045 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4046 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4047 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4048 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4049 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4050 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4051 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4052 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4053 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4054 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4055 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4056 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4057 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4058 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4059 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4060 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4061 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4062 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4063 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4064 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4065 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4066 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4067 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4068 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4069 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4070 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4071 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4072 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4073 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4074 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4075 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4076 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4077 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4078 };
4079 static const CostKindTblEntry AVX1CostTbl[] = {
4080 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4081 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4082 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4083 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4084 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4085 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4086 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4087 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4088 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4089 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4090 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4091 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4092 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4093 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4094 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4095 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4096 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4097 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4098 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4099 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4100 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4101 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4102 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4103 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4104 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4105 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4106 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4107 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4108 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4109 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4110 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4111 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4112 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4113 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4114 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4116 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4118 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4120 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4121 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4122 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4123 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4125 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4127 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4129 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4131 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4132 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4133 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4134 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4135 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4136 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4137 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4138 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4139 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4140 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4141 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4142 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4143 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4144 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4145 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4146 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4150 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4151 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4155 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4157 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4158 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4159 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4160 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4161 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4162 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4163 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4164 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4165 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4166 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4167 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4168 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4169 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4170 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4171 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4172 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4173 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4174 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4176 { ISD::USUBSAT, MVT::v8i32, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4177 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4178 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4180 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4181 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4182 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4183 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4184 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4185 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4186 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4187 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4188 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4189 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4190 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4191 };
// GFNI cost table: bit-reverse (and byte rotates) via a single GF2P8AFFINEQB,
// plus fixups for non-i8 element widths. Four costs per entry, presumably
// {recip-throughput, latency, code-size, size-and-latency} -- TODO confirm
// against the CostKindTblEntry declaration.
4192 static const CostKindTblEntry GFNICostTbl[] = {
4193 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4194 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4195 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4196 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4197 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4198 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4199 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4200 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4201 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4202 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4203 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4204 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4205 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4206 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4207 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4208 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4209 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4210 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4211 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4212 };
// Goldmont-specific FSQRT costs (slower than the generic SSE42 numbers).
4213 static const CostKindTblEntry GLMCostTbl[] = {
4214 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4215 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4216 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4217 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4218 };
// Silvermont-specific overrides (slow PSHUFB-style BSWAP and FSQRT).
4219 static const CostKindTblEntry SLMCostTbl[] = {
4220 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4221 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4222 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4223 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4224 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4225 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4226 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4227 };
// SSE4.2 cost table (Nehalem / Silvermont class per the file header). Four
// costs per entry, presumably {recip-throughput, latency, code-size,
// size-and-latency} -- TODO confirm against the CostKindTblEntry declaration.
4228 static const CostKindTblEntry SSE42CostTbl[] = {
4229 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4230 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4231 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4232 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4233 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4234 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4235 };
// SSE4.1 cost table (Penryn class): PMINS*/PMAXS*/PMINU*/PMAXU* give cheap
// min/max for the element widths SSE2 lacked.
4236 static const CostKindTblEntry SSE41CostTbl[] = {
4237 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4238 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4239 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4240 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4241 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4242 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4243 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4244 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4245 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4246 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4247 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4248 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4249 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4250 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4251 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4252 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4253 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4254 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4255 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4256 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4257 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4258 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4259 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4260 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4261 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4262 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4263 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4264 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4265 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4266 };
// SSSE3 cost table: PSHUFB enables table-driven bit manipulation (bitreverse,
// popcount-style sequences) and single-instruction BSWAP shuffles.
4267 static const CostKindTblEntry SSSE3CostTbl[] = {
4268 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4269 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4270 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4271 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4272 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4273 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4274 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4275 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4276 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4277 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4278 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4279 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4280 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4281 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4282 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4283 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4284 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4285 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4286 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4287 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4288 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4289 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4290 };
// SSE2 baseline cost table: no PSHUFB, so bit-manipulation sequences are much
// longer than SSSE3+. Four costs per entry, presumably {recip-throughput,
// latency, code-size, size-and-latency} -- TODO confirm against the
// CostKindTblEntry declaration.
4291 static const CostKindTblEntry SSE2CostTbl[] = {
4292 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4293 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4294 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4295 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4296 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4297 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4298 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4299 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4300 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4301 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4302 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4303 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4304 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4305 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4306 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4307 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4308 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4309 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4310 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4311 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4312 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4313 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4314 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4315 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4316 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4317 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4318 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4319 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4320 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4321 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4322 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4323 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4324 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4325 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4326 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4327 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4328 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4329 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4330 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4331 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4332 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4333 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4334 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4335 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4336 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4337 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4338 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4339 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4340 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4341 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4342 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4343 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4344 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4345 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4346 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4347 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4348 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4349 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4350 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4351 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4352 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4353 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4354 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4355 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4356 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4357 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4358 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4359 };
// SSE1 cost table (f32 only -- SSE1 has no double or integer vector support).
4360 static const CostKindTblEntry SSE1CostTbl[] = {
4361 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4362 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4363 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4364 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4365 };
// Scalar bit-count tables, split by feature and by pointer width:
// *64CostTbl entries apply only on 64-bit targets (i64 is legal there),
// *32CostTbl entries apply on both 32- and 64-bit targets. The i16/i8 rows
// cost slightly more, presumably due to a zero-extend of the narrow operand
// -- TODO confirm.
4366 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4367 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4368 };
4369 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4370 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4371 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4372 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4373 };
4374 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4375 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4376 };
4377 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4378 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4379 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4380 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4381 };
4382 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4383 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4384 };
4385 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4386 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4387 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4388 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4389 };
// Base scalar GPR cost tables -- fallbacks when no feature-specific table
// above matches. X64CostTbl covers i64 ops (64-bit targets only); X86CostTbl
// covers i32/i16/i8 on any target. Four costs per entry, presumably
// {recip-throughput, latency, code-size, size-and-latency} -- TODO confirm
// against the CostKindTblEntry declaration.
4390 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4391 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4392 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4393 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4394 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4395 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4396 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4397 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4398 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4399 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4400 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4401 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4402 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4403 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4404 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4405 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4406 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4407 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4408 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4409 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4410 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4411 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4412 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4413 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4414 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4415 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4416 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4417 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4418 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4419 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4420 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4421 };
4422 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4423 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4424 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4425 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4426 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4427 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4428 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4429 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4430 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4431 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4432 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4433 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4434 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4435 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4436 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4437 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4438 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4439 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4440 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4441 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4442 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4443 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4444 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4445 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4446 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4447 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4448 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4449 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4450 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4451 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4452 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4453 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4454 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4455 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4456 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4457 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4458 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4459 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4460 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4461 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4462 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4463 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4464 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4465 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4466 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4467 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4468 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4469 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4470 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4471 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4472 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4473 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4474 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4475 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4476 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4477 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4478 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4479 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4480 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4481 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4482 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4483 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4484 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4485 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4486 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4487 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4488 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4489 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4490 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4491 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4492 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4493 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4494 };
4495
4496 Type *RetTy = ICA.getReturnType();
4497 Type *OpTy = RetTy;
4498 Intrinsic::ID IID = ICA.getID();
4499 unsigned ISD = ISD::DELETED_NODE;
4500 switch (IID) {
4501 default:
4502 break;
4503 case Intrinsic::abs:
4504 ISD = ISD::ABS;
4505 break;
4506 case Intrinsic::bitreverse:
4508 break;
4509 case Intrinsic::bswap:
4510 ISD = ISD::BSWAP;
4511 break;
4512 case Intrinsic::ctlz:
4513 ISD = ISD::CTLZ;
4514 break;
4515 case Intrinsic::ctpop:
4516 ISD = ISD::CTPOP;
4517 break;
4518 case Intrinsic::cttz:
4519 ISD = ISD::CTTZ;
4520 break;
4521 case Intrinsic::fshl:
4522 ISD = ISD::FSHL;
4523 if (!ICA.isTypeBasedOnly()) {
4524 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4525 if (Args[0] == Args[1]) {
4526 ISD = ISD::ROTL;
4527 // Handle uniform constant rotation amounts.
4528 // TODO: Handle funnel-shift cases.
4529 const APInt *Amt;
4530 if (Args[2] &&
4533 }
4534 }
4535 break;
4536 case Intrinsic::fshr:
4537 // FSHR has same costs so don't duplicate.
4538 ISD = ISD::FSHL;
4539 if (!ICA.isTypeBasedOnly()) {
4540 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4541 if (Args[0] == Args[1]) {
4542 ISD = ISD::ROTR;
4543 // Handle uniform constant rotation amount.
4544 // TODO: Handle funnel-shift cases.
4545 const APInt *Amt;
4546 if (Args[2] &&
4549 }
4550 }
4551 break;
4552 case Intrinsic::lrint:
4553 case Intrinsic::llrint: {
4554 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4555 // have the same costs as the CVTTP2SI (fptosi) instructions
4556 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4557 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4559 }
4560 case Intrinsic::maxnum:
4561 case Intrinsic::minnum:
4562 // FMINNUM has same costs so don't duplicate.
4563 ISD = ISD::FMAXNUM;
4564 break;
4565 case Intrinsic::sadd_sat:
4566 ISD = ISD::SADDSAT;
4567 break;
4568 case Intrinsic::smax:
4569 ISD = ISD::SMAX;
4570 break;
4571 case Intrinsic::smin:
4572 ISD = ISD::SMIN;
4573 break;
4574 case Intrinsic::ssub_sat:
4575 ISD = ISD::SSUBSAT;
4576 break;
4577 case Intrinsic::uadd_sat:
4578 ISD = ISD::UADDSAT;
4579 break;
4580 case Intrinsic::umax:
4581 ISD = ISD::UMAX;
4582 break;
4583 case Intrinsic::umin:
4584 ISD = ISD::UMIN;
4585 break;
4586 case Intrinsic::usub_sat:
4587 ISD = ISD::USUBSAT;
4588 break;
4589 case Intrinsic::sqrt:
4590 ISD = ISD::FSQRT;
4591 break;
4592 case Intrinsic::sadd_with_overflow:
4593 case Intrinsic::ssub_with_overflow:
4594 // SSUBO has same costs so don't duplicate.
4595 ISD = ISD::SADDO;
4596 OpTy = RetTy->getContainedType(0);
4597 break;
4598 case Intrinsic::uadd_with_overflow:
4599 case Intrinsic::usub_with_overflow:
4600 // USUBO has same costs so don't duplicate.
4601 ISD = ISD::UADDO;
4602 OpTy = RetTy->getContainedType(0);
4603 break;
4604 case Intrinsic::smul_with_overflow:
4605 ISD = ISD::SMULO;
4606 OpTy = RetTy->getContainedType(0);
4607 break;
4608 case Intrinsic::umul_with_overflow:
4609 ISD = ISD::UMULO;
4610 OpTy = RetTy->getContainedType(0);
4611 break;
4612 }
4613
4614 if (ISD != ISD::DELETED_NODE) {
4615 auto adjustTableCost = [&](int ISD, unsigned Cost,
4616 std::pair<InstructionCost, MVT> LT,
4618 InstructionCost LegalizationCost = LT.first;
4619 MVT MTy = LT.second;
4620
4621 // If there are no NANs to deal with, then these are reduced to a
4622 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4623 // assume is used in the non-fast case.
4624 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4625 if (FMF.noNaNs())
4626 return LegalizationCost * 1;
4627 }
4628
4629 // For cases where some ops can be folded into a load/store, assume free.
4630 if (MTy.isScalarInteger()) {
4631 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4632 if (const Instruction *II = ICA.getInst()) {
4633 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4634 return TTI::TCC_Free;
4635 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4636 if (LI->hasOneUse())
4637 return TTI::TCC_Free;
4638 }
4639 }
4640 }
4641 }
4642
4643 return LegalizationCost * (int)Cost;
4644 };
4645
4646 // Legalize the type.
4647 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4648 MVT MTy = LT.second;
4649
4650 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4651 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4652 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4653 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4654 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4655 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4656 if (Cst->isAllOnesValue())
4658 }
4659
4660 // FSQRT is a single instruction.
4662 return LT.first;
4663
4664 if (ST->useGLMDivSqrtCosts())
4665 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4666 if (auto KindCost = Entry->Cost[CostKind])
4667 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4668
4669 if (ST->useSLMArithCosts())
4670 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4671 if (auto KindCost = Entry->Cost[CostKind])
4672 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4673
4674 if (ST->hasVBMI2())
4675 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4676 if (auto KindCost = Entry->Cost[CostKind])
4677 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4678
4679 if (ST->hasBITALG())
4680 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4681 if (auto KindCost = Entry->Cost[CostKind])
4682 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4683
4684 if (ST->hasVPOPCNTDQ())
4685 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4686 if (auto KindCost = Entry->Cost[CostKind])
4687 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4688
4689 if (ST->hasGFNI())
4690 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4691 if (auto KindCost = Entry->Cost[CostKind])
4692 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4693
4694 if (ST->hasCDI())
4695 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4696 if (auto KindCost = Entry->Cost[CostKind])
4697 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4698
4699 if (ST->hasBWI())
4700 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4701 if (auto KindCost = Entry->Cost[CostKind])
4702 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4703
4704 if (ST->hasAVX512())
4705 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4706 if (auto KindCost = Entry->Cost[CostKind])
4707 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4708
4709 if (ST->hasXOP())
4710 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4711 if (auto KindCost = Entry->Cost[CostKind])
4712 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4713
4714 if (ST->hasAVX2())
4715 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4716 if (auto KindCost = Entry->Cost[CostKind])
4717 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4718
4719 if (ST->hasAVX())
4720 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4721 if (auto KindCost = Entry->Cost[CostKind])
4722 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4723
4724 if (ST->hasSSE42())
4725 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4726 if (auto KindCost = Entry->Cost[CostKind])
4727 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4728
4729 if (ST->hasSSE41())
4730 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4731 if (auto KindCost = Entry->Cost[CostKind])
4732 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4733
4734 if (ST->hasSSSE3())
4735 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4736 if (auto KindCost = Entry->Cost[CostKind])
4737 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4738
4739 if (ST->hasSSE2())
4740 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4741 if (auto KindCost = Entry->Cost[CostKind])
4742 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4743
4744 if (ST->hasSSE1())
4745 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4746 if (auto KindCost = Entry->Cost[CostKind])
4747 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4748
4749 if (ST->hasBMI()) {
4750 if (ST->is64Bit())
4751 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4752 if (auto KindCost = Entry->Cost[CostKind])
4753 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4754
4755 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4756 if (auto KindCost = Entry->Cost[CostKind])
4757 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4758 }
4759
4760 if (ST->hasLZCNT()) {
4761 if (ST->is64Bit())
4762 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4763 if (auto KindCost = Entry->Cost[CostKind])
4764 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4765
4766 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4767 if (auto KindCost = Entry->Cost[CostKind])
4768 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4769 }
4770
4771 if (ST->hasPOPCNT()) {
4772 if (ST->is64Bit())
4773 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4774 if (auto KindCost = Entry->Cost[CostKind])
4775 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4776
4777 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4778 if (auto KindCost = Entry->Cost[CostKind])
4779 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4780 }
4781
4782 if (ST->is64Bit())
4783 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4784 if (auto KindCost = Entry->Cost[CostKind])
4785 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4786
4787 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4788 if (auto KindCost = Entry->Cost[CostKind])
4789 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4790
4791 // Without arg data, we need to compute the expanded costs of custom lowered
4792 // intrinsics to prevent use of the (very low) default costs.
4793 if (ICA.isTypeBasedOnly() &&
4794 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4795 Type *CondTy = RetTy->getWithNewBitWidth(1);
4797 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4798 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4799 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4800 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4801 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4802 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4804 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4806 return Cost;
4807 }
4808 }
4809
4811}
4812
// X86TTIImpl::getVectorInstrCost -- cost of a single insertelement /
// extractelement at a (possibly unknown, Index == -1U) lane.
// NOTE(review): this is a scraped listing; the declaration line (orig. 4813,
// presumably "InstructionCost X86TTIImpl::getVectorInstrCost(") and a few
// interior lines (e.g. orig. 4826, 4914, 4916) were dropped by the
// extraction -- verify the gaps against upstream LLVM before editing.
4814 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4815 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Silvermont-specific flat extract costs: per the table, moving a scalar out
// of an XMM register is comparatively expensive on that uarch (i64 worst).
4816 static const CostTblEntry SLMCostTbl[] = {
4817 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4818 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4819 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4820 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4821 };
4822
4823 assert(Val->isVectorTy() && "This must be a vector type");
4824 auto *VT = cast<VectorType>(Val);
// NOTE(review): orig. 4826 (the statement guarded by this scalable-type
// check -- presumably an early bailout) is elided from this listing.
4825 if (VT->isScalableTy())
4827
4828 Type *ScalarType = Val->getScalarType();
// Extra cost accumulated for moving 128-bit subvectors in/out of wider
// (>128-bit) registers; added to every return path below.
4829 InstructionCost RegisterFileMoveCost = 0;
4830
4831 // Non-immediate extraction/insertion can be handled as a sequence of
4832 // aliased loads+stores via the stack.
4833 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4834 Opcode == Instruction::InsertElement)) {
4835 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4836 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4837
4838 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4839 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4840 Align VecAlign = DL.getPrefTypeAlign(Val);
4841 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4842
4843 // Extract - store vector to stack, load scalar.
4844 if (Opcode == Instruction::ExtractElement) {
4845 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4846 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4847 CostKind);
4848 }
4849 // Insert - store vector to stack, store scalar, load vector.
4850 if (Opcode == Instruction::InsertElement) {
4851 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4852 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4853 CostKind) +
4854 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4855 }
4856 }
4857
// Known (constant) lane index: model pinsr/pextr/insertps style sequences.
4858 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4859 Opcode == Instruction::InsertElement)) {
4860 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4861 if (Opcode == Instruction::ExtractElement &&
4862 ScalarType->getScalarSizeInBits() == 1 &&
4863 cast<FixedVectorType>(Val)->getNumElements() > 1)
4864 return 1;
4865
4866 // Legalize the type.
4867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4868
4869 // This type is legalized to a scalar type.
4870 if (!LT.second.isVector())
4871 return TTI::TCC_Free;
4872
4873 // The type may be split. Normalize the index to the new type.
4874 unsigned SizeInBits = LT.second.getSizeInBits();
4875 unsigned NumElts = LT.second.getVectorNumElements();
4876 unsigned SubNumElts = NumElts;
4877 Index = Index % NumElts;
4878
4879 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4880 // For inserts, we also need to insert the subvector back.
4881 if (SizeInBits > 128) {
4882 assert((SizeInBits % 128) == 0 && "Illegal vector");
4883 unsigned NumSubVecs = SizeInBits / 128;
4884 SubNumElts = NumElts / NumSubVecs;
// Index lands in an upper 128-bit lane: charge 1 for the extract (and +1
// for the re-insert when inserting), then rebase Index into that lane.
4885 if (SubNumElts <= Index) {
4886 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4887 Index %= SubNumElts;
4888 }
4889 }
4890
4891 MVT MScalarTy = LT.second.getScalarType();
4892 auto IsCheapPInsrPExtrInsertPS = [&]() {
4893 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4894 // Inserting f32 into index0 is just movss.
4895 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4896 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4897 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4898 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4899 Opcode == Instruction::InsertElement) ||
4900 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4901 Opcode == Instruction::InsertElement);
4902 };
4903
4904 if (Index == 0) {
4905 // Floating point scalars are already located in index #0.
4906 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4907 // true for all.
4908 if (ScalarType->isFloatingPointTy() &&
4909 (Opcode != Instruction::InsertElement || !Op0 ||
4910 isa<UndefValue>(Op0)))
4911 return RegisterFileMoveCost;
4912
// NOTE(review): orig. 4914 (the rest of this condition) and orig. 4916 are
// elided from this listing -- cannot tell from here what qualifies this
// insert as a cheap "gather"; confirm against upstream.
4913 if (Opcode == Instruction::InsertElement &&
4915 // Consider the gather cost to be cheap.
4917 return RegisterFileMoveCost;
4918 if (!IsCheapPInsrPExtrInsertPS()) {
4919 // mov constant-to-GPR + movd/movq GPR -> XMM.
4920 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4921 return 2 + RegisterFileMoveCost;
4922 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4923 return 1 + RegisterFileMoveCost;
4924 }
4925 }
4926
4927 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4928 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4929 return 1 + RegisterFileMoveCost;
4930 }
4931
4932 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4933 assert(ISD && "Unexpected vector opcode");
// Silvermont overrides the generic costs for extracts (table above).
4934 if (ST->useSLMArithCosts())
4935 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4936 return Entry->Cost + RegisterFileMoveCost;
4937
4938 // Consider cheap cases.
4939 if (IsCheapPInsrPExtrInsertPS())
4940 return 1 + RegisterFileMoveCost;
4941
4942 // For extractions we just need to shuffle the element to index 0, which
4943 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4944 // the elements to its destination. In both cases we must handle the
4945 // subvector move(s).
4946 // If the vector type is already less than 128-bits then don't reduce it.
4947 // TODO: Under what circumstances should we shuffle using the full width?
4948 InstructionCost ShuffleCost = 1;
4949 if (Opcode == Instruction::InsertElement) {
4950 auto *SubTy = cast<VectorType>(Val);
4951 EVT VT = TLI->getValueType(DL, Val);
4952 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4953 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4954 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4955 CostKind, 0, SubTy);
4956 }
// Integer elements pay one extra move to/from a GPR; fp stays in XMM.
4957 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4958 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4959 }
4960
// Not an insert/extract we model specially: defer to the base implementation
// plus any subvector move cost accumulated above.
4961 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
4962 VIC) +
4963 RegisterFileMoveCost;
4964}
4965
// X86TTIImpl::getScalarizationOverhead -- estimates the cost of inserting
// and/or extracting the demanded elements of fixed vector type Ty.
// NOTE(review): scraped listing; the declaration line (orig. 4966) and some
// interior lines (e.g. orig. 4977 -- apparently the `Cost` accumulator's
// declaration -- plus 5000/5002, 5052, 5070, 5088, 5130, 5132) are elided.
// Verify the gaps against upstream LLVM before editing.
4967 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4968 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4969 TTI::VectorInstrContext VIC) const {
// DemandedElts has one bit per element of Ty.
4970 assert(DemandedElts.getBitWidth() ==
4971 cast<FixedVectorType>(Ty)->getNumElements() &&
4972 "Vector size mismatch");
4973
4974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4975 MVT MScalarTy = LT.second.getScalarType();
4976 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4978
// Work is modelled in 128-bit (XMM) lanes throughout.
4979 constexpr unsigned LaneBitWidth = 128;
4980 assert((LegalVectorBitWidth < LaneBitWidth ||
4981 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4982 "Illegal vector");
4983
// LT.first is the split factor: how many legal registers hold Ty.
4984 const int NumLegalVectors = LT.first.getValue();
4985 assert(NumLegalVectors >= 0 && "Negative cost!");
4986
4987 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
4988 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
4989 // a special heuristic regarding poison input which is passed here in
4990 // ForPoisonSrc.
4991 if (Insert && !ForPoisonSrc) {
4992 // This is nearly identical to BaseT::getScalarizationOverhead(), except
4993 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
4994 // Constant::getNullValue()), which makes the X86TTIImpl
4995 // getVectorInstrCost() return 0 instead of 1.
4996 for (unsigned I : seq(DemandedElts.getBitWidth())) {
4997 if (!DemandedElts[I])
4998 continue;
// NOTE(review): orig. 5000 and 5002 (the remaining call arguments) are
// elided here; per the comment above, Op0 is presumably nullptr.
4999 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
5001 VL.empty() ? nullptr : VL[I],
5003 }
5004 return Cost;
5005 }
5006
5007 if (Insert) {
// Fast direct-insert types: pinsrw (SSE2), pinsrb/d/q + insertps (SSE41).
5008 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5009 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5010 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5011 // For types we can insert directly, insertion into 128-bit sub vectors is
5012 // cheap, followed by a cheap chain of concatenations.
5013 if (LegalVectorBitWidth <= LaneBitWidth) {
5014 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5015 /*Extract*/ false, CostKind);
5016 } else {
5017 // In each 128-lane, if at least one index is demanded but not all
5018 // indices are demanded and this 128-lane is not the first 128-lane of
5019 // the legalized-vector, then this 128-lane needs a extracti128; If in
5020 // each 128-lane, there is at least one demanded index, this 128-lane
5021 // needs a inserti128.
5022
5023 // The following cases will help you build a better understanding:
5024 // Assume we insert several elements into a v8i32 vector in avx2,
5025 // Case#1: inserting into 1th index needs vpinsrd + inserti128.
5026 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
5027 // inserti128.
5028 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5029 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5030 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5031 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5032 unsigned NumLegalElts =
5033 LT.second.getVectorNumElements() * NumLegalVectors;
5034 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5035 "Vector has been legalized to smaller element count");
5036 assert((NumLegalElts % NumLanesTotal) == 0 &&
5037 "Unexpected elts per lane");
5038 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5039
// Widen the demanded mask to cover legalization padding, then walk lanes.
5040 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5041 auto *LaneTy =
5042 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5043
5044 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5045 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5046 NumEltsPerLane, NumEltsPerLane * I);
5047 if (LaneEltMask.isZero())
5048 continue;
5049 // FIXME: we don't need to extract if all non-demanded elements
5050 // are legalization-inserted padding.
// NOTE(review): orig. 5052 (the start of the subvector-extract cost call
// guarded by this partial-lane check) is elided from this listing.
5051 if (!LaneEltMask.isAllOnes())
5053 CostKind, I * NumEltsPerLane, LaneTy);
5054 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5055 /*Extract*/ false, CostKind);
5056 }
5057
// Charge an insert-subvector for each touched lane, except lane 0 of a
// legal vector whose lanes were ALL rebuilt (the concat covers it).
5058 APInt AffectedLanes =
5059 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5060 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5061 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5062 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5063 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5064 unsigned I = NumLegalLanes * LegalVec + Lane;
5065 // No need to insert unaffected lane; or lane 0 of each legal vector
5066 // iff ALL lanes of that vector were affected and will be inserted.
5067 if (!AffectedLanes[I] ||
5068 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5069 continue;
// NOTE(review): orig. 5070 (the start of this insert-subvector cost call)
// is elided from this listing.
5071 CostKind, I * NumEltsPerLane, LaneTy);
5072 }
5073 }
5074 }
5075 } else if (LT.second.isVector()) {
5076 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5077 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5078 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5079 // considered cheap.
5080 if (Ty->isIntOrIntVectorTy())
5081 Cost += DemandedElts.popcount();
5082
5083 // Get the smaller of the legalized or original pow2-extended number of
5084 // vector elements, which represents the number of unpacks we'll end up
5085 // performing.
5086 unsigned NumElts = LT.second.getVectorNumElements();
// NOTE(review): orig. 5088 (Pow2Elts initializer) is elided -- presumably
// a power-of-2 ceiling of the source element count; confirm upstream.
5087 unsigned Pow2Elts =
5089 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5090 }
5091 }
5092
5093 if (Extract) {
5094 // vXi1 can be efficiently extracted with MOVMSK.
5095 // TODO: AVX512 predicate mask handling.
5096 // NOTE: This doesn't work well for roundtrip scalarization.
5097 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5098 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
// One MOVMSK covers 16 bytes (SSE) or 32 bytes (AVX2) of mask bits.
5099 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5100 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5101 return MOVMSKCost;
5102 }
5103
5104 if (LT.second.isVector()) {
5105 unsigned NumLegalElts =
5106 LT.second.getVectorNumElements() * NumLegalVectors;
5107 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5108 "Vector has been legalized to smaller element count");
5109
5110 // If we're extracting elements from a 128-bit subvector lane,
5111 // we only need to extract each lane once, not for every element.
5112 if (LegalVectorBitWidth > LaneBitWidth) {
5113 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5114 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5115 assert((NumLegalElts % NumLanesTotal) == 0 &&
5116 "Unexpected elts per lane");
5117 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5118
5119 // Add cost for each demanded 128-bit subvector extraction.
5120 // Luckily this is a lot easier than for insertion.
5121 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5122 auto *LaneTy =
5123 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5124
5125 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5126 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5127 NumEltsPerLane, I * NumEltsPerLane);
5128 if (LaneEltMask.isZero())
5129 continue;
// NOTE(review): orig. 5130 (extract-subvector cost call head) and 5132
// (the per-lane scalarization call head) are elided from this listing.
5131 I * NumEltsPerLane, LaneTy);
5133 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5134 }
5135
5136 return Cost;
5137 }
5138 }
5139
5140 // Fallback to default extraction.
5141 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5142 Extract, CostKind);
5143 }
5144
5145 return Cost;
5146}
5147
// Cost of a "replication" shuffle: each of the VF source elements is repeated
// ReplicationFactor times in the destination. Only AVX512 targets are costed
// here; everything else bails out to the base implementation.
// NOTE(review): scraped listing; the return-type line (orig. 5148) and a few
// interior lines (orig. 5151 -- presumably the CostKind parameter -- and
// 5219/5223, the trailing getCastInstrCost arguments) are elided. Verify the
// gaps against upstream LLVM before editing.
5149X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5150 int VF, const APInt &DemandedDstElts,
5152 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5153 // We don't differentiate element types here, only element bit width.
5154 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5155
// Fallback used whenever we can't (or don't want to) model this natively.
5156 auto bailout = [&]() {
5157 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5158 DemandedDstElts, CostKind);
5159 };
5160
5161 // For now, only deal with AVX512 cases.
5162 if (!ST->hasAVX512())
5163 return bailout();
5164
5165 // Do we have a native shuffle for this element type, or should we promote?
// Pick the narrowest AVX512 feature level that can shuffle this element
// width natively; otherwise promote to a wider integer element type.
5166 unsigned PromEltTyBits = EltTyBits;
5167 switch (EltTyBits) {
5168 case 32:
5169 case 64:
5170 break; // AVX512F.
5171 case 16:
5172 if (!ST->hasBWI())
5173 PromEltTyBits = 32; // promote to i32, AVX512F.
5174 break; // AVX512BW
5175 case 8:
5176 if (!ST->hasVBMI())
5177 PromEltTyBits = 32; // promote to i32, AVX512F.
5178 break; // AVX512VBMI
5179 case 1:
5180 // There is no support for shuffling i1 elements. We *must* promote.
5181 if (ST->hasBWI()) {
5182 if (ST->hasVBMI())
5183 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5184 else
5185 PromEltTyBits = 16; // promote to i16, AVX512BW.
5186 break;
5187 }
5188 PromEltTyBits = 32; // promote to i32, AVX512F.
5189 break;
5190 default:
5191 return bailout();
5192 }
5193 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5194
5195 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5196 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5197
5198 int NumDstElements = VF * ReplicationFactor;
5199 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5200 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5201
5202 // Legalize the types.
5203 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5204 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5205 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5206 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5207 // They should have legalized into vector types.
5208 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5209 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5210 return bailout();
5211
// Promotion path: ext the source, recurse at the wider element type, then
// truncate the result back down. The cast costs bracket the recursion.
5212 if (PromEltTyBits != EltTyBits) {
5213 // If we have to perform the shuffle with wider elt type than our data type,
5214 // then we will first need to anyext (we don't care about the new bits)
5215 // the source elements, and then truncate Dst elements.
5216 InstructionCost PromotionCost;
// NOTE(review): orig. 5219 and 5223 (the trailing arguments of these two
// getCastInstrCost calls -- presumably a CastContextHint and CostKind) are
// elided from this listing.
5217 PromotionCost += getCastInstrCost(
5218 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5220 PromotionCost +=
5221 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5222 /*Src=*/PromDstVecTy,
5224 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5225 ReplicationFactor, VF,
5226 DemandedDstElts, CostKind);
5227 }
5228
5229 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5230 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5231 "We expect that the legalization doesn't affect the element width, "
5232 "doesn't coalesce/split elements.");
5233
5234 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5235 unsigned NumDstVectors =
5236 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5237
5238 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5239
5240 // Not all the produced Dst elements may be demanded. In our case,
5241 // given that a single Dst vector is formed by a single shuffle,
5242 // if all elements that will form a single Dst vector aren't demanded,
5243 // then we won't need to do that shuffle, so adjust the cost accordingly.
5244 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5245 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5246 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5247
// Each demanded legal destination vector costs one single-source permute.
5248 InstructionCost SingleShuffleCost =
5249 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5250 /*Mask=*/{}, CostKind,
5251 /*Index=*/0, /*SubTp=*/nullptr);
5252 return NumDstVectorsDemanded * SingleShuffleCost;
5253}
5254
5256 Align Alignment,
5257 unsigned AddressSpace,
5259 TTI::OperandValueInfo OpInfo,
5260 const Instruction *I) const {
5261 // TODO: Handle other cost kinds.
5263 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5264 // Store instruction with index and scale costs 2 Uops.
5265 // Check the preceding GEP to identify non-const indices.
5266 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5267 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5268 return TTI::TCC_Basic * 2;
5269 }
5270 }
5271 return TTI::TCC_Basic;
5272 }
5273
5274 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5275 "Invalid Opcode");
5276 // Type legalization can't handle structs
5277 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5278 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5279 CostKind, OpInfo, I);
5280
5281 // Legalize the type.
5282 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5283
5284 auto *VTy = dyn_cast<FixedVectorType>(Src);
5285
5287
5288 // Add a cost for constant load to vector.
5289 if (Opcode == Instruction::Store && OpInfo.isConstant())
5290 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5291 /*AddressSpace=*/0, CostKind, OpInfo);
5292
5293 // Handle the simple case of non-vectors.
5294 // NOTE: this assumes that legalization never creates vector from scalars!
5295 if (!VTy || !LT.second.isVector()) {
5296 // Each load/store unit costs 1.
5297 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5298 }
5299
5300 bool IsLoad = Opcode == Instruction::Load;
5301
5302 Type *EltTy = VTy->getElementType();
5303
5304 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5305
5306 // Source of truth: how many elements were there in the original IR vector?
5307 const unsigned SrcNumElt = VTy->getNumElements();
5308
5309 // How far have we gotten?
5310 int NumEltRemaining = SrcNumElt;
5311 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5312 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5313
5314 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5315
5316 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5317 const unsigned XMMBits = 128;
5318 if (XMMBits % EltTyBits != 0)
5319 // Vector size must be a multiple of the element size. I.e. no padding.
5320 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5321 CostKind, OpInfo, I);
5322 const int NumEltPerXMM = XMMBits / EltTyBits;
5323
5324 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5325
5326 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5327 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5328 // How many elements would a single op deal with at once?
5329 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5330 // Vector size must be a multiple of the element size. I.e. no padding.
5331 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5332 CostKind, OpInfo, I);
5333 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5334
5335 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5336 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5337 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5338 "Unless we haven't halved the op size yet, "
5339 "we have less than two op's sized units of work left.");
5340
5341 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5342 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5343 : XMMVecTy;
5344
5345 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5346 "After halving sizes, the vector elt count is no longer a multiple "
5347 "of number of elements per operation?");
5348 auto *CoalescedVecTy =
5349 CurrNumEltPerOp == 1
5350 ? CurrVecTy
5352 IntegerType::get(Src->getContext(),
5353 EltTyBits * CurrNumEltPerOp),
5354 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5355 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5356 DL.getTypeSizeInBits(CurrVecTy) &&
5357 "coalesciing elements doesn't change vector width.");
5358
5359 while (NumEltRemaining > 0) {
5360 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5361
5362 // Can we use this vector size, as per the remaining element count?
5363 // Iff the vector is naturally aligned, we can do a wide load regardless.
5364 if (NumEltRemaining < CurrNumEltPerOp &&
5365 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5366 break; // Try smalled vector size.
5367
5368 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5369 // as a proxy for a double-pumped AVX memory interface such as on
5370 // Sandybridge.
5371 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5372 // will be scalarized.
5373 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5374 Cost += 2;
5375 else if (CurrOpSizeBytes < 4)
5376 Cost += 2;
5377 else
5378 Cost += 1;
5379
5380 // If we're loading a uniform value, then we don't need to split the load,
5381 // loading just a single (widest) vector can be reused by all splits.
5382 if (IsLoad && OpInfo.isUniform())
5383 return Cost;
5384
5385 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5386
5387 // If we have fully processed the previous reg, we need to replenish it.
5388 if (SubVecEltsLeft == 0) {
5389 SubVecEltsLeft += CurrVecTy->getNumElements();
5390 // And that's free only for the 0'th subvector of a legalized vector.
5391 if (!Is0thSubVec)
5392 Cost +=
5395 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5396 }
5397
5398 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5399 // for smaller widths (32/16/8) we have to insert/extract them separately.
5400 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5401 // but let's pretend that it is also true for 16/8 bit wide ops...)
5402 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5403 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5404 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5405 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5406 APInt DemandedElts =
5407 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5408 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5409 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5410 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5411 !IsLoad, CostKind);
5412 }
5413
5414 SubVecEltsLeft -= CurrNumEltPerOp;
5415 NumEltRemaining -= CurrNumEltPerOp;
5416 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5417 }
5418 }
5419
5420 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5421
5422 return Cost;
5423}
5424
// Dispatch masked memory intrinsics to the matching specialized cost hooks:
// gathers/scatters and masked loads/stores each have their own model below.
// NOTE(review): the function signature preceding this switch (and any
// fall-through handling after it) was lost in doc extraction — presumably
// this is X86TTIImpl::getMemIntrinsicInstrCost; confirm against upstream.
5428 switch (MICA.getID()) {
5429 case Intrinsic::masked_scatter:
5430 case Intrinsic::masked_gather:
5431 return getGatherScatterOpCost(MICA, CostKind);
5432 case Intrinsic::masked_load:
5433 case Intrinsic::masked_store:
5434 return getMaskedMemoryOpCost(MICA, CostKind);
5435 }
5437}
5438
// Cost model for masked vector load/store (llvm.masked.load/store).
// Flow: scalar types take the plain memory-op cost; vectors whose masked op
// is not legal for this subtarget are priced as full scalarization
// (per-element compare + branch + scalar memop); legal vectors are priced
// after type legalization, adding shuffle costs for mask promotion/expansion
// and a flat per-register cost (AVX-512 cheap, pre-AVX-512 maskmov 2/~8).
// NOTE(review): several declaration lines were dropped by doc extraction
// (e.g. the `MaskSplitCost`/`ValueSplitCost` getScalarizationOverhead calls
// at original lines 5463/5470, the compare predicate argument at 5467, and
// the initial `Cost` declaration at 5481) — verify against upstream before
// editing this block.
5442 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5443 : Instruction::Store;
5444 Type *SrcTy = MICA.getDataType();
5445 Align Alignment = MICA.getAlignment();
5446 unsigned AddressSpace = MICA.getAddressSpace();
5447
5448 bool IsLoad = (Instruction::Load == Opcode);
5449 bool IsStore = (Instruction::Store == Opcode);
5450
5451 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5452 if (!SrcVTy)
5453 // To calculate scalar take the regular cost, without mask
5454 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5455
5456 unsigned NumElem = SrcVTy->getNumElements();
5457 auto *MaskTy =
5458 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5459 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5460 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5461 // Scalarization: extract each mask lane, compare, branch, and do a
5461 // scalar memory op per element.
5462 APInt DemandedElts = APInt::getAllOnes(NumElem);
5464 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5465 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5466 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5468 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5469 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5471 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5472 InstructionCost MemopCost =
5473 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5474 Alignment, AddressSpace, CostKind);
5475 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5476 }
5477
5478 // Legalize the type.
5479 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5480 auto VT = TLI->getValueType(DL, SrcVTy);
5482 MVT Ty = LT.second;
5483 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5484 // APX masked load/store for scalar is cheap.
5485 return Cost + LT.first;
5486
5487 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5488 LT.second.getVectorNumElements() == NumElem)
5489 // Promotion requires extend/truncate for data and a shuffle for mask.
5490 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5491 0, nullptr) +
5492 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5493 0, nullptr);
5494
5495 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5496 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5497 (unsigned)LT.first.getValue() *
5498 Ty.getVectorNumElements());
5499 // Expanding requires fill mask with zeroes
5500 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5501 CostKind, 0, MaskTy);
5502 }
5503
5504 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5505 if (!ST->hasAVX512())
5506 return Cost + LT.first * (IsLoad ? 2 : 8);
5507
5508 // AVX-512 masked load/store is cheaper
5509 return Cost + LT.first;
5510}
5511
// Cost of a chain of pointers (GEPs) sharing the same base. When every
// pointer has a known stride the offsets fold into X86 addressing-mode
// displacements, so only the base GEP (if any) carries a cost; otherwise
// defer to the generic implementation.
// NOTE(review): the opening signature line was lost in doc extraction.
5513 ArrayRef<const Value *> Ptrs, const Value *Base,
5514 const TTI::PointersChainInfo &Info, Type *AccessTy,
5516 if (Info.isSameBase() && Info.isKnownStride()) {
5517 // If all the pointers have known stride all the differences are translated
5518 // into constants. X86 memory addressing allows encoding it into
5519 // displacement. So we just need to take the base GEP cost.
5520 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5521 SmallVector<const Value *> Indices(BaseGEP->indices());
5522 return getGEPCost(BaseGEP->getSourceElementType(),
5523 BaseGEP->getPointerOperand(), Indices, nullptr,
5524 CostKind);
5525 }
5526 return TTI::TCC_Free;
5527 }
5528 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5529}
5530
// Cost of computing an address for a (possibly vectorized) memory access.
// Non-strided vector accesses on pre-AVX2 targets are penalized heavily;
// strided-but-unknown-step accesses cost one extra ADD; everything else
// defers to the base implementation.
// NOTE(review): the opening signature lines were lost in doc extraction.
5533 const SCEV *Ptr,
5535 // Address computations in vectorized code with non-consecutive addresses will
5536 // likely result in more instructions compared to scalar code where the
5537 // computation can more often be merged into the index mode. The resulting
5538 // extra micro-ops can significantly decrease throughput.
5539 const unsigned NumVectorInstToHideOverhead = 10;
5540
5541 // Cost modeling of Strided Access Computation is hidden by the indexing
5542 // modes of X86 regardless of the stride value. We don't believe that there
5543 // is a difference between constant strided access in general and constant
5544 // strided value which is less than or equal to 64.
5545 // Even in the case of (loop invariant) stride whose value is not known at
5546 // compile time, the address computation will not incur more than one extra
5547 // ADD instruction.
5548 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5549 // TODO: AVX2 is the current cut-off because we don't have correct
5550 // interleaving costs for prior ISA's.
5551 if (!BaseT::isStridedAccess(Ptr))
5552 return NumVectorInstToHideOverhead;
5553 if (!BaseT::getConstantStrideStep(SE, Ptr))
5554 return 1;
5555 }
5556
5557 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5558}
5559
// Cost of a vector arithmetic reduction (e.g. vector.reduce.add/fadd).
// Strategy: consult IACA-derived per-CPU tables first (both for illegal
// narrow types and for the legalized type), special-case vXi8 MUL (widened
// to i16), handle i1 allof/anyof/popcount reductions via movmsk tables, and
// otherwise model the classic log2 shuffle-and-op reduction ladder plus a
// final element extract.
// NOTE(review): the function signature and several call-argument lines
// (original lines 5563-5564, 5635, 5716, 5718, 5807-5809, 5819) were lost
// in doc extraction — verify against upstream before editing this block.
5562 std::optional<FastMathFlags> FMF,
5565 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5566
5567 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5568 // and make it as the cost.
5569
5570 static const CostTblEntry SLMCostTbl[] = {
5571 { ISD::FADD, MVT::v2f64, 3 },
5572 { ISD::ADD, MVT::v2i64, 5 },
5573 };
5574
5575 static const CostTblEntry SSE2CostTbl[] = {
5576 { ISD::FADD, MVT::v2f64, 2 },
5577 { ISD::FADD, MVT::v2f32, 2 },
5578 { ISD::FADD, MVT::v4f32, 4 },
5579 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5580 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5581 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5582 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5583 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5584 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5585 { ISD::ADD, MVT::v2i8, 2 },
5586 { ISD::ADD, MVT::v4i8, 2 },
5587 { ISD::ADD, MVT::v8i8, 2 },
5588 { ISD::ADD, MVT::v16i8, 3 },
5589 };
5590
5591 static const CostTblEntry AVX1CostTbl[] = {
5592 { ISD::FADD, MVT::v4f64, 3 },
5593 { ISD::FADD, MVT::v4f32, 3 },
5594 { ISD::FADD, MVT::v8f32, 4 },
5595 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5596 { ISD::ADD, MVT::v4i64, 3 },
5597 { ISD::ADD, MVT::v8i32, 5 },
5598 { ISD::ADD, MVT::v16i16, 5 },
5599 { ISD::ADD, MVT::v32i8, 4 },
5600 };
5601
5602 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5603 assert(ISD && "Invalid opcode");
5604
5605 // Before legalizing the type, give a chance to look up illegal narrow types
5606 // in the table.
5607 // FIXME: Is there a better way to do this?
5608 EVT VT = TLI->getValueType(DL, ValTy);
5609 if (VT.isSimple()) {
5610 MVT MTy = VT.getSimpleVT();
5611 if (ST->useSLMArithCosts())
5612 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5613 return Entry->Cost;
5614
5615 if (ST->hasAVX())
5616 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5617 return Entry->Cost;
5618
5619 if (ST->hasSSE2())
5620 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5621 return Entry->Cost;
5622 }
5623
5624 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5625
5626 MVT MTy = LT.second;
5627
5628 auto *ValVTy = cast<FixedVectorType>(ValTy);
5629
5630 // Special case: vXi8 mul reductions are performed as vXi16.
5631 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5632 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5633 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5634 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5636 CostKind) +
5637 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5638 }
5639
5640 InstructionCost ArithmeticCost = 0;
5641 if (LT.first != 1 && MTy.isVector() &&
5642 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5643 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5644 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5645 MTy.getVectorNumElements());
5646 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5647 ArithmeticCost *= LT.first - 1;
5648 }
5649
5650 if (ST->useSLMArithCosts())
5651 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5652 return ArithmeticCost + Entry->Cost;
5653
5654 if (ST->hasAVX())
5655 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5656 return ArithmeticCost + Entry->Cost;
5657
5658 if (ST->hasSSE2())
5659 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5660 return ArithmeticCost + Entry->Cost;
5661
5662 // FIXME: These assume a naive kshift+binop lowering, which is probably
5663 // conservative in most cases.
5664 static const CostTblEntry AVX512BoolReduction[] = {
5665 { ISD::AND, MVT::v2i1, 3 },
5666 { ISD::AND, MVT::v4i1, 5 },
5667 { ISD::AND, MVT::v8i1, 7 },
5668 { ISD::AND, MVT::v16i1, 9 },
5669 { ISD::AND, MVT::v32i1, 11 },
5670 { ISD::AND, MVT::v64i1, 13 },
5671 { ISD::OR, MVT::v2i1, 3 },
5672 { ISD::OR, MVT::v4i1, 5 },
5673 { ISD::OR, MVT::v8i1, 7 },
5674 { ISD::OR, MVT::v16i1, 9 },
5675 { ISD::OR, MVT::v32i1, 11 },
5676 { ISD::OR, MVT::v64i1, 13 },
5677 };
5678
5679 static const CostTblEntry AVX2BoolReduction[] = {
5680 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5681 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5682 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5683 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5684 };
5685
5686 static const CostTblEntry AVX1BoolReduction[] = {
5687 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5688 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5689 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5690 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5691 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5692 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5693 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5694 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5695 };
5696
5697 static const CostTblEntry SSE2BoolReduction[] = {
5698 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5699 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5700 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5701 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5702 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5703 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5704 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5705 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5706 };
5707
5708 // Handle bool allof/anyof patterns.
5709 if (ValVTy->getElementType()->isIntegerTy(1)) {
5710 if (ISD == ISD::ADD) {
5711 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5712 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5713 ValVTy->getNumElements());
5714 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5715 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5717 CostKind) +
5719 }
5720
5721 InstructionCost ArithmeticCost = 0;
5722 if (LT.first != 1 && MTy.isVector() &&
5723 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5724 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5725 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5726 MTy.getVectorNumElements());
5727 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5728 ArithmeticCost *= LT.first - 1;
5729 }
5730
5731 if (ST->hasAVX512())
5732 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5733 return ArithmeticCost + Entry->Cost;
5734 if (ST->hasAVX2())
5735 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5736 return ArithmeticCost + Entry->Cost;
5737 if (ST->hasAVX())
5738 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5739 return ArithmeticCost + Entry->Cost;
5740 if (ST->hasSSE2())
5741 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5742 return ArithmeticCost + Entry->Cost;
5743
5744 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5745 }
5746
5747 unsigned NumVecElts = ValVTy->getNumElements();
5748 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5749
5750 // Special case power of 2 reductions where the scalar type isn't changed
5751 // by type legalization.
5752 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5753 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5754
5755 InstructionCost ReductionCost = 0;
5756
5757 auto *Ty = ValVTy;
5758 if (LT.first != 1 && MTy.isVector() &&
5759 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5760 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5761 Ty = FixedVectorType::get(ValVTy->getElementType(),
5762 MTy.getVectorNumElements());
5763 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5764 ReductionCost *= LT.first - 1;
5765 NumVecElts = MTy.getVectorNumElements();
5766 }
5767
5768 // Now handle reduction with the legal type, taking into account size changes
5769 // at each level.
5770 while (NumVecElts > 1) {
5771 // Determine the size of the remaining vector we need to reduce.
5772 unsigned Size = NumVecElts * ScalarSize;
5773 NumVecElts /= 2;
5774 // If we're reducing from 256/512 bits, use an extract_subvector.
5775 if (Size > 128) {
5776 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5777 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5778 CostKind, NumVecElts, SubTy);
5779 Ty = SubTy;
5780 } else if (Size == 128) {
5781 // Reducing from 128 bits is a permute of v2f64/v2i64.
5782 FixedVectorType *ShufTy;
5783 if (ValVTy->isFloatingPointTy())
5784 ShufTy =
5785 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5786 else
5787 ShufTy =
5788 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5789 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5790 {}, CostKind, 0, nullptr);
5791 } else if (Size == 64) {
5792 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5793 FixedVectorType *ShufTy;
5794 if (ValVTy->isFloatingPointTy())
5795 ShufTy =
5796 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5797 else
5798 ShufTy =
5799 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5800 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5801 {}, CostKind, 0, nullptr);
5802 } else {
5803 // Reducing from smaller size is a shift by immediate.
5804 auto *ShiftTy = FixedVectorType::get(
5805 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5806 ReductionCost += getArithmeticInstrCost(
5807 Instruction::LShr, ShiftTy, CostKind,
5810 }
5811
5812 // Add the arithmetic op for this level.
5813 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5814 }
5815
5816 // Add the final extract element to the cost.
5817 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5818 CostKind, 0, nullptr, nullptr,
5820}
5821
// Cost of a single (non-reducing) min/max: model it as the matching
// two-operand intrinsic of the same type and defer to the generic
// intrinsic cost hook.
// NOTE(review): the opening signature lines were lost in doc extraction.
5824 FastMathFlags FMF) const {
5825 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5826 return getIntrinsicInstrCost(ICA, CostKind);
5827}
5828
// Cost of a vector min/max reduction (vector.reduce.smin/umax/fmin/...).
// Maps the intrinsic to an ISD min/max node, consults IACA-derived per-CPU
// tables (first for the raw type, then for the legalized type), and
// otherwise models the log2 shuffle-and-minmax reduction ladder plus a
// final element extract.
// NOTE(review): the function signature and the shift-cost operand-info
// arguments (original lines 5829-5830, 5988-5989, 5999) were lost in doc
// extraction — verify against upstream before editing this block.
5831 FastMathFlags FMF,
5833 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5834
5835 MVT MTy = LT.second;
5836
5837 int ISD;
5838 if (ValTy->isIntOrIntVectorTy()) {
5839 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5840 : ISD::SMIN;
5841 } else {
5842 assert(ValTy->isFPOrFPVectorTy() &&
5843 "Expected float point or integer vector type.");
5844 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5845 ? ISD::FMINNUM
5846 : ISD::FMINIMUM;
5847 }
5848
5849 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5850 // and make it as the cost.
5851
5852 static const CostTblEntry SSE2CostTbl[] = {
5853 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5854 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5855 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5856 };
5857
5858 static const CostTblEntry SSE41CostTbl[] = {
5859 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5860 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5861 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5862 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5863 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5864 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5865 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5866 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5867 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5868 {ISD::SMIN, MVT::v16i8, 6},
5869 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5870 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5871 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5872 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5873 };
5874
5875 static const CostTblEntry AVX1CostTbl[] = {
5876 {ISD::SMIN, MVT::v16i16, 6},
5877 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5878 {ISD::SMIN, MVT::v32i8, 8},
5879 {ISD::UMIN, MVT::v32i8, 8},
5880 };
5881
5882 static const CostTblEntry AVX512BWCostTbl[] = {
5883 {ISD::SMIN, MVT::v32i16, 8},
5884 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5885 {ISD::SMIN, MVT::v64i8, 10},
5886 {ISD::UMIN, MVT::v64i8, 10},
5887 };
5888
5889 // Before legalizing the type, give a chance to look up illegal narrow types
5890 // in the table.
5891 // FIXME: Is there a better way to do this?
5892 EVT VT = TLI->getValueType(DL, ValTy);
5893 if (VT.isSimple()) {
5894 MVT MTy = VT.getSimpleVT();
5895 if (ST->hasBWI())
5896 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5897 return Entry->Cost;
5898
5899 if (ST->hasAVX())
5900 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5901 return Entry->Cost;
5902
5903 if (ST->hasSSE41())
5904 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5905 return Entry->Cost;
5906
5907 if (ST->hasSSE2())
5908 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5909 return Entry->Cost;
5910 }
5911
5912 auto *ValVTy = cast<FixedVectorType>(ValTy);
5913 unsigned NumVecElts = ValVTy->getNumElements();
5914
5915 auto *Ty = ValVTy;
5916 InstructionCost MinMaxCost = 0;
5917 if (LT.first != 1 && MTy.isVector() &&
5918 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5919 // Type needs to be split. We need LT.first - 1 operations ops.
5920 Ty = FixedVectorType::get(ValVTy->getElementType(),
5921 MTy.getVectorNumElements());
5922 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5923 MinMaxCost *= LT.first - 1;
5924 NumVecElts = MTy.getVectorNumElements();
5925 }
5926
5927 if (ST->hasBWI())
5928 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5929 return MinMaxCost + Entry->Cost;
5930
5931 if (ST->hasAVX())
5932 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5933 return MinMaxCost + Entry->Cost;
5934
5935 if (ST->hasSSE41())
5936 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5937 return MinMaxCost + Entry->Cost;
5938
5939 if (ST->hasSSE2())
5940 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5941 return MinMaxCost + Entry->Cost;
5942
5943 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5944
5945 // Special case power of 2 reductions where the scalar type isn't changed
5946 // by type legalization.
5947 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5948 ScalarSize != MTy.getScalarSizeInBits())
5949 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5950
5951 // Now handle reduction with the legal type, taking into account size changes
5952 // at each level.
5953 while (NumVecElts > 1) {
5954 // Determine the size of the remaining vector we need to reduce.
5955 unsigned Size = NumVecElts * ScalarSize;
5956 NumVecElts /= 2;
5957 // If we're reducing from 256/512 bits, use an extract_subvector.
5958 if (Size > 128) {
5959 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5960 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5961 CostKind, NumVecElts, SubTy);
5962 Ty = SubTy;
5963 } else if (Size == 128) {
5964 // Reducing from 128 bits is a permute of v2f64/v2i64.
5965 VectorType *ShufTy;
5966 if (ValTy->isFloatingPointTy())
5967 ShufTy =
5968 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5969 else
5970 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5971 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5972 CostKind, 0, nullptr);
5973 } else if (Size == 64) {
5974 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5975 FixedVectorType *ShufTy;
5976 if (ValTy->isFloatingPointTy())
5977 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5978 else
5979 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5980 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5981 CostKind, 0, nullptr);
5982 } else {
5983 // Reducing from smaller size is a shift by immediate.
5984 auto *ShiftTy = FixedVectorType::get(
5985 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5986 MinMaxCost += getArithmeticInstrCost(
5987 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5990 }
5991
5992 // Add the arithmetic op for this level.
5993 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5994 }
5995
5996 // Add the final extract element to the cost.
5997 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5998 CostKind, 0, nullptr, nullptr,
6000}
6001
6002/// Calculate the cost of materializing a 64-bit value. This helper
6003/// method might only calculate a fraction of a larger immediate. Therefore it
6004/// is valid to return a cost of ZERO.
6005/// Mapping: 0 -> TCC_Free; values representable as a sign-extended 32-bit
6005/// immediate -> TCC_Basic; anything wider -> 2 * TCC_Basic.
6005/// NOTE(review): the signature line was lost in doc extraction.
6006 if (Val == 0)
6007 return TTI::TCC_Free;
6008
6009 if (isInt<32>(Val))
6010 return TTI::TCC_Basic;
6011
6012 return 2 * TTI::TCC_Basic;
6013}
6014
// Cost of materializing an arbitrary-width integer immediate: reject
// oversized (>128-bit) constants as free so constant hoisting ignores them,
// then sum the per-64-bit-chunk materialization cost, clamped to at least 1.
// NOTE(review): the signature lines and the `Cost` accumulator declaration
// (original line 6040) were lost in doc extraction.
6017 assert(Ty->isIntegerTy());
6018
6019 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6020 if (BitSize == 0)
6021 return ~0U;
6022
6023 // Never hoist constants larger than 128bit, because this might lead to
6024 // incorrect code generation or assertions in codegen.
6025 // Fixme: Create a cost model for types larger than i128 once the codegen
6026 // issues have been fixed.
6027 if (BitSize > 128)
6028 return TTI::TCC_Free;
6029
6030 if (Imm == 0)
6031 return TTI::TCC_Free;
6032
6033 // Sign-extend all constants to a multiple of 64-bit.
6034 APInt ImmVal = Imm;
6035 if (BitSize % 64 != 0)
6036 ImmVal = Imm.sext(alignTo(BitSize, 64));
6037
6038 // Split the constant into 64-bit chunks and calculate the cost for each
6039 // chunk.
6041 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6042 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6043 int64_t Val = Tmp.getSExtValue();
6044 Cost += getIntImmCost(Val);
6045 }
6046 // We need at least one instruction to materialize the constant.
6047 return std::max<InstructionCost>(1, Cost);
6048}
6049
// Decide whether an immediate operand of a given instruction is worth
// hoisting by constant hoisting. Per-opcode rules return TCC_Free for
// immediates that X86 can encode directly (or fold via shifts/BEXTR/BZHI/
// opposite-op tricks); otherwise the immediate's materialization cost is
// compared against the encodable baseline.
// NOTE(review): the signature lines and the `Cost` computation at original
// line 6153 were lost in doc extraction — verify against upstream.
6051 const APInt &Imm, Type *Ty,
6053 Instruction *Inst) const {
6054 assert(Ty->isIntegerTy());
6055
6056 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6057 unsigned ImmBitWidth = Imm.getBitWidth();
6058
6059 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6060 // here, so that constant hoisting will ignore this constant.
6061 if (BitSize == 0)
6062 return TTI::TCC_Free;
6063
6064 unsigned ImmIdx = ~0U;
6065 switch (Opcode) {
6066 default:
6067 return TTI::TCC_Free;
6068 case Instruction::GetElementPtr:
6069 // Always hoist the base address of a GetElementPtr. This prevents the
6070 // creation of new constants for every base constant that gets constant
6071 // folded with the offset.
6072 if (Idx == 0)
6073 return 2 * TTI::TCC_Basic;
6074 return TTI::TCC_Free;
6075 case Instruction::Store:
6076 ImmIdx = 0;
6077 break;
6078 case Instruction::ICmp:
6079 // This is an imperfect hack to prevent constant hoisting of
6080 // compares that might be trying to check if a 64-bit value fits in
6081 // 32-bits. The backend can optimize these cases using a right shift by 32.
6082 // There are other predicates and immediates the backend can use shifts for.
6083 if (Idx == 1 && ImmBitWidth == 64) {
6084 uint64_t ImmVal = Imm.getZExtValue();
6085 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6086 return TTI::TCC_Free;
6087
6088 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6089 if (Cmp->isEquality()) {
6090 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6091 if (Known.countMinTrailingZeros() >= 32)
6092 return TTI::TCC_Free;
6093 }
6094 }
6095 }
6096 ImmIdx = 1;
6097 break;
6098 case Instruction::And:
6099 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6100 // by using a 32-bit operation with implicit zero extension. Detect such
6101 // immediates here as the normal path expects bit 31 to be sign extended.
6102 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6103 return TTI::TCC_Free;
6104 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6105 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6106 Imm.isMask())
6107 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6108 ImmIdx = 1;
6109 break;
6110 case Instruction::Add:
6111 case Instruction::Sub:
6112 // For add/sub, we can use the opposite instruction for INT32_MIN.
6113 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6114 return TTI::TCC_Free;
6115 ImmIdx = 1;
6116 break;
6117 case Instruction::UDiv:
6118 case Instruction::SDiv:
6119 case Instruction::URem:
6120 case Instruction::SRem:
6121 // Division by constant is typically expanded later into a different
6122 // instruction sequence. This completely changes the constants.
6123 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6124 return TTI::TCC_Free;
6125 case Instruction::Mul:
6126 case Instruction::Or:
6127 case Instruction::Xor:
6128 ImmIdx = 1;
6129 break;
6130 // Always return TCC_Free for the shift value of a shift instruction.
6131 case Instruction::Shl:
6132 case Instruction::LShr:
6133 case Instruction::AShr:
6134 if (Idx == 1)
6135 return TTI::TCC_Free;
6136 break;
6137 case Instruction::Trunc:
6138 case Instruction::ZExt:
6139 case Instruction::SExt:
6140 case Instruction::IntToPtr:
6141 case Instruction::PtrToInt:
6142 case Instruction::BitCast:
6143 case Instruction::PHI:
6144 case Instruction::Call:
6145 case Instruction::Select:
6146 case Instruction::Ret:
6147 case Instruction::Load:
6148 break;
6149 }
6150
6151 if (Idx == ImmIdx) {
6152 uint64_t NumConstants = divideCeil(BitSize, 64);
6154 return (Cost <= NumConstants * TTI::TCC_Basic)
6155 ? static_cast<int>(TTI::TCC_Free)
6156 : Cost;
6157 }
6158
6159 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6160}
6161
// Decide whether an immediate operand of an intrinsic call should be
// hoisted: overflow intrinsics encode signed 32-bit immediates directly,
// stackmap/patchpoint accept 64-bit immediates (and their leading ID/shadow
// operands are always free); everything else falls back to the plain
// immediate materialization cost.
// NOTE(review): the opening signature lines were lost in doc extraction.
6164 const APInt &Imm, Type *Ty,
6166 assert(Ty->isIntegerTy());
6167
6168 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6169 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6170 // here, so that constant hoisting will ignore this constant.
6171 if (BitSize == 0)
6172 return TTI::TCC_Free;
6173
6174 switch (IID) {
6175 default:
6176 return TTI::TCC_Free;
6177 case Intrinsic::sadd_with_overflow:
6178 case Intrinsic::uadd_with_overflow:
6179 case Intrinsic::ssub_with_overflow:
6180 case Intrinsic::usub_with_overflow:
6181 case Intrinsic::smul_with_overflow:
6182 case Intrinsic::umul_with_overflow:
6183 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6184 return TTI::TCC_Free;
6185 break;
6186 case Intrinsic::experimental_stackmap:
6187 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6188 return TTI::TCC_Free;
6189 break;
6190 case Intrinsic::experimental_patchpoint_void:
6191 case Intrinsic::experimental_patchpoint:
6192 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6193 return TTI::TCC_Free;
6194 break;
6195 }
6196 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6197}
6198
// Cost of a control-flow instruction: PHIs are free, other CF instructions
// one basic unit (first return); branches are otherwise assumed predicted
// and therefore free.
// NOTE(review): the CostKind guard between the two returns (original line
// 6202) was lost in doc extraction — as shown the second return looks
// unreachable; verify against upstream.
6201 const Instruction *I) const {
6203 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6204 // Branches are assumed to be predicted.
6205 return TTI::TCC_Free;
6206}
6207
6208int X86TTIImpl::getGatherOverhead() const {
6209 // Some CPUs have more overhead for gather. The specified overhead is relative
6210 // to the Load operation. "2" is the number provided by Intel architects. This
6211 // parameter is used for cost estimation of Gather Op and comparison with
6212 // other alternatives.
6213 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6214 // enable gather with a -march.
6215 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6216 return 2;
6217
6218 return 1024;
6219}
6220
6221int X86TTIImpl::getScatterOverhead() const {
6222 if (ST->hasAVX512())
6223 return 2;
6224
6225 return 1024;
6226}
6227
6228// Return an average cost of Gather / Scatter instruction, maybe improved later.
// NOTE(review): one wrapped signature line (file line 6230, presumably the
// CostKind parameter) is not visible in this chunk.
6229InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6231                                            Type *SrcVTy, const Value *Ptr,
6232                                            Align Alignment,
6233                                            unsigned AddressSpace) const {
6234
6235  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6236  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6237
6238  // Try to reduce index size from 64 bit (default for GEP)
6239  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6240  // operation will use 16 x 64 indices which do not fit in a zmm and needs
6241  // to split. Also check that the base pointer is the same for all lanes,
6242  // and that there's at most one variable index.
6243  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6244    unsigned IndexSize = DL.getPointerSizeInBits();
6245    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6246    if (IndexSize < 64 || !GEP)
6247      return IndexSize;
6248
6249    unsigned NumOfVarIndices = 0;
6250    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct base pointers defeats the index-size reduction.
6251    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6252      return IndexSize;
6253    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6254      if (isa<Constant>(GEP->getOperand(I)))
6255        continue;
6256      Type *IndxTy = GEP->getOperand(I)->getType();
6257      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6258        IndxTy = IndexVTy->getElementType();
      // A genuinely-64-bit (non-sign-extended) index, or a second variable
      // index, forces the full 64-bit index width.
6259      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6260           !isa<SExtInst>(GEP->getOperand(I))) ||
6261          ++NumOfVarIndices > 1)
6262        return IndexSize; // 64
6263    }
6264    return (unsigned)32;
6265  };
6266
6267  // Trying to reduce IndexSize to 32 bits for vector 16.
6268  // By default the IndexSize is equal to pointer size.
6269  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6270                           ? getIndexSizeInBits(Ptr, DL)
6271                           : DL.getPointerSizeInBits();
6272
6273  auto *IndexVTy = FixedVectorType::get(
6274      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6275  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6276  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6277  InstructionCost::CostType SplitFactor =
6278      std::max(IdxsLT.first, SrcLT.first).getValue();
6279  if (SplitFactor > 1) {
6280    // Handle splitting of vector of pointers
    // Recurse on the narrower per-split type and scale by the split count.
6281    auto *SplitSrcTy =
6282        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6283    return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6284                                         Alignment, AddressSpace);
6285  }
6286
6287  // If we didn't split, this will be a single gather/scatter instruction.
  // NOTE(review): the guard condition above this early-out (file line 6288,
  // presumably a check for a size-like cost kind) is not visible here.
6289    return 1;
6290
6291  // The gather / scatter cost is given by Intel architects. It is a rough
6292  // number since we are looking at one instruction in a time.
6293  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6294                                                       : getScatterOverhead();
6295  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6296                                           Alignment, AddressSpace, CostKind);
6297}
6298
6299/// Calculate the cost of Gather / Scatter operation
// NOTE(review): the signature lines (file lines 6300-6302) and parts of the
// legality condition (lines 6311/6315, presumably forceScalarizeMaskedGather/
// Scatter calls, and the early-return on line 6317) are not visible in this
// chunk — confirm against the full source.
6303  bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
6304                MICA.getID() == Intrinsic::vp_gather;
6305  unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
6306  Type *SrcVTy = MICA.getDataType();
6307  const Value *Ptr = MICA.getPointer();
6308  Align Alignment = MICA.getAlignment();
  // If the operation is not legal (or must be scalarized), bail out early.
6309  if ((Opcode == Instruction::Load &&
6310       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6312                                    Align(Alignment)))) ||
6313      (Opcode == Instruction::Store &&
6314       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6316                                     Align(Alignment)))))
6318
6319  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  // Determine the address space from the (possibly vector-of-)pointer type.
6320  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6321  if (!PtrTy && Ptr->getType()->isVectorTy())
6322    PtrTy = dyn_cast<PointerType>(
6323        cast<VectorType>(Ptr->getType())->getElementType());
6324  assert(PtrTy && "Unexpected type for Ptr argument");
6325  unsigned AddressSpace = PtrTy->getAddressSpace();
6326  return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6327                         AddressSpace);
6328}
6329
// Lexicographic comparison of two LSR cost candidates, instruction count
// first (X86-specific priority), then register pressure and the remaining
// tie-breakers.
// NOTE(review): the first signature line (file line 6330) is not visible.
6331                                 const TargetTransformInfo::LSRCost &C2) const {
6332  // X86 specific here are "instruction number 1st priority".
6333  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6334                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6335         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6336                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6337}
6338
// True when the subtarget can fuse a compare with an adjacent branch
// (macro-fusion or branch-fusion).
// NOTE(review): the signature line (file line 6339) is not visible in this
// chunk — confirm the function name against the header.
6340  return ST->hasMacroFusion() || ST->hasBranchFusion();
6341}
6342
6343static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6344 if (!ST->hasAVX())
6345 return false;
6346
6347 if (ScalarTy->isPointerTy())
6348 return true;
6349
6350 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6351 return true;
6352
6353 if (ScalarTy->isHalfTy() && ST->hasBWI())
6354 return true;
6355
6356 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6357 return true;
6358
6359 if (!ScalarTy->isIntegerTy())
6360 return false;
6361
6362 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6363 return IntWidth == 32 || IntWidth == 64 ||
6364 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6365}
6366
// Legality of a masked vector load of DataTy.
// NOTE(review): the first signature line (file line 6367) is not visible.
6368                                   unsigned AddressSpace,
6369                                   TTI::MaskKind MaskKind) const {
6370  Type *ScalarTy = DataTy->getScalarType();
6371
6372  // The backend can't handle a single element vector w/o CFCMOV.
6373  if (isa<VectorType>(DataTy) &&
6374      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6375    return ST->hasCF() &&
6376           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6377
  // Otherwise defer to the shared element-type/feature check.
6378  return isLegalMaskedLoadStore(ScalarTy, ST);
6379}
6380
// Legality of a masked vector store of DataTy; mirrors isLegalMaskedLoad.
// NOTE(review): the first signature line (file line 6381) is not visible.
6382                                    unsigned AddressSpace,
6383                                    TTI::MaskKind MaskKind) const {
6384  Type *ScalarTy = DataTy->getScalarType();
6385
6386  // The backend can't handle a single element vector w/o CFCMOV.
6387  if (isa<VectorType>(DataTy) &&
6388      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6389    return ST->hasCF() &&
6390           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6391
  // Otherwise defer to the shared element-type/feature check.
6392  return isLegalMaskedLoadStore(ScalarTy, ST);
6393}
6394
6395bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6396 unsigned DataSize = DL.getTypeStoreSize(DataType);
6397 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6398 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6399 // (the equivalent stores only require AVX).
6400 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6401 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6402
6403 return false;
6404}
6405
// Legality of a nontemporal store of DataType at the given alignment.
6406bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6407  unsigned DataSize = DL.getTypeStoreSize(DataType);
6408
6409  // SSE4A supports nontemporal stores of float and double at arbitrary
6410  // alignment.
6411  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6412    return true;
6413
6414  // Besides the SSE4A subtarget exception above, only aligned stores are
6415  // available nontemporaly on any other subtarget. And only stores with a size
6416  // of 4..32 bytes (powers of 2, only) are permitted.
  // NOTE(review): the continuation of this condition (file line 6418) is not
  // visible; per the comment above it presumably rejects non-power-of-2 sizes.
6417  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6419    return false;
6420
6421  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6422  // loads require AVX2).
6423  if (DataSize == 32)
6424    return ST->hasAVX();
6425  if (DataSize == 16)
6426    return ST->hasSSE1();
6427  return true;
6428}
6429
// Broadcast-load is only legal as a fixed 2 x double splat (SSE3 movddup).
// NOTE(review): the first signature line (file line 6430) is not visible.
6431                                      ElementCount NumElements) const {
6432  // movddup
6433  return ST->hasSSE3() && !NumElements.isScalable() &&
6434         NumElements.getFixedValue() == 2 &&
6435         ElementTy == Type::getDoubleTy(ElementTy->getContext());
6436}
6437
6438bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6439 if (!isa<VectorType>(DataTy))
6440 return false;
6441
6442 if (!ST->hasAVX512())
6443 return false;
6444
6445 // The backend can't handle a single element vector.
6446 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6447 return false;
6448
6449 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6450
6451 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6452 return true;
6453
6454 if (!ScalarTy->isIntegerTy())
6455 return false;
6456
6457 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6458 return IntWidth == 32 || IntWidth == 64 ||
6459 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6460}
6461
// Compress-store legality matches expand-load legality exactly.
// NOTE(review): the first signature line (file line 6462) is not visible.
6463                                            Align Alignment) const {
6464  return isLegalMaskedExpandLoad(DataTy, Alignment);
6465}
6466
6467bool X86TTIImpl::supportsGather() const {
6468 // Some CPUs have better gather performance than others.
6469 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6470 // enable gather with a -march.
6471 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6472}
6473
// Decide whether a masked gather of VTy should be scalarized instead of
// using the hardware gather instruction.
// NOTE(review): the first signature line (file line 6474) is not visible.
6475                                            Align Alignment) const {
6476  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
6477  // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
6478  // it to 8 elements, but zeroing upper bits of the mask vector will add more
6479  // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6480  // Check, maybe the gather/scatter instruction is better in the VariableMask
6481  // case.
6482  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6483  return NumElts == 1 ||
6484         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6485}
6486
// Shared element-type check for gather and scatter: pointers, f32/f64,
// i32 and i64 elements are supported.
// NOTE(review): the first signature line (file line 6487) is not visible.
6488                                            Align Alignment) const {
6489  Type *ScalarTy = DataTy->getScalarType();
6490  if (ScalarTy->isPointerTy())
6491    return true;
6492
6493  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6494    return true;
6495
6496  if (!ScalarTy->isIntegerTy())
6497    return false;
6498
6499  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6500  return IntWidth == 32 || IntWidth == 64;
6501}
6502
6503bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6504 if (!supportsGather() || !ST->preferGather())
6505 return false;
6506 return isLegalMaskedGatherScatter(DataTy, Alignment);
6507}
6508
6509bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6510 unsigned Opcode1,
6511 const SmallBitVector &OpcodeMask) const {
6512 // ADDSUBPS 4xf32 SSE3
6513 // VADDSUBPS 4xf32 AVX
6514 // VADDSUBPS 8xf32 AVX2
6515 // ADDSUBPD 2xf64 SSE3
6516 // VADDSUBPD 2xf64 AVX
6517 // VADDSUBPD 4xf64 AVX2
6518
6519 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6520 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6521 if (!isPowerOf2_32(NumElements))
6522 return false;
6523 // Check the opcode pattern. We apply the mask on the opcode arguments and
6524 // then check if it is what we expect.
6525 for (int Lane : seq<int>(0, NumElements)) {
6526 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6527 // We expect FSub for even lanes and FAdd for odd lanes.
6528 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6529 return false;
6530 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6531 return false;
6532 }
6533 // Now check that the pattern is supported by the target ISA.
6534 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6535 if (ElemTy->isFloatTy())
6536 return ST->hasSSE3() && NumElements % 4 == 0;
6537 if (ElemTy->isDoubleTy())
6538 return ST->hasSSE3() && NumElements % 2 == 0;
6539 return false;
6540}
6541
6542bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6543 // AVX2 doesn't support scatter
6544 if (!ST->hasAVX512() || !ST->preferScatter())
6545 return false;
6546 return isLegalMaskedGatherScatter(DataType, Alignment);
6547}
6548
6549bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6550 EVT VT = TLI->getValueType(DL, DataType);
6551 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6552}
6553
// NOTE(review): the signature lines (file lines 6553-6554) and the fallback
// return (file line 6560) are not visible in this chunk — presumably this is
// the speculation-cost predicate that defers to the base implementation for
// non-FDiv instructions; confirm against the full source.
6555  // FDIV is always expensive, even if it has a very low uop count.
6556  // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6557  if (I->getOpcode() == Instruction::FDiv)
6558    return true;
6559
6561}
6562
6563bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6564
// Inlining compatibility: the callee's target features must be a subset of
// the caller's, and inlining must not break the ABI of any vector/aggregate
// call inside the callee.
// NOTE(review): the first signature line (file line 6565) is not visible.
6566                                     const Function *Callee) const {
6567  const TargetMachine &TM = getTLI()->getTargetMachine();
6568
6569  // Work this as a subsetting of subtarget features.
6570  const FeatureBitset &CallerBits =
6571      TM.getSubtargetImpl(*Caller)->getFeatureBits();
6572  const FeatureBitset &CalleeBits =
6573      TM.getSubtargetImpl(*Callee)->getFeatureBits();
6574
6575  // Check whether features are the same (apart from the ignore list).
6576  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6577  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6578  if (RealCallerBits == RealCalleeBits)
6579    return true;
6580
6581  // If the features are a subset, we need to additionally check for calls
6582  // that may become ABI-incompatible as a result of inlining.
6583  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6584    return false;
6585
6586  for (const Instruction &I : instructions(Callee)) {
6587    if (const auto *CB = dyn_cast<CallBase>(&I)) {
6588      // Having more target features is fine for inline ASM and intrinsics.
6589      if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6590        continue;
6591
      // NOTE(review): the declaration of `Types` (file line 6592, presumably
      // a SmallVector<Type *>) is not visible in this chunk.
6593      for (Value *Arg : CB->args())
6594        Types.push_back(Arg->getType());
6595      if (!CB->getType()->isVoidTy())
6596        Types.push_back(CB->getType());
6597
6598      // Simple types are always ABI compatible.
6599      auto IsSimpleTy = [](Type *Ty) {
6600        return !Ty->isVectorTy() && !Ty->isAggregateType();
6601      };
6602      if (all_of(Types, IsSimpleTy))
6603        continue;
6604
6605      // Do a precise compatibility check.
6606      if (!areTypesABICompatible(Caller, Callee, Types))
6607        return false;
6608    }
6609  }
6610  return true;
6611}
6612
// ABI compatibility of the given types between caller and callee, refining
// the base check with X86's 512-bit-vector legality.
// NOTE(review): the first signature line (file line 6613) is not visible.
6614                                       const Function *Callee,
6615                                       ArrayRef<Type *> Types) const {
6616  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6617    return false;
6618
6619  // If we get here, we know the target features match. If one function
6620  // considers 512-bit vectors legal and the other does not, consider them
6621  // incompatible.
6622  const TargetMachine &TM = getTLI()->getTargetMachine();
6623
  // NOTE(review): the right-hand side of this comparison (file line 6625,
  // presumably the callee's useAVX512Regs()) is not visible in this chunk.
6624  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6626    return true;
6627
6628  // Consider the arguments compatible if they aren't vectors or aggregates.
6629  // FIXME: Look at the size of vectors.
6630  // FIXME: Look at the element types of aggregates to see if there are vectors.
6631  return llvm::none_of(Types,
6632      [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6633}
6634
// Configure memcmp expansion: allowed load sizes (largest first) and per-block
// load count for inline memcmp lowering.
// NOTE(review): the return-type line (file line 6635) and the declaration of
// `Options` (file line 6637) are not visible in this chunk.
6636X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6638  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6639  Options.NumLoadsPerBlock = 2;
6640  // All GPR and vector loads can be unaligned.
6641  Options.AllowOverlappingLoads = true;
6642  if (IsZeroCmp) {
6643    // Only enable vector loads for equality comparison. Right now the vector
6644    // version is not as fast for three way compare (see #33329).
6645    const unsigned PreferredWidth = ST->getPreferVectorWidth();
6646    if (PreferredWidth >= 512 && ST->hasAVX512())
6647      Options.LoadSizes.push_back(64);
6648    if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6649    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6650  }
6651  if (ST->is64Bit()) {
6652    Options.LoadSizes.push_back(8);
6653  }
6654  Options.LoadSizes.push_back(4);
6655  Options.LoadSizes.push_back(2);
6656  Options.LoadSizes.push_back(1);
6657  return Options;
6658}
6659
// NOTE(review): the signature line (file line 6660) is not visible in this
// chunk — presumably prefersVectorizedAddressing(); the body simply mirrors
// gather support. Confirm the name against the header.
6661  return supportsGather();
6662}
6663
// NOTE(review): the signature line (file line 6664) is not visible in this
// chunk; this predicate unconditionally answers false. Confirm which TTI hook
// this is against the header.
6665  return false;
6666}
6667
// NOTE(review): the signature line (file line 6668) is not visible in this
// chunk — presumably enableInterleavedAccessVectorization(); enabled on all
// subtargets except Atom.
6669  // TODO: We expect this to be beneficial regardless of arch,
6670  // but there are currently some unexplained performance artifacts on Atom.
6671  // As a temporary solution, disable on Atom.
6672  return !(ST->isAtom());
6673}
6674
6675// Get estimation for interleaved load/store operations and strided load.
6676// \p Indices contains indices for strided load.
6677// \p Factor - the factor of interleaving.
6678// AVX-512 provides 3-src shuffles that significantly reduces the cost.
// NOTE(review): the first signature line of this function (file line 6679,
// presumably `InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(`)
// is not visible in this chunk.
6680    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6681    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6682    TTI::TargetCostKind CostKind, bool UseMaskForCond,
6683    bool UseMaskForGaps) const {
6684  // VecTy for interleave memop is <VF*Factor x Elt>.
6685  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6686  // VecTy = <12 x i32>.
6687
6688  // Calculate the number of memory operations (NumOfMemOps), required
6689  // for load/store the VecTy.
6690  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6691  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6692  unsigned LegalVTSize = LegalVT.getStoreSize();
6693  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6694
6695  // Get the cost of one memory operation.
6696  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6697                                             LegalVT.getVectorNumElements());
6698  InstructionCost MemOpCost;
6699  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6700  if (UseMaskedMemOp) {
6701    unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
6702                                               : Intrinsic::masked_store;
6703    MemOpCost = getMaskedMemoryOpCost(
6704        {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
6705  } else
6706    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6707                                CostKind);
6708
6709  unsigned VF = VecTy->getNumElements() / Factor;
6710  MVT VT =
6711      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6712
6713  InstructionCost MaskCost;
6714  if (UseMaskedMemOp) {
    // Mark the lanes actually accessed by the requested indices; the rest
    // are gaps.
6715    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6716    for (unsigned Index : Indices) {
6717      assert(Index < Factor && "Invalid index for interleaved memory op");
6718      for (unsigned Elm = 0; Elm < VF; Elm++)
6719        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6720    }
6721
6722    Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6723
    // NOTE(review): the other arm of this ternary (file line 6727) is not
    // visible in this chunk.
6724    MaskCost = getReplicationShuffleCost(
6725        I1Type, Factor, VF,
6726        UseMaskForGaps ? DemandedLoadStoreElts
6728        CostKind);
6729
6730    // The Gaps mask is invariant and created outside the loop, therefore the
6731    // cost of creating it is not accounted for here. However if we have both
6732    // a MaskForGaps and some other mask that guards the execution of the
6733    // memory access, we need to account for the cost of And-ing the two masks
6734    // inside the loop.
6735    if (UseMaskForGaps) {
6736      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6737      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6738    }
6739  }
6740
6741  if (Opcode == Instruction::Load) {
6742    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6743    // contain the cost of the optimized shuffle sequence that the
6744    // X86InterleavedAccess pass will generate.
6745    // The cost of loads and stores are computed separately from the table.
6746
6747    // X86InterleavedAccess support only the following interleaved-access group.
6748    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6749        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6750        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6751        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6752    };
6753
6754    if (const auto *Entry =
6755            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6756      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6757    //If an entry does not exist, fallback to the default implementation.
6758
6759    // Kind of shuffle depends on number of loaded values.
6760    // If we load the entire data in one register, we can use a 1-src shuffle.
6761    // Otherwise, we'll merge 2 sources in each operation.
6762    TTI::ShuffleKind ShuffleKind =
6763        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6764
6765    InstructionCost ShuffleCost = getShuffleCost(
6766        ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6767
6768    unsigned NumOfLoadsInInterleaveGrp =
6769        Indices.size() ? Indices.size() : Factor;
6770    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6771                                          VecTy->getNumElements() / Factor);
6772    InstructionCost NumOfResults =
6773        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6774
6775    // About a half of the loads may be folded in shuffles when we have only
6776    // one result. If we have more than one result, or the loads are masked,
6777    // we do not fold loads at all.
6778    unsigned NumOfUnfoldedLoads =
6779        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6780
6781    // Get a number of shuffle operations per result.
6782    unsigned NumOfShufflesPerResult =
6783        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6784
6785    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6786    // When we have more than one destination, we need additional instructions
6787    // to keep sources.
6788    InstructionCost NumOfMoves = 0;
6789    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6790      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6791
6792    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6793                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
6794                           NumOfMoves;
6795
6796    return Cost;
6797  }
6798
6799  // Store.
6800  assert(Opcode == Instruction::Store &&
6801         "Expected Store Instruction at this point");
6802  // X86InterleavedAccess support only the following interleaved-access group.
6803  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6804      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6805      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6806      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6807
6808      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
6809      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
6810      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6811      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
6812  };
6813
6814  if (const auto *Entry =
6815          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6816    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6817  //If an entry does not exist, fallback to the default implementation.
6818
6819  // There is no strided stores meanwhile. And store can't be folded in
6820  // shuffle.
6821  unsigned NumOfSources = Factor; // The number of values to be merged.
6822  InstructionCost ShuffleCost =
6823      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6824                     CostKind, 0, nullptr);
6825  unsigned NumOfShufflesPerStore = NumOfSources - 1;
6826
6827  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
6828  // We need additional instructions to keep sources.
6829  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  // NOTE(review): the start of this expression (file line 6830, presumably
  // `InstructionCost Cost =`) is not visible in this chunk.
6831      MaskCost +
6832      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6833      NumOfMoves;
6834  return Cost;
6835}
6836
6838 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6839 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6840 bool UseMaskForCond, bool UseMaskForGaps) const {
6841 auto *VecTy = cast<FixedVectorType>(BaseTy);
6842
6843 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6844 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6845 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6846 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6847 return true;
6848 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6849 return ST->hasBWI();
6850 if (EltTy->isBFloatTy())
6851 return ST->hasBF16();
6852 return false;
6853 };
6854 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6856 Opcode, VecTy, Factor, Indices, Alignment,
6857 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6858
6859 if (UseMaskForCond || UseMaskForGaps)
6860 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6861 Alignment, AddressSpace, CostKind,
6862 UseMaskForCond, UseMaskForGaps);
6863
6864 // Get estimation for interleaved load/store operations for SSE-AVX2.
6865 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6866 // computing the cost using a generic formula as a function of generic
6867 // shuffles. We therefore use a lookup table instead, filled according to
6868 // the instruction sequences that codegen currently generates.
6869
6870 // VecTy for interleave memop is <VF*Factor x Elt>.
6871 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6872 // VecTy = <12 x i32>.
6873 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6874
6875 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6876 // the VF=2, while v2i128 is an unsupported MVT vector type
6877 // (see MachineValueType.h::getVectorVT()).
6878 if (!LegalVT.isVector())
6879 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6880 Alignment, AddressSpace, CostKind);
6881
6882 unsigned VF = VecTy->getNumElements() / Factor;
6883 Type *ScalarTy = VecTy->getElementType();
6884 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6885 if (!ScalarTy->isIntegerTy())
6886 ScalarTy =
6887 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6888
6889 // Get the cost of all the memory operations.
6890 // FIXME: discount dead loads.
6891 InstructionCost MemOpCosts =
6892 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6893
6894 auto *VT = FixedVectorType::get(ScalarTy, VF);
6895 EVT ETy = TLI->getValueType(DL, VT);
6896 if (!ETy.isSimple())
6897 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6898 Alignment, AddressSpace, CostKind);
6899
6900 // TODO: Complete for other data-types and strides.
6901 // Each combination of Stride, element bit width and VF results in a different
6902 // sequence; The cost tables are therefore accessed with:
6903 // Factor (stride) and VectorType=VFxiN.
6904 // The Cost accounts only for the shuffle sequence;
6905 // The cost of the loads/stores is accounted for separately.
6906 //
6907 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6908 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6909 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6910 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6911 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6912 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6913
6914 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6915 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6916 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6917
6918 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6919 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6920 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6921
6922 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6923 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6924 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6925 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6926
6927 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6928 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6929 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6930 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6931 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6932
6933 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6934 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6935 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6936 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6937 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6938
6939 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6940 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6941 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6942 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6943 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6944
6945 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6946 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6947 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6948 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6949
6950 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6951 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6952 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6953 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6954 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6955
6956 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6957 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6958 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6959 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6960 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6961
6962 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6963 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6964 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6965 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6966 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6967
6968 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6969 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6970 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6971 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6972
6973 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6974 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6975 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6976 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6977 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6978
6979 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6980 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6981 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6982 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6983 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6984
6985 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6986 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6987 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6988 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6989
6990 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6991 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6992 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6993
6994 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6995 };
6996
6997 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6998 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6999 };
7000
7001 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7002 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7003 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7004
7005 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7006 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7007
7008 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7009 };
7010
7011 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7012 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7013 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7014
7015 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7016 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7017 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7018
7019 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7020 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7021 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7022 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7023
7024 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7025 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7026 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7027 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7028 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7029
7030 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7031 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7032 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7033 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7034 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7035
7036 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7037 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7038 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7039 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7040 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7041
7042 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7043 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7044 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7045 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7046 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7047
7048 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7049 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7050 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7051 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7052
7053 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7054 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7055 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7056 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7057 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7058
7059 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7060 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7061 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7062 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7063 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7064
7065 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7066 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7067 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7068 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7069 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7070
7071 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7072 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7073 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7074 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7075
7076 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7077 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7078 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7079 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7080 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7081
7082 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7083 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7084 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7085 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7086 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7087
7088 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7089 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7090 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7091 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7092
7093 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7094 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7095 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7096 };
7097
7098 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7099 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7100 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7101 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7102
7103 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7104 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7105
7106 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7107 };
7108
7109 if (Opcode == Instruction::Load) {
7110 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7111 MemOpCosts](const CostTblEntry *Entry) {
7112 // NOTE: this is just an approximation!
7113 // It can over/under -estimate the cost!
7114 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7115 };
7116
7117 if (ST->hasAVX2())
7118 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7119 ETy.getSimpleVT()))
7120 return GetDiscountedCost(Entry);
7121
7122 if (ST->hasSSSE3())
7123 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7124 ETy.getSimpleVT()))
7125 return GetDiscountedCost(Entry);
7126
7127 if (ST->hasSSE2())
7128 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7129 ETy.getSimpleVT()))
7130 return GetDiscountedCost(Entry);
7131 } else {
7132 assert(Opcode == Instruction::Store &&
7133 "Expected Store Instruction at this point");
7134 assert((!Indices.size() || Indices.size() == Factor) &&
7135 "Interleaved store only supports fully-interleaved groups.");
7136 if (ST->hasAVX2())
7137 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7138 ETy.getSimpleVT()))
7139 return MemOpCosts + Entry->Cost;
7140
7141 if (ST->hasSSE2())
7142 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7143 ETy.getSimpleVT()))
7144 return MemOpCosts + Entry->Cost;
7145 }
7146
7147 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7148 Alignment, AddressSpace, CostKind,
7149 UseMaskForCond, UseMaskForGaps);
7150}
7151
7153 StackOffset BaseOffset,
7154 bool HasBaseReg, int64_t Scale,
7155 unsigned AddrSpace) const {
7156 // Scaling factors are not free at all.
7157 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7158 // will take 2 allocations in the out of order engine instead of 1
7159 // for plain addressing mode, i.e. inst (reg1).
7160 // E.g.,
7161 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7162 // Requires two allocations (one for the load, one for the computation)
7163 // whereas:
7164 // vaddps (%rsi), %ymm0, %ymm1
7165 // Requires just 1 allocation, i.e., freeing allocations for other operations
7166 // and having less micro operations to execute.
7167 //
7168 // For some X86 architectures, this is even worse because for instance for
7169 // stores, the complex addressing mode forces the instruction to use the
7170 // "load" ports instead of the dedicated "store" port.
7171 // E.g., on Haswell:
7172 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7173 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7175 AM.BaseGV = BaseGV;
7176 AM.BaseOffs = BaseOffset.getFixed();
7177 AM.HasBaseReg = HasBaseReg;
7178 AM.Scale = Scale;
7179 AM.ScalableOffset = BaseOffset.getScalable();
7180 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7181 // Scale represents reg2 * scale, thus account for 1
7182 // as soon as we use a second register.
7183 return AM.Scale != 0;
7185}
7186
7188 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7189 return 14;
7190}
7191
7193 unsigned Bits = Ty->getScalarSizeInBits();
7194
7195 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7196 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7197 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7198 return false;
7199
7200 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7201 // shifts just as cheap as scalar ones.
7202 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7203 return false;
7204
7205 // AVX512BW has shifts such as vpsllvw.
7206 if (ST->hasBWI() && Bits == 16)
7207 return false;
7208
7209 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7210 // fully general vector.
7211 return true;
7212}
7213
7214unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7215 Type *ScalarValTy) const {
7216 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7217 return 4;
7218 }
7219 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7220}
7221
7223 SmallVectorImpl<Use *> &Ops) const {
7224 using namespace llvm::PatternMatch;
7225
7226 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7227 if (!VTy)
7228 return false;
7229
7230 if (I->getOpcode() == Instruction::Mul &&
7231 VTy->getElementType()->isIntegerTy(64)) {
7232 for (auto &Op : I->operands()) {
7233 // Make sure we are not already sinking this operand
7234 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7235 continue;
7236
7237 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7238 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7239 if (ST->hasSSE41() &&
7240 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7241 m_SpecificInt(32)))) {
7242 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7243 Ops.push_back(&Op);
7244 } else if (ST->hasSSE2() &&
7245 match(Op.get(),
7246 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7247 Ops.push_back(&Op);
7248 }
7249 }
7250
7251 return !Ops.empty();
7252 }
7253
7254 // A uniform shift amount in a vector shift or funnel shift may be much
7255 // cheaper than a generic variable vector shift, so make that pattern visible
7256 // to SDAG by sinking the shuffle instruction next to the shift.
7257 int ShiftAmountOpNum = -1;
7258 if (I->isShift())
7259 ShiftAmountOpNum = 1;
7260 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7261 if (II->getIntrinsicID() == Intrinsic::fshl ||
7262 II->getIntrinsicID() == Intrinsic::fshr)
7263 ShiftAmountOpNum = 2;
7264 }
7265
7266 if (ShiftAmountOpNum == -1)
7267 return false;
7268
7269 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7270 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7271 isVectorShiftByScalarCheap(I->getType())) {
7272 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7273 return true;
7274 }
7275
7276 return false;
7277}
7278
7280 bool HasEGPR = ST->hasEGPR();
7281 const TargetMachine &TM = getTLI()->getTargetMachine();
7282
7283 for (User *U : F.users()) {
7285 if (!CB || CB->getCalledOperand() != &F)
7286 continue;
7287 Function *CallerFunc = CB->getFunction();
7288 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7289 return false;
7290 }
7291
7292 return true;
7293}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:381
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3020
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:258
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55