LLVM 23.0.0git
X86TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15/// About Cost Model numbers used below it's necessary to say the following:
16/// the numbers correspond to some "generic" X86 CPU instead of usage of a
17/// specific CPU model. Usually the numbers correspond to the CPU where the
18/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
19/// the lookups below the cost is based on Nehalem as that was the first CPU
20/// to support that feature level and thus has most likely the worst case cost,
21/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
22///
23/// Some examples of other technologies/CPUs:
24/// SSE 3 - Pentium4 / Athlon64
25/// SSE 4.1 - Penryn
26/// SSE 4.2 - Nehalem / Silvermont
27/// AVX - Sandy Bridge / Jaguar / Bulldozer
28/// AVX2 - Haswell / Ryzen
29/// AVX-512 - Xeon Phi / Skylake
30///
31/// And some examples of instruction target dependent costs (latency)
32/// divss sqrtss rsqrtss
33/// AMD K7 11-16 19 3
34/// Piledriver 9-24 13-15 5
35/// Jaguar 14 16 2
36/// Pentium II,III 18 30 2
37/// Nehalem 7-14 7-18 3
38/// Haswell 10-13 11 5
39///
40/// Interpreting the 4 TargetCostKind types:
41/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
42/// values reported by the CPU scheduler models (and llvm-mca).
43/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
44/// actual encoding size of the instruction.
45/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
46/// the CPU scheduler models (and llvm-mca), to ensure that they are
47/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
48/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
49//===----------------------------------------------------------------------===//
50
59#include <optional>
60
61using namespace llvm;
62
63#define DEBUG_TYPE "x86tti"
64
65//===----------------------------------------------------------------------===//
66//
67// X86 cost model.
68//
69//===----------------------------------------------------------------------===//
70
71// Helper struct to store/access costs for each cost kind.
72// TODO: Move this to allow other targets to use it?
74 unsigned RecipThroughputCost = ~0U;
75 unsigned LatencyCost = ~0U;
76 unsigned CodeSizeCost = ~0U;
77 unsigned SizeAndLatencyCost = ~0U;
78
79 std::optional<unsigned>
81 unsigned Cost = ~0U;
82 switch (Kind) {
85 break;
88 break;
91 break;
94 break;
95 }
96 if (Cost == ~0U)
97 return std::nullopt;
98 return Cost;
99 }
100};
103
105X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
106 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
107 // TODO: Currently the __builtin_popcount() implementation using SSE3
108 // instructions is inefficient. Once the problem is fixed, we should
109 // call ST->hasSSE3() instead of ST->hasPOPCNT().
110 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
111}
112
113std::optional<unsigned> X86TTIImpl::getCacheSize(
115 switch (Level) {
117 // - Penryn
118 // - Nehalem
119 // - Westmere
120 // - Sandy Bridge
121 // - Ivy Bridge
122 // - Haswell
123 // - Broadwell
124 // - Skylake
125 // - Kabylake
126 return 32 * 1024; // 32 KiB
128 // - Penryn
129 // - Nehalem
130 // - Westmere
131 // - Sandy Bridge
132 // - Ivy Bridge
133 // - Haswell
134 // - Broadwell
135 // - Skylake
136 // - Kabylake
137 return 256 * 1024; // 256 KiB
138 }
139
140 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
141}
142
143std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
145 // - Penryn
146 // - Nehalem
147 // - Westmere
148 // - Sandy Bridge
149 // - Ivy Bridge
150 // - Haswell
151 // - Broadwell
152 // - Skylake
153 // - Kabylake
154 switch (Level) {
156 [[fallthrough]];
158 return 8;
159 }
160
161 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
162}
163
165
167 return Vector ? VectorClass
168 : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
169 : GPRClass;
170}
171
172unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
173 if (ClassID == VectorClass && !ST->hasSSE1())
174 return 0;
175
176 if (!ST->is64Bit())
177 return 8;
178
179 if ((ClassID == GPRClass && ST->hasEGPR()) ||
180 (ClassID != GPRClass && ST->hasAVX512()))
181 return 32;
182
183 return 16;
184}
185
187 if (!ST->hasCF())
188 return false;
189 if (!Ty)
190 return true;
191 // Conditional faulting is supported by CFCMOV, which only accepts
192 // 16/32/64-bit operands.
193 // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
194 // profitable.
195 auto *VTy = dyn_cast<FixedVectorType>(Ty);
196 if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
197 return false;
198 auto *ScalarTy = Ty->getScalarType();
199 switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
200 default:
201 return false;
202 case 16:
203 case 32:
204 case 64:
205 return true;
206 }
207}
208
211 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
212 switch (K) {
214 return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
216 if (ST->hasAVX512() && PreferVectorWidth >= 512)
217 return TypeSize::getFixed(512);
218 if (ST->hasAVX() && PreferVectorWidth >= 256)
219 return TypeSize::getFixed(256);
220 if (ST->hasSSE1() && PreferVectorWidth >= 128)
221 return TypeSize::getFixed(128);
222 return TypeSize::getFixed(0);
224 return TypeSize::getScalable(0);
225 }
226
227 llvm_unreachable("Unsupported register kind");
228}
229
234
236 // If the loop will not be vectorized, don't interleave the loop.
237 // Let regular unroll to unroll the loop, which saves the overflow
238 // check and memory check cost.
239 if (VF.isScalar())
240 return 1;
241
242 if (ST->isAtom())
243 return 1;
244
245 // Sandybridge and Haswell have multiple execution ports and pipelined
246 // vector units.
247 if (ST->hasAVX())
248 return 4;
249
250 return 2;
251}
252
254 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
256 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
257
258 // vXi8 multiplications are always promoted to vXi16.
259 // Sub-128-bit types can be extended/packed more efficiently.
260 if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
261 Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
262 Type *WideVecTy =
264 return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
266 CostKind) +
267 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
269 CostKind) +
270 getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
271 }
272
273 // Legalize the type.
274 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
275
276 int ISD = TLI->InstructionOpcodeToISD(Opcode);
277 assert(ISD && "Invalid opcode");
278
279 if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
280 (LT.second.getScalarType() == MVT::i32 ||
281 LT.second.getScalarType() == MVT::i64)) {
282 // Check if the operands can be represented as a smaller datatype.
283 bool Op1Signed = false, Op2Signed = false;
284 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
285 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
286 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
287 bool SignedMode = Op1Signed || Op2Signed;
288
289 // If both vXi32 are representable as i15 and at least one is constant,
290 // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
291 // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
292 if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
293 LT.second.getScalarType() == MVT::i32) {
294 bool Op1Constant =
295 isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
296 bool Op2Constant =
297 isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
298 bool Op1Sext = isa<SExtInst>(Args[0]) &&
299 (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
300 bool Op2Sext = isa<SExtInst>(Args[1]) &&
301 (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
302
303 bool IsZeroExtended = !Op1Signed || !Op2Signed;
304 bool IsConstant = Op1Constant || Op2Constant;
305 bool IsSext = Op1Sext || Op2Sext;
306 if (IsConstant || IsZeroExtended || IsSext)
307 LT.second =
308 MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
309 }
310
311 // Check if the vXi32 operands can be shrunk into a smaller datatype.
312 // This should match the codegen from reduceVMULWidth.
313 // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
314 if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
315 if (OpMinSize <= 7)
316 return LT.first * 3; // pmullw/sext
317 if (!SignedMode && OpMinSize <= 8)
318 return LT.first * 3; // pmullw/zext
319 if (OpMinSize <= 15)
320 return LT.first * 5; // pmullw/pmulhw/pshuf
321 if (!SignedMode && OpMinSize <= 16)
322 return LT.first * 5; // pmullw/pmulhw/pshuf
323 }
324
325 // If both vXi64 are representable as (unsigned) i32, then we can perform
326 // the multiple with a single PMULUDQ instruction.
327 // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
328 if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
330 }
331
332 // Vector multiply by pow2 will be simplified to shifts.
333 // Vector multiply by -pow2 will be simplified to shifts/negates.
334 if (ISD == ISD::MUL && Op2Info.isConstant() &&
335 (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
337 getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
338 Op1Info.getNoProps(), Op2Info.getNoProps());
339 if (Op2Info.isNegatedPowerOf2())
340 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
341 return Cost;
342 }
343
344 // On X86, vector signed division by constants power-of-two are
345 // normally expanded to the sequence SRA + SRL + ADD + SRA.
346 // The OperandValue properties may not be the same as that of the previous
347 // operation; conservatively assume OP_None.
348 if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
349 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
351 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
352 Op1Info.getNoProps(), Op2Info.getNoProps());
353 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
354 Op1Info.getNoProps(), Op2Info.getNoProps());
355 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
356 Op1Info.getNoProps(), Op2Info.getNoProps());
357
358 if (ISD == ISD::SREM) {
359 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
360 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
361 Op2Info.getNoProps());
362 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
363 Op2Info.getNoProps());
364 }
365
366 return Cost;
367 }
368
369 // Vector unsigned division/remainder will be simplified to shifts/masks.
370 if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
371 Op2Info.isConstant() && Op2Info.isPowerOf2()) {
372 if (ISD == ISD::UDIV)
373 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
374 Op1Info.getNoProps(), Op2Info.getNoProps());
375 // UREM
376 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
377 Op1Info.getNoProps(), Op2Info.getNoProps());
378 }
379
380 static const CostKindTblEntry GFNIUniformConstCostTable[] = {
381 { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
382 { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
383 { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
384 { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
385 { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
386 { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
387 { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
388 { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
389 { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
390 };
391
392 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
393 if (const auto *Entry =
394 CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
395 if (auto KindCost = Entry->Cost[CostKind])
396 return LT.first * *KindCost;
397
398 static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
399 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
400 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
401 { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
402 { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
403 { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
404 { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
405 { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
406 { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
407 { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
408
409 { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
410 { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
411 { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
412 { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
413 { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
414 { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
415 };
416
417 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
418 if (const auto *Entry =
419 CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
420 if (auto KindCost = Entry->Cost[CostKind])
421 return LT.first * *KindCost;
422
423 static const CostKindTblEntry AVX512UniformConstCostTable[] = {
424 { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
425 { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
426 { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
427
428 { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
429 { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
430 { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
431
432 { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
433 { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
434 { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
435 { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
436 { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
437 { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
438
439 { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
440 { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
441 { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
442 { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
443 { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
444 { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
445 { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
446
447 { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
448 { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
449 { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
450 { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
451 };
452
453 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
454 if (const auto *Entry =
455 CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
456 if (auto KindCost = Entry->Cost[CostKind])
457 return LT.first * *KindCost;
458
459 static const CostKindTblEntry AVX2UniformConstCostTable[] = {
460 { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
461 { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
462 { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
463 { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
464 { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
465 { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
466
467 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
468 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
469 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
470 { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
471 { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
472 { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
473
474 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
475 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
476 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
477 { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
478 { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
479 { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
480
481 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
482 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
483 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
484 { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
485 { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
486 { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
487
488 { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
489 { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
490 { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
491 { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
492 };
493
494 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
495 if (const auto *Entry =
496 CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
497 if (auto KindCost = Entry->Cost[CostKind])
498 return LT.first * *KindCost;
499
500 static const CostKindTblEntry AVXUniformConstCostTable[] = {
501 { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
502 { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
503 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
504 { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
505 { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
506 { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
507
508 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
509 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
510 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
511 { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
512 { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
513 { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
514
515 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
516 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
517 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
518 { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
519 { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
520 { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
521
522 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
523 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
524 { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
525 { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
526 { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
527 { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
528
529 { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
530 { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
531 { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
532 { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
533 };
534
535 // XOP has faster vXi8 shifts.
536 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
537 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
538 if (const auto *Entry =
539 CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
540 if (auto KindCost = Entry->Cost[CostKind])
541 return LT.first * *KindCost;
542
543 static const CostKindTblEntry SSE2UniformConstCostTable[] = {
544 { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
545 { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
546 { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
547
548 { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
549 { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
550 { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
551
552 { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
553 { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
554 { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
555
556 { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
557 { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
558 { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
559
560 { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
561 { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
562 { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
563 { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
564 };
565
566 // XOP has faster vXi8 shifts.
567 if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
568 (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
569 if (const auto *Entry =
570 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
571 if (auto KindCost = Entry->Cost[CostKind])
572 return LT.first * *KindCost;
573
574 static const CostKindTblEntry AVX512BWConstCostTable[] = {
575 { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
576 { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
577 { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
578 { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
579
580 { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
581 { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
582 { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
583 { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
584 };
585
586 if (Op2Info.isConstant() && ST->hasBWI())
587 if (const auto *Entry =
588 CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
589 if (auto KindCost = Entry->Cost[CostKind])
590 return LT.first * *KindCost;
591
592 static const CostKindTblEntry AVX512ConstCostTable[] = {
593 { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
594 { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
595 { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
596 { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
597
598 { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
599 { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
600 { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
601 { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
602
603 { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
604 { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
605 { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
606 { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
607 };
608
609 if (Op2Info.isConstant() && ST->hasAVX512())
610 if (const auto *Entry =
611 CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
612 if (auto KindCost = Entry->Cost[CostKind])
613 return LT.first * *KindCost;
614
615 static const CostKindTblEntry AVX2ConstCostTable[] = {
616 { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
617 { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
618 { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
619 { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
620
621 { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
622 { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
623 { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
624 { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
625
626 { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
627 { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
628 { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
629 { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
630 };
631
632 if (Op2Info.isConstant() && ST->hasAVX2())
633 if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
634 if (auto KindCost = Entry->Cost[CostKind])
635 return LT.first * *KindCost;
636
637 static const CostKindTblEntry AVXConstCostTable[] = {
638 { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
639 { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
640 { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
641 { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
642
643 { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
644 { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
645 { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
646 { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
647
648 { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
649 { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
650 { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
651 { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
652 };
653
654 if (Op2Info.isConstant() && ST->hasAVX())
655 if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
656 if (auto KindCost = Entry->Cost[CostKind])
657 return LT.first * *KindCost;
658
659 static const CostKindTblEntry SSE41ConstCostTable[] = {
660 { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
661 { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
662 };
663
664 if (Op2Info.isConstant() && ST->hasSSE41())
665 if (const auto *Entry =
666 CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
667 if (auto KindCost = Entry->Cost[CostKind])
668 return LT.first * *KindCost;
669
670 static const CostKindTblEntry SSE2ConstCostTable[] = {
671 { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
672 { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
673 { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
674 { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
675
676 { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
677 { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
678 { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
679 { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
680
681 { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
682 { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
683 { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
684 { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
685 };
686
687 if (Op2Info.isConstant() && ST->hasSSE2())
688 if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
689 if (auto KindCost = Entry->Cost[CostKind])
690 return LT.first * *KindCost;
691
692 static const CostKindTblEntry AVX512BWUniformCostTable[] = {
693 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
694 { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
695 { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
696 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
697 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
698 { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
699 { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
700 { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
701 { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
702
703 { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
704 { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
705 { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw
706 };
707
708 if (ST->hasBWI() && Op2Info.isUniform())
709 if (const auto *Entry =
710 CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
711 if (auto KindCost = Entry->Cost[CostKind])
712 return LT.first * *KindCost;
713
714 static const CostKindTblEntry AVX512UniformCostTable[] = {
715 { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
716 { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
717 { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
718
719 { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
720 { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
721 { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
722
723 { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
724 { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
725 { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
726 { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
727 { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
728 { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
729 { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
730 };
731
732 if (ST->hasAVX512() && Op2Info.isUniform())
733 if (const auto *Entry =
734 CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
735 if (auto KindCost = Entry->Cost[CostKind])
736 return LT.first * *KindCost;
737
738 static const CostKindTblEntry AVX2UniformCostTable[] = {
739 // Uniform splats are cheaper for the following instructions.
740 { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
741 { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
742 { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
743 { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
744 { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
745 { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
746
747 { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
748 { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
749 { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
750 { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
751 { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
752 { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
753
754 { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
755 { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
756 { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
757 { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
758 { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
759 { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
760
761 { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
762 { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
763 { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
764 { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
765 { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
766 { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
767 };
768
769 if (ST->hasAVX2() && Op2Info.isUniform())
770 if (const auto *Entry =
771 CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
772 if (auto KindCost = Entry->Cost[CostKind])
773 return LT.first * *KindCost;
774
  // AVX1 costs for shifts by a uniform (splatted) amount. Each cost tuple is
  // indexed by TargetCostKind:
  // { recip-throughput, latency, code-size, size-and-latency }.
  // 256-bit types need a split into two 128-bit halves on AVX1.
  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts, so let the XOP table below handle the
  // 8-bit-element cases when XOP is available.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
812
  // SSE2 costs for shifts by a uniform (splatted) amount. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  // As above, XOP's native vXi8 shifts beat this table's 8-bit entries.
  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
838
  // AVX512DQ adds a native 64-bit element multiply (vpmullq), making i64
  // vector MUL cheap at every width. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
850
  // AVX512BW adds 16-bit variable shifts (vpsllvw/vpsrlvw/vpsravw) and
  // 512-bit byte/word arithmetic. vXi8 shifts are emulated by widening to
  // i16, shifting, then packing back down. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
899
  // Baseline AVX512F costs (without BWI/DQI, which were handled above):
  // i32/i64 variable shifts are native, but vXi8/vXi16 ops must be emulated
  // or split. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    // i32/i64 shifts are single native instructions at every width.
    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    // Without BWI, 512-bit byte/word add/sub must be split into 256-bit halves.
    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
993
  // AVX2 per-element variable shift costs (vpsllv*/vpsrlv*/vpsrav*).
  // Cost tuple: { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
1030
  // XOP per-element shift costs. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }
1072
  // Reclassify a shift-left by a non-uniform constant vector as a multiply:
  // lowering turns it into pmullw/pmulld, so the MUL entries in the tables
  // below give the right cost. (Uniform constants were handled above.)
  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
1081
  // Goldmont-specific divide/sqrt costs. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1093
  // Silvermont-specific arithmetic costs. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq\subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1122
  // General AVX2 costs (shift amounts here are non-uniform/non-constant —
  // the cheaper uniform/constant cases returned earlier). Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1197
  // AVX1 costs: 256-bit integer ops aren't native, so they're modeled as two
  // 128-bit halves plus an extract/insert. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split

    // 256-bit logic ops are available via the FP-domain vandps/vorps/vxorps.
    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1300
  // SSE4.2 (Nehalem-era) costs. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/

    { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1329
  // SSE4.1 (Penryn-era) costs: pblendvb enables cheaper variable-shift
  // emulation than SSE2, and pmulld provides v4i32 multiply. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.

    { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1352
  // SSSE3 adds pmaddubsw, which gives a cheaper v16i8 multiply sequence
  // than the plain SSE2 unpack/pmullw/pack lowering below. Cost tuple:
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SSSE3CostTable[] = {
    { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1361
  // Baseline SSE2 costs (Pentium 4 era) — the fallback before scalarization.
  // Cost tuple: { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
    { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
    { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
    { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
    { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.

    { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
    { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
    { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.

    { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
    { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
    { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
    { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand

    { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
    { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
    { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
    { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por

    { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
    { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
    { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
    { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor

    { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
    { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq

    { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
    { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
    { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
    { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },

    { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489 // It is not a good idea to vectorize division. We have to scalarize it and
 1490 // in the process we will often end up having to spill regular
1491 // registers. The overhead of division is going to dominate most kernels
1492 // anyways so try hard to prevent vectorization of division - it is
1493 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494 // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1516 break;
1517 }
1518 }
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1527 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1532}
1533
1535 VectorType *DstTy, VectorType *SrcTy,
1536 ArrayRef<int> Mask,
1538 int Index, VectorType *SubTp,
1540 const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
 1553 // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1565 CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 if (LT.second != MVT::v4f64 && LT.second != MVT::v4i64)
1570 Kind = TTI::SK_PermuteTwoSrc;
1571
1572 if (Kind == TTI::SK_Broadcast) {
1573 // For Broadcasts we are splatting the first element from the first input
1574 // register, so only need to reference that input and all the output
1575 // registers are the same.
1576 LT.first = 1;
1577
1578 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1579 using namespace PatternMatch;
1580 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1581 (ST->hasAVX2() ||
1582 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1583 return TTI::TCC_Free;
1584 }
1585
1586 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1587 // permutation.
1588 // Attempt to detect a shuffle mask with a single defined element.
1589 bool IsInLaneShuffle = false;
1590 bool IsSingleElementMask = false;
1591 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1592 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1593 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1594 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1595 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1596 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1597 if ((Mask.size() % NumLanes) == 0) {
1598 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1599 return P.value() == PoisonMaskElem ||
1600 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1601 (P.index() / NumEltsPerLane);
1602 });
1603 IsSingleElementMask =
1604 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1605 return M == PoisonMaskElem;
1606 }));
1607 }
1608 }
1609
1610 // Treat <X x bfloat> shuffles as <X x half>.
1611 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1612 LT.second = LT.second.changeVectorElementType(MVT::f16);
1613
1614 // Subvector extractions are free if they start at the beginning of a
1615 // vector and cheap if the subvectors are aligned.
1616 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1617 int NumElts = LT.second.getVectorNumElements();
1618 if ((Index % NumElts) == 0)
1619 return TTI::TCC_Free;
1620 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1621 if (SubLT.second.isVector()) {
1622 int NumSubElts = SubLT.second.getVectorNumElements();
1623 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1624 return SubLT.first;
1625 // Handle some cases for widening legalization. For now we only handle
1626 // cases where the original subvector was naturally aligned and evenly
1627 // fit in its legalized subvector type.
1628 // FIXME: Remove some of the alignment restrictions.
1629 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1630 // vectors.
1631 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1632 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1633 (NumSubElts % OrigSubElts) == 0 &&
1634 LT.second.getVectorElementType() ==
1635 SubLT.second.getVectorElementType() &&
1636 LT.second.getVectorElementType().getSizeInBits() ==
1637 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1638 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1639 "Unexpected number of elements!");
1640 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1641 LT.second.getVectorNumElements());
1642 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1643 SubLT.second.getVectorNumElements());
1644 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1645 InstructionCost ExtractCost =
1647 ExtractIndex, SubTy);
1648
1649 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1650 // if we have SSSE3 we can use pshufb.
1651 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1652 return ExtractCost + 1; // pshufd or pshufb
1653
1654 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1655 "Unexpected vector size");
1656
1657 return ExtractCost + 2; // worst case pshufhw + pshufd
1658 }
1659 }
1660 // If the extract subvector is not optimal, treat it as single op shuffle.
1662 }
1663
1664 // Subvector insertions are cheap if the subvectors are aligned.
1665 // Note that in general, the insertion starting at the beginning of a vector
1666 // isn't free, because we need to preserve the rest of the wide vector,
1667 // but if the destination vector legalizes to the same width as the subvector
1668 // then the insertion will simplify to a (free) register copy.
1669 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1670 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1671 int NumElts = DstLT.second.getVectorNumElements();
1672 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1673 if (SubLT.second.isVector()) {
1674 int NumSubElts = SubLT.second.getVectorNumElements();
1675 bool MatchingTypes =
1676 NumElts == NumSubElts &&
1677 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1678 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1679 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1680 }
1681
1682 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1683 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1684 // v1f32 (legalised to f32) into a v4f32.
1685 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1686 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1687 return 1;
1688
1689 // If the insertion is the lowest subvector then it will be blended
1690 // otherwise treat it like a 2-op shuffle.
1691 Kind =
1692 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1693 }
1694
1695 // Handle some common (illegal) sub-vector types as they are often very cheap
1696 // to shuffle even on targets without PSHUFB.
1697 EVT VT = TLI->getValueType(DL, SrcTy);
1698 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1699 !ST->hasSSSE3()) {
1700 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1701 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1702 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1703 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1704 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1705 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1706
1707 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1708 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1709 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1710 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1711
1712 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1713 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1714 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1715 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1716
1717 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1719 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1720 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1721 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1722
1723 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1725 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1726 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1727 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1728 };
1729
1730 if (ST->hasSSE2())
1731 if (const auto *Entry =
1732 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1733 if (auto KindCost = Entry->Cost[CostKind])
1734 return LT.first * *KindCost;
1735 }
1736
1737 // We are going to permute multiple sources and the result will be in multiple
1738 // destinations. Providing an accurate cost only for splits where the element
1739 // type remains the same.
1740 if (LT.first != 1) {
1741 MVT LegalVT = LT.second;
1742 if (LegalVT.isVector() &&
1743 LegalVT.getVectorElementType().getSizeInBits() ==
1744 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1745 LegalVT.getVectorNumElements() <
1746 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1747 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1748 unsigned LegalVTSize = LegalVT.getStoreSize();
1749 // Number of source vectors after legalization:
1750 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1751 // Number of destination vectors after legalization:
1752 InstructionCost NumOfDests = LT.first;
1753
1754 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1755 LegalVT.getVectorNumElements());
1756
1757 if (!Mask.empty() && NumOfDests.isValid()) {
1758 // Try to perform better estimation of the permutation.
1759 // 1. Split the source/destination vectors into real registers.
1760 // 2. Do the mask analysis to identify which real registers are
1761 // permuted. If more than 1 source registers are used for the
1762 // destination register building, the cost for this destination register
1763 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1764 // source register is used, build mask and calculate the cost as a cost
1765 // of PermuteSingleSrc.
1766 // Also, for the single register permute we try to identify if the
1767 // destination register is just a copy of the source register or the
1768 // copy of the previous destination register (the cost is
1769 // TTI::TCC_Basic). If the source register is just reused, the cost for
1770 // this operation is TTI::TCC_Free.
1771 NumOfDests =
1773 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1774 .first;
1775 unsigned E = NumOfDests.getValue();
1776 unsigned NormalizedVF =
1777 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1778 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1779 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1780 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1781 copy(Mask, NormalizedMask.begin());
1782 unsigned PrevSrcReg = 0;
1783 ArrayRef<int> PrevRegMask;
1786 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1787 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1788 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1789 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1790 // Check if the previous register can be just copied to the next
1791 // one.
1792 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1793 PrevRegMask != RegMask)
1794 Cost +=
1796 SingleOpTy, RegMask, CostKind, 0, nullptr);
1797 else
1798 // Just a copy of previous destination register.
1800 return;
1801 }
1802 if (SrcReg != DestReg &&
1803 any_of(RegMask, not_equal_to(PoisonMaskElem))) {
1804 // Just a copy of the source register.
1806 }
1807 PrevSrcReg = SrcReg;
1808 PrevRegMask = RegMask;
1809 },
1810 [this, SingleOpTy, CostKind,
1811 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1812 unsigned /*Unused*/, bool /*Unused*/) {
1814 SingleOpTy, RegMask, CostKind, 0, nullptr);
1815 });
1816 return Cost;
1817 }
1818
1819 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1820 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1821 SingleOpTy, {}, CostKind, 0,
1822 nullptr);
1823 }
1824
1825 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1826 SubTp);
1827 }
1828
1829 // If we're just moving a single element around (probably as an alternative to
1830 // extracting it), we can assume this is cheap.
1831 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1832 return TTI::TCC_Basic;
1833
1834 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1835 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1837 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1838 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1839 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1840 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1841 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1842 };
1843
1844 if (ST->hasVBMI())
1845 if (const auto *Entry =
1846 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1847 if (auto KindCost = Entry->Cost[CostKind])
1848 return LT.first * *KindCost;
1849
1850 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1851 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1852 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1853 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1854
1855 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1857 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1858 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1859 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1860
1861 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1863 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1864 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1865 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1866
1867 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1869 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1870 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1871 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1872
1873 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1874 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1875
1876 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1877 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1878 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1879 };
1880
1881 if (ST->hasBWI())
1882 if (const auto *Entry =
1883 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1884 if (auto KindCost = Entry->Cost[CostKind])
1885 return LT.first * *KindCost;
1886
1887 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1888 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1889 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1890 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1891 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1892 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1893 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1894 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1895 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1896 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1898 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1899 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1900 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1901 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
1902
1903 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1904 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1905 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1906 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1907 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1908 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1909 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1910
1911 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1917 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1918 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1919 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1920 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1921 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1922
1923 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1924 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1925 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1926 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1927 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1929 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1930 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1932 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1933 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1934 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1935 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1936
1937 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1938 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1939 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1940 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1941 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1942 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1943 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1944 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1945 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1946 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1947 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1948 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1949
1950 // FIXME: This just applies the type legalization cost rules above
1951 // assuming these completely split.
1952 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1953 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1954 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1955 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1956 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1957 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1958
1959 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1960 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1961 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1962 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1963 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1964 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1965 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1966 };
1967
1968 if (ST->hasAVX512())
1969 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1970 if (auto KindCost = Entry->Cost[CostKind])
1971 return LT.first * *KindCost;
1972
1973 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1974 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1975 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1976 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1977
1978 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
1979 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
1980
1981 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1982 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1983 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1984 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1985 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1986 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1987 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1988 };
1989
1990 if (IsInLaneShuffle && ST->hasAVX2())
1991 if (const auto *Entry =
1992 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1993 if (auto KindCost = Entry->Cost[CostKind])
1994 return LT.first * *KindCost;
1995
1996 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1997 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1998 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1999 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
2000 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
2001 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
2002 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
2003 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2004 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2005 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2006 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2007
2008 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2009 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2010 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2011 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2012 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2013 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2014 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2015
2016 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2017 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2018 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2019
2020 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2021 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2022 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2023 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2024 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2025
2026 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2027 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2028 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2029 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2030 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2031 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2032 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2033
2034 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2035 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2036 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2037 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2038 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2039 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2040 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2041 };
2042
2043 if (ST->hasAVX2())
2044 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2045 if (auto KindCost = Entry->Cost[CostKind])
2046 return LT.first * *KindCost;
2047
2048 static const CostKindTblEntry XOPShuffleTbl[] = {
2049 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2050 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2051 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2052 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2053 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2054 // + vinsertf128
2055 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2056 // + vinsertf128
2057
2058 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2059 // + vinsertf128
2060
2061 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2062 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2063 // + vinsertf128
2064 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2065 };
2066
2067 if (ST->hasXOP())
2068 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2069 if (auto KindCost = Entry->Cost[CostKind])
2070 return LT.first * *KindCost;
2071
2072 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2073 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2074 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2075 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2076 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2077
2078 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2079 // + vpor + vinsertf128
2080 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2081 // + vpor + vinsertf128
2082 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2083 // + vpor + vinsertf128
2084
2085 { TTI::SK_Transpose, MVT::v4f64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2086 { TTI::SK_Transpose, MVT::v4i64, { 1, 1, 1, 1 } }, // vshufpd/vunpck
2087
2088 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2089 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2090 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2091 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2092 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2093 // + 2*vpor + vinsertf128
2094 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2095 // + 2*vpor + vinsertf128
2096 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2097 // + 2*vpor + vinsertf128
2098 };
2099
2100 if (IsInLaneShuffle && ST->hasAVX())
2101 if (const auto *Entry =
2102 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2103 if (auto KindCost = Entry->Cost[CostKind])
2104 return LT.first * *KindCost;
2105
2106 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2107 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2108 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2109 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2110 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2111 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2112 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2113 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2114
2115 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2116 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2117 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2118 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2119 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2120 // + vinsertf128
2121 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2122 // + vinsertf128
2123 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2124 // + vinsertf128
2125
2126 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2127 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2128 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2129 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2130 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2131 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2132 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2133
2134 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2135 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2136 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2137 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2138 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2139 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2140 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2141
2142 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2143 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2144 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2145 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2146 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2147 // + 2*por + vinsertf128
2148 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2149 // + 2*por + vinsertf128
2150 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2151 // + 2*por + vinsertf128
2152
2153 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2154 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2155 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2156 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2157 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2158 // + 4*por + vinsertf128
2159 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2160 // + 4*por + vinsertf128
2161 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2162 // + 4*por + vinsertf128
2163 };
2164
2165 if (ST->hasAVX())
2166 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2167 if (auto KindCost = Entry->Cost[CostKind])
2168 return LT.first * *KindCost;
2169
2170 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2171 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2172 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2173 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2174 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2175 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2176 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2177 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2178 };
2179
2180 if (ST->hasSSE41())
2181 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2182 if (auto KindCost = Entry->Cost[CostKind])
2183 return LT.first * *KindCost;
2184
2185 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2186 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2187 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2188 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2189
2190 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2191 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2192 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2193
2194 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2195 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2196 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2197
2198 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2199 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2200 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2201 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2202 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2203
2204 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2205 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2206 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2207
2208 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2209 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2210 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2211 };
2212
2213 if (ST->hasSSSE3())
2214 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2215 if (auto KindCost = Entry->Cost[CostKind])
2216 return LT.first * *KindCost;
2217
2218 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2219 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2220 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2221 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2222 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2223 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2224 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2225
2226 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2227 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2228 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2229 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2230 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2231 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2232 // + 2*pshufd + 2*unpck + packus
2233
2234 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2235 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2236 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2237 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2238 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2239 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2240
2241 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2242 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2243 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2244 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2245 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2246 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2247
2248 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2249 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2250 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2251 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2252 // + pshufd/unpck
2253 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2254 // + pshufd/unpck
2255 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2256 // + 2*pshufd + 2*unpck + 2*packus
2257
2258 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2259 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2260 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2261 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2262 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2263 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2264 };
2265
2266 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2267 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2268 };
2269
2270 if (ST->hasSSE2()) {
2271 bool IsLoad =
2272 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2273 if (ST->hasSSE3() && IsLoad)
2274 if (const auto *Entry =
2275 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2276 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2277 LT.second.getVectorElementCount()) &&
2278 "Table entry missing from isLegalBroadcastLoad()");
2279 return LT.first * Entry->Cost;
2280 }
2281
2282 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2283 if (auto KindCost = Entry->Cost[CostKind])
2284 return LT.first * *KindCost;
2285 }
2286
2287 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2288 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2289 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2290 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2291 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2292 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2293 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2294 };
2295
2296 if (ST->hasSSE1()) {
2297 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2298 // SHUFPS: both pairs must come from the same source register.
2299 auto MatchSHUFPS = [](int X, int Y) {
2300 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2301 };
2302 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2303 return 1;
2304 }
2305 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2306 if (auto KindCost = Entry->Cost[CostKind])
2307 return LT.first * *KindCost;
2308 }
2309
2310 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2311 SubTp);
2312}
2313
2315 Type *Src,
2318 const Instruction *I) const {
2319 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2320 assert(ISD && "Invalid opcode");
2321
2322 // The cost tables include both specific, custom (non-legal) src/dst type
2323 // conversions and generic, legalized types. We test for customs first, before
2324 // falling back to legalization.
2325 // FIXME: Need a better design of the cost table to handle non-simple types of
2326 // potential massive combinations (elem_num x src_type x dst_type).
2327 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2328 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2329 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2330
2331 // Mask sign extend has an instruction.
2332 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2342 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2343 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2344 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2346 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2348 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2349
2350 // Mask zero extend is a sext + shift.
2351 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2360 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2361 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2362 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2363 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2364 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2365 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2366 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2367 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2368
2369 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2378 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2379 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2380 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2381 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2382 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2383 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2384 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2385 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2386
2387 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2388 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2389 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2390 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2391 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2392 };
2393
2394 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2395 // Mask sign extend has an instruction.
2396 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2397 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2398 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2399 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2400 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2401 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2402 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2403 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2404
2405 // Mask zero extend is a sext + shift.
2406 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2407 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2408 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2409 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2410 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2411 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2412 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2413 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2414
2415 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2416 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2417 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2418 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2419 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2420 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2421 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2422 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2423
2424 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2425 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2426
2427 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2428 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2429
2430 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2431 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2432
2433 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2434 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2435 };
2436
2437 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2438 // 256-bit wide vectors.
2439
2440 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2441 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2442 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2443 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2444 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2445 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2446 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2447 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2448
2449 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2453 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2454 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2456 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2457 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2458 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2459 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2460 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2461 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2462 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2463 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2464 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2465 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2466 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2467 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2468 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2469 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2470 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2471 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2472 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2473 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2474 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2475 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2476 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2477 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2478 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2479 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2480 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2481 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2482 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2483
2484 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2485 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2486 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2487
2488 // Sign extend is zmm vpternlogd+vptruncdb.
2489 // Zero extend is zmm broadcast load+vptruncdw.
2490 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2491 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2492 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2493 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2494 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2498
2499 // Sign extend is zmm vpternlogd+vptruncdw.
2500 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2501 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2502 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2503 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2504 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2505 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2506 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2507 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2508 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2509
2510 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2511 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2512 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2513 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2514 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2517 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2518 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2519 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2520
2521 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2522 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2525
2526 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2527 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2528 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2529 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2530 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2531 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2532 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2533 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2534 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2535 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2536
2537 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2538 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2539
2540 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2541 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2542 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2543 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2544 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2545 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2546 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2547 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2548
2549 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2550 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2551 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2552 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2553 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2554 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2555 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2556 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2557 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2558 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2559
2560 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2562 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2563 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2564 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2565 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2566 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2567 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2568 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2569 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2570 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2571
2572 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2573 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2574 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2575 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2576 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2577 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2578 };
2579
2580 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2581 // Mask sign extend has an instruction.
2582 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2590 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2591 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2592 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2593 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2594 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2595 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2596 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2597 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2598 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2599
2600 // Mask zero extend is a sext + shift.
2601 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2612 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2613 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2614 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2615 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2616 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2617 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2618
2619 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2627 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2628 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2629 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2630 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2631 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2632 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2633 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2634 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2635 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2636
2637 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2638 };
2639
  // Conversion costs consulted when AVX512DQ is available (see the
  // ST->hasDQI() lookup below); per the table name these cover the 128/256-bit
  // (VL) variants. Each entry's cost array holds one value per
  // TargetCostKind: { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   {  1, 1, 1, 1 } },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   {  2, 1, 1, 1 } },

    // Truncation to an i1 mask.
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  {  2, 1, 1, 1 } },

    // i64 <-> fp conversions are single instructions here.
    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {  1, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {  1, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  {  1, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  {  1, 1, 1, 1 } },
  };
2690
  // Conversion costs consulted when AVX512F is available (see the
  // ST->hasAVX512() lookup below); per the table name these are the 128/256-bit
  // (VL) variants. Each entry's cost array holds one value per
  // TargetCostKind: { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
    // Truncation to an i1 mask (instruction sequences noted per entry).
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   {  3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   {  3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   {  3, 1, 1, 1 } }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  {  8, 1, 1, 1 } }, // split+2*v8i8
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  {  3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  {  3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  {  3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, {  8, 1, 1, 1 } }, // split+2*v8i16
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  {  2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  {  2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  {  2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  {  2, 1, 1, 1 } }, // vpslld+vptestmd
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  {  2, 1, 1, 1 } }, // vpsllq+vptestmq
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  {  2, 1, 1, 1 } }, // vpsllq+vptestmq
    // Narrowing element truncates via the VPMOV* family.
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  {  1, 1, 1, 1 } }, // vpmovqd
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  {  2, 1, 1, 1 } }, // vpmovqb
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  {  2, 1, 1, 1 } }, // vpmovqw
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  {  2, 1, 1, 1 } }, // vpmovwb

    // Mask extension to i8 elements:
    // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   {  5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   {  6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   {  5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   {  6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   {  5, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   {  6, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 10, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 12, 1, 1, 1 } },

    // Mask extension to i16 elements:
    // sign extend is vpcmpeq+maskedmove+vpmovdw
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   {  5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   {  5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   {  5, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 10, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 12, 1, 1, 1 } },

    // Mask extension to i32 elements.
    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   {  1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   {  2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   {  1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   {  2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   {  1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   {  2, 1, 1, 1 } }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  {  1, 1, 1, 1 } }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  {  2, 1, 1, 1 } }, // vpternlogd+psrld

    // Mask extension to i64 elements.
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   {  1, 1, 1, 1 } }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   {  2, 1, 1, 1 } }, // vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   {  1, 1, 1, 1 } }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   {  2, 1, 1, 1 } }, // vpternlogq+psrlq

    // Element-widening integer extends.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  {  1, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  1, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {  5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {  5, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, {  5, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  {  1, 1, 1, 1 } },
  };
2791
  // Conversion costs consulted when AVX2 is available (see the
  // ST->hasAVX2() lookup below) and no matching AVX-512 entry applied.
  // Each entry's cost array holds one value per TargetCostKind:
  // { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
    // Extension from an i1 vector (no mask registers pre-AVX-512).
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  {  1, 1, 1, 1 } },

    // Element-widening integer extends.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  {  2, 1, 1, 1 } },

    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  {  2, 1, 1, 1 } },

    // Element-narrowing truncates.
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  {  2, 1, 1, 1 } },

    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  {  3, 1, 1, 1 } },
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  {  3, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  {  3, 1, 1, 1 } },

    // fp -> unsigned int needs extra fixup code without AVX-512.
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    {  3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    {  3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  {  3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  {  3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  {  4, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {  3, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {  4, 1, 1, 1 } },
  };
2866
  // Conversion costs consulted when AVX1 is available (see the ST->hasAVX()
  // lookup below) and no AVX2/AVX-512 entry matched. 256-bit integer ops are
  // generally split into 128-bit halves on AVX1, hence the higher costs than
  // the AVX2 table. Each entry's cost array holds one value per
  // TargetCostKind: { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
    // Extension from an i1 vector.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   {  4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   {  4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  {  4, 1, 1, 1 } },

    // Element-widening integer extends.
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  {  3, 1, 1, 1 } },

    // Truncation to an i1 vector.
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, {  4, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  {  9, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, { 11, 1, 1, 1 } },

    // Element-narrowing truncates.
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, {  6, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, {  6, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, {  2, 1, 1, 1 } }, // and+extract+packuswb
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  {  3, 1, 1, 1 } }, // and+extract+2*packusdw
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  {  2, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   {  8, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  {  2, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {  5, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {  8, 1, 1, 1 } },

    // unsigned int -> fp requires fixup sequences without AVX-512.
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   {  7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   {  7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   {  6, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  6, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  {  8, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 10, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 10, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 18, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 10, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  {  5, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  {  3, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  {  6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  {  7, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  {  7, 1, 1, 1 } },

    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  {  1, 1, 1, 1 } },
  };
2963
  // Conversion costs consulted when SSE4.1 is available (see the
  // ST->hasSSE41() lookup below) and no AVX entry matched. Each entry's
  // cost array holds one value per TargetCostKind:
  // { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
    // Single-instruction element extends (PMOVSX*/PMOVZX*).
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  {  1, 1, 1, 1 } },

    // These truncates end up widening elements.
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   {  1, 1, 1, 1 } }, // PMOVXZBQ
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  {  1, 1, 1, 1 } }, // PMOVXZWQ
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   {  1, 1, 1, 1 } }, // PMOVXZBD

    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  {  2, 1, 1, 1 } },

    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  {  2, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  {  3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  {  2, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 12, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 22, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  4, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  {  1, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  {  2, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  {  1, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  {  4, 1, 1, 1 } },
  };
3036
  // Baseline conversion costs, consulted when SSE2 is available (see the
  // ST->hasSSE2() lookup below) and no higher feature-level entry matched.
  // Each entry's cost array holds one value per TargetCostKind:
  // { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
    // These are somewhat magic numbers justified by comparing the
    // output of llvm-mca for our various supported scheduler models
    // and basing it off the worst case scenario.
    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  3, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  {  4, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {  8, 1, 1, 1 } },
    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {  8, 1, 1, 1 } },

    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    {  3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    {  3, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    {  8, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    {  9, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  {  4, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  {  7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  {  7, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  {  5, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 15, 1, 1, 1 } },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 18, 1, 1, 1 } },

    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  {  6, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  {  6, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  {  5, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  {  5, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  {  4, 1, 1, 1 } },
    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  {  4, 1, 1, 1 } },

    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    {  4, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 15, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  {  6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  {  6, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  {  5, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  {  5, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  {  8, 1, 1, 1 } },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  {  8, 1, 1, 1 } },

    // Element extends; no PMOVSX/PMOVZX before SSE4.1, so these are
    // unpack/shift sequences.
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  {  4, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  {  3, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  {  2, 1, 1, 1 } },
    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  {  1, 1, 1, 1 } },
    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  {  2, 1, 1, 1 } },

    // These truncates are really widening elements.
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  {  1, 1, 1, 1 } }, // PSHUFD
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  {  2, 1, 1, 1 } }, // PUNPCKLWD+DQ
    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   {  3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  {  1, 1, 1, 1 } }, // PUNPCKLWD
    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   {  2, 1, 1, 1 } }, // PUNPCKLBW+WD
    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   {  1, 1, 1, 1 } }, // PUNPCKLBW

    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  {  2, 1, 1, 1 } }, // PAND+PACKUSWB
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, {  3, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  {  3, 1, 1, 1 } }, // PAND+2*PACKUSWB
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, {  7, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  {  1, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  {  3, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  {  5, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 10, 1, 1, 1 } },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  {  4, 1, 1, 1 } }, // PAND+3*PACKUSWB
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  {  2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  {  1, 1, 1, 1 } }, // PSHUFD
  };
3123
  // Half-precision (f16) <-> f32/f64 conversion costs, consulted when F16C
  // is available (see the ST->hasF16C() lookup below). f64 cases go via f32,
  // as noted per entry. Each entry's cost array holds one value per
  // TargetCostKind: { recip-throughput, latency, code-size, size+latency }.
  static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
    { ISD::FP_ROUND,  MVT::f16,   MVT::f32,   { 1, 1, 1, 1 } },
    { ISD::FP_ROUND,  MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
    { ISD::FP_ROUND,  MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::f32,   MVT::f16,   { 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::f64,   MVT::f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
    { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
    { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
  };
3134
3135 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3136 EVT SrcTy = TLI->getValueType(DL, Src);
3137 EVT DstTy = TLI->getValueType(DL, Dst);
3138
3139 // If we're sign-extending a vector comparison result back to the comparison
3140 // width, this will be free without AVX512 (or for 8/16-bit types without
3141 // BWI).
3142 if (!ST->hasAVX512() || (!ST->hasBWI() && DstTy.getScalarSizeInBits() < 32)) {
3143 if (I && Opcode == Instruction::CastOps::SExt &&
3144 SrcTy.isFixedLengthVector() && SrcTy.getScalarType() == MVT::i1) {
3145 if (auto *CmpI = dyn_cast<CmpInst>(I->getOperand(0))) {
3146 Type *CmpTy = CmpI->getOperand(0)->getType();
3147 if (CmpTy->getScalarSizeInBits() == DstTy.getScalarSizeInBits())
3148 return TTI::TCC_Free;
3149 }
3150 }
3151 }
3152
3153 // The function getSimpleVT only handles simple value types.
3154 if (SrcTy.isSimple() && DstTy.isSimple()) {
3155 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3156 MVT SimpleDstTy = DstTy.getSimpleVT();
3157
3158 if (ST->useAVX512Regs()) {
3159 if (ST->hasBWI())
3160 if (const auto *Entry = ConvertCostTableLookup(
3161 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3162 if (auto KindCost = Entry->Cost[CostKind])
3163 return *KindCost;
3164
3165 if (ST->hasDQI())
3166 if (const auto *Entry = ConvertCostTableLookup(
3167 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3168 if (auto KindCost = Entry->Cost[CostKind])
3169 return *KindCost;
3170
3171 if (ST->hasAVX512())
3172 if (const auto *Entry = ConvertCostTableLookup(
3173 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3174 if (auto KindCost = Entry->Cost[CostKind])
3175 return *KindCost;
3176 }
3177
3178 if (ST->hasBWI())
3179 if (const auto *Entry = ConvertCostTableLookup(
3180 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3181 if (auto KindCost = Entry->Cost[CostKind])
3182 return *KindCost;
3183
3184 if (ST->hasDQI())
3185 if (const auto *Entry = ConvertCostTableLookup(
3186 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3187 if (auto KindCost = Entry->Cost[CostKind])
3188 return *KindCost;
3189
3190 if (ST->hasAVX512())
3191 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3192 SimpleDstTy, SimpleSrcTy))
3193 if (auto KindCost = Entry->Cost[CostKind])
3194 return *KindCost;
3195
3196 if (ST->hasAVX2()) {
3197 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3198 SimpleDstTy, SimpleSrcTy))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return *KindCost;
3201 }
3202
3203 if (ST->hasAVX()) {
3204 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3205 SimpleDstTy, SimpleSrcTy))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return *KindCost;
3208 }
3209
3210 if (ST->hasF16C()) {
3211 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3212 SimpleDstTy, SimpleSrcTy))
3213 if (auto KindCost = Entry->Cost[CostKind])
3214 return *KindCost;
3215 }
3216
3217 if (ST->hasSSE41()) {
3218 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3219 SimpleDstTy, SimpleSrcTy))
3220 if (auto KindCost = Entry->Cost[CostKind])
3221 return *KindCost;
3222 }
3223
3224 if (ST->hasSSE2()) {
3225 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3226 SimpleDstTy, SimpleSrcTy))
3227 if (auto KindCost = Entry->Cost[CostKind])
3228 return *KindCost;
3229 }
3230
3231 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3232 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3233 // fp16 conversions not covered by any table entries require a libcall.
3234 // Return a large (arbitrary) number to model this.
3235 return InstructionCost(64);
3236 }
3237 }
3238
3239 // Fall back to legalized types.
3240 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3241 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3242
3243 // If we're truncating to the same legalized type - just assume its free.
3244 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3245 return TTI::TCC_Free;
3246
3247 if (ST->useAVX512Regs()) {
3248 if (ST->hasBWI())
3249 if (const auto *Entry = ConvertCostTableLookup(
3250 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3251 if (auto KindCost = Entry->Cost[CostKind])
3252 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3253
3254 if (ST->hasDQI())
3255 if (const auto *Entry = ConvertCostTableLookup(
3256 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3257 if (auto KindCost = Entry->Cost[CostKind])
3258 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3259
3260 if (ST->hasAVX512())
3261 if (const auto *Entry = ConvertCostTableLookup(
3262 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3263 if (auto KindCost = Entry->Cost[CostKind])
3264 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3265 }
3266
3267 if (ST->hasBWI())
3268 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3269 LTDest.second, LTSrc.second))
3270 if (auto KindCost = Entry->Cost[CostKind])
3271 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3272
3273 if (ST->hasDQI())
3274 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3275 LTDest.second, LTSrc.second))
3276 if (auto KindCost = Entry->Cost[CostKind])
3277 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3278
3279 if (ST->hasAVX512())
3280 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3281 LTDest.second, LTSrc.second))
3282 if (auto KindCost = Entry->Cost[CostKind])
3283 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3284
3285 if (ST->hasAVX2())
3286 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3287 LTDest.second, LTSrc.second))
3288 if (auto KindCost = Entry->Cost[CostKind])
3289 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3290
3291 if (ST->hasAVX())
3292 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3293 LTDest.second, LTSrc.second))
3294 if (auto KindCost = Entry->Cost[CostKind])
3295 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3296
3297 if (ST->hasF16C()) {
3298 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3299 LTDest.second, LTSrc.second))
3300 if (auto KindCost = Entry->Cost[CostKind])
3301 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3302 }
3303
3304 if (ST->hasSSE41())
3305 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3306 LTDest.second, LTSrc.second))
3307 if (auto KindCost = Entry->Cost[CostKind])
3308 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3309
3310 if (ST->hasSSE2())
3311 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3312 LTDest.second, LTSrc.second))
3313 if (auto KindCost = Entry->Cost[CostKind])
3314 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3315
3316 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
3317 // sitofp.
3318 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3319 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3320 Type *ExtSrc = Src->getWithNewBitWidth(32);
3321 unsigned ExtOpc =
3322 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3323
3324 // For scalar loads the extend would be free.
3325 InstructionCost ExtCost = 0;
3326 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3327 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3328
3329 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3331 }
3332
3333 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi
3334 // i32.
3335 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3336 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3337 Type *TruncDst = Dst->getWithNewBitWidth(32);
3338 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3339 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3341 }
3342
3343 // TODO: Allow non-throughput costs that aren't binary.
3344 auto AdjustCost = [&CostKind](InstructionCost Cost,
3347 return Cost == 0 ? 0 : N;
3348 return Cost * N;
3349 };
3350 return AdjustCost(
3351 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3352}
3353
// X86TTIImpl::getCmpSelInstrCost (body): returns the cost of a compare
// (icmp/fcmp) or select instruction for ValTy/CondTy under the given
// predicate and cost kind, using per-feature-level cost tables.
// NOTE(review): this extraction dropped some original source lines (the
// function's declaration line and several switch `case` labels); the code
// below is preserved byte-for-byte as found.
3355     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3357     TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3358   // Early out if this type isn't scalar/vector integer/float.
3359   if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3360     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3361                                      Op1Info, Op2Info, I);
3362 
3363   // Legalize the type.
3364   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3365 
3366   MVT MTy = LT.second;
3367 
3368   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3369   assert(ISD && "Invalid opcode");
3370 
  // ExtraCost models the additional instructions needed to synthesize
  // predicates that have no direct vector compare at this feature level
  // (XOP/AVX512/BWI targets below are excluded because they do).
3371   InstructionCost ExtraCost = 0;
3372   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3373     // Some vector comparison predicates cost extra instructions.
3374     // TODO: Adjust ExtraCost based on CostKind?
3375     // TODO: Should we invert this and assume worst case cmp costs
3376     // and reduce for particular predicates?
3377     if (MTy.isVector() &&
3378         !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3379           (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3380           ST->hasBWI())) {
3381       // Fallback to I if a specific predicate wasn't specified.
3382       CmpInst::Predicate Pred = VecPred;
3383       if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3385         Pred = cast<CmpInst>(I)->getPredicate();
3386 
  // Comparing against a constant RHS lets some of the expansions below fold
  // an instruction away, hence the CmpWithConstant discounts.
3387       bool CmpWithConstant = false;
3388       if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3389         CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3390 
  // NOTE(review): the `case` labels of this switch are missing from this
  // extraction; the expansion comments on each arm indicate which predicates
  // each arm was written for -- verify against the original source.
3391       switch (Pred) {
3393         // xor(cmpeq(x,y),-1)
3394         ExtraCost = CmpWithConstant ? 0 : 1;
3395         break;
3398         // xor(cmpgt(x,y),-1)
3399         ExtraCost = CmpWithConstant ? 0 : 1;
3400         break;
3403         // cmpgt(xor(x,signbit),xor(y,signbit))
3404         // xor(cmpeq(pmaxu(x,y),x),-1)
3405         ExtraCost = CmpWithConstant ? 1 : 2;
3406         break;
3409         if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3410             (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3411           // cmpeq(psubus(x,y),0)
3412           // cmpeq(pminu(x,y),x)
3413           ExtraCost = 1;
3414         } else {
3415           // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3416           ExtraCost = CmpWithConstant ? 2 : 3;
3417         }
3418         break;
3421         // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3422         // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3423         if (CondTy && !ST->hasAVX())
3424           return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3426                                     Op1Info, Op2Info) +
3427                  getCmpSelInstrCost(Opcode, ValTy, CondTy,
3429                                     Op1Info, Op2Info) +
3430                  getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3431 
3432         break;
3435         // Assume worst case scenario and add the maximum extra cost.
3436         ExtraCost = 3;
3437         break;
3438       default:
3439         break;
3440       }
3441     }
3442   }
3443 
  // Per-feature-level (SETCC/SELECT, MVT) cost tables. Each entry's four
  // costs are presumably indexed by TTI::TargetCostKind (throughput,
  // latency, code-size, size+latency -- see the file header); TODO confirm.
3444   static const CostKindTblEntry SLMCostTbl[] = {
3445     // slm pcmpeq/pcmpgt throughput is 2
3446     { ISD::SETCC,   MVT::v2i64,   {   2,   5,   1,   2 } },
3447     // slm pblendvb/blendvpd/blendvps throughput is 4
3448     { ISD::SELECT,  MVT::v2f64,   {   4,   4,   1,   3 } }, // vblendvpd
3449     { ISD::SELECT,  MVT::v4f32,   {   4,   4,   1,   3 } }, // vblendvps
3450     { ISD::SELECT,  MVT::v2i64,   {   4,   4,   1,   3 } }, // pblendvb
3451     { ISD::SELECT,  MVT::v8i32,   {   4,   4,   1,   3 } }, // pblendvb
3452     { ISD::SELECT,  MVT::v8i16,   {   4,   4,   1,   3 } }, // pblendvb
3453     { ISD::SELECT,  MVT::v16i8,   {   4,   4,   1,   3 } }, // pblendvb
3454   };
3455 
3456   static const CostKindTblEntry AVX512BWCostTbl[] = {
3457     { ISD::SETCC,   MVT::v32i16,  {   1,   1,   1,   1 } },
3458     { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   1 } },
3459     { ISD::SETCC,   MVT::v64i8,   {   1,   1,   1,   1 } },
3460     { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   1 } },
3461 
3462     { ISD::SELECT,  MVT::v32i16,  {   1,   1,   1,   1 } },
3463     { ISD::SELECT,  MVT::v64i8,   {   1,   1,   1,   1 } },
3464   };
3465 
3466   static const CostKindTblEntry AVX512CostTbl[] = {
3467     { ISD::SETCC,   MVT::v8f64,   {   1,   4,   1,   1 } },
3468     { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   1 } },
3469     { ISD::SETCC,   MVT::v16f32,  {   1,   4,   1,   1 } },
3470     { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   1 } },
3471 
3472     { ISD::SETCC,   MVT::v8i64,   {   1,   1,   1,   1 } },
3473     { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   1 } },
3474     { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
3475     { ISD::SETCC,   MVT::v16i32,  {   1,   1,   1,   1 } },
3476     { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   1 } },
3477     { ISD::SETCC,   MVT::v32i16,  {   3,   7,   5,   5 } },
3478     { ISD::SETCC,   MVT::v64i8,   {   3,   7,   5,   5 } },
3479 
3480     { ISD::SELECT,  MVT::v8i64,   {   1,   1,   1,   1 } },
3481     { ISD::SELECT,  MVT::v4i64,   {   1,   1,   1,   1 } },
3482     { ISD::SELECT,  MVT::v2i64,   {   1,   1,   1,   1 } },
3483     { ISD::SELECT,  MVT::v16i32,  {   1,   1,   1,   1 } },
3484     { ISD::SELECT,  MVT::v8i32,   {   1,   1,   1,   1 } },
3485     { ISD::SELECT,  MVT::v4i32,   {   1,   1,   1,   1 } },
3486     { ISD::SELECT,  MVT::v8f64,   {   1,   1,   1,   1 } },
3487     { ISD::SELECT,  MVT::v4f64,   {   1,   1,   1,   1 } },
3488     { ISD::SELECT,  MVT::v2f64,   {   1,   1,   1,   1 } },
3489     { ISD::SELECT,  MVT::f64,     {   1,   1,   1,   1 } },
3490     { ISD::SELECT,  MVT::v16f32,  {   1,   1,   1,   1 } },
3491     { ISD::SELECT,  MVT::v8f32 ,  {   1,   1,   1,   1 } },
3492     { ISD::SELECT,  MVT::v4f32,   {   1,   1,   1,   1 } },
3493     { ISD::SELECT,  MVT::f32  ,   {   1,   1,   1,   1 } },
3494 
3495     { ISD::SELECT,  MVT::v32i16,  {   2,   2,   4,   4 } },
3496     { ISD::SELECT,  MVT::v16i16,  {   1,   1,   1,   1 } },
3497     { ISD::SELECT,  MVT::v8i16,   {   1,   1,   1,   1 } },
3498     { ISD::SELECT,  MVT::v64i8,   {   2,   2,   4,   4 } },
3499     { ISD::SELECT,  MVT::v32i8,   {   1,   1,   1,   1 } },
3500     { ISD::SELECT,  MVT::v16i8,   {   1,   1,   1,   1 } },
3501   };
3502 
3503   static const CostKindTblEntry AVX2CostTbl[] = {
3504     { ISD::SETCC,   MVT::v4f64,   {   1,   4,   1,   2 } },
3505     { ISD::SETCC,   MVT::v2f64,   {   1,   4,   1,   1 } },
3506     { ISD::SETCC,   MVT::f64,     {   1,   4,   1,   1 } },
3507     { ISD::SETCC,   MVT::v8f32,   {   1,   4,   1,   2 } },
3508     { ISD::SETCC,   MVT::v4f32,   {   1,   4,   1,   1 } },
3509     { ISD::SETCC,   MVT::f32,     {   1,   4,   1,   1 } },
3510 
3511     { ISD::SETCC,   MVT::v4i64,   {   1,   1,   1,   2 } },
3512     { ISD::SETCC,   MVT::v8i32,   {   1,   1,   1,   2 } },
3513     { ISD::SETCC,   MVT::v16i16,  {   1,   1,   1,   2 } },
3514     { ISD::SETCC,   MVT::v32i8,   {   1,   1,   1,   2 } },
3515 
3516     { ISD::SELECT,  MVT::v4f64,   {   2,   2,   1,   2 } }, // vblendvpd
3517     { ISD::SELECT,  MVT::v8f32,   {   2,   2,   1,   2 } }, // vblendvps
3518     { ISD::SELECT,  MVT::v4i64,   {   2,   2,   1,   2 } }, // pblendvb
3519     { ISD::SELECT,  MVT::v8i32,   {   2,   2,   1,   2 } }, // pblendvb
3520     { ISD::SELECT,  MVT::v16i16,  {   2,   2,   1,   2 } }, // pblendvb
3521     { ISD::SELECT,  MVT::v32i8,   {   2,   2,   1,   2 } }, // pblendvb
3522   };
3523 
3524   static const CostKindTblEntry XOPCostTbl[] = {
3525     { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
3526     { ISD::SETCC,   MVT::v2i64,   {   1,   1,   1,   1 } },
3527   };
3528 
3529   static const CostKindTblEntry AVX1CostTbl[] = {
3530     { ISD::SETCC,   MVT::v4f64,   {   2,   3,   1,   2 } },
3531     { ISD::SETCC,   MVT::v2f64,   {   1,   3,   1,   1 } },
3532     { ISD::SETCC,   MVT::f64,     {   1,   3,   1,   1 } },
3533     { ISD::SETCC,   MVT::v8f32,   {   2,   3,   1,   2 } },
3534     { ISD::SETCC,   MVT::v4f32,   {   1,   3,   1,   1 } },
3535     { ISD::SETCC,   MVT::f32,     {   1,   3,   1,   1 } },
3536 
3537     // AVX1 does not support 8-wide integer compare.
3538     { ISD::SETCC,   MVT::v4i64,   {   4,   2,   5,   6 } },
3539     { ISD::SETCC,   MVT::v8i32,   {   4,   2,   5,   6 } },
3540     { ISD::SETCC,   MVT::v16i16,  {   4,   2,   5,   6 } },
3541     { ISD::SETCC,   MVT::v32i8,   {   4,   2,   5,   6 } },
3542 
3543     { ISD::SELECT,  MVT::v4f64,   {   3,   3,   1,   2 } }, // vblendvpd
3544     { ISD::SELECT,  MVT::v8f32,   {   3,   3,   1,   2 } }, // vblendvps
3545     { ISD::SELECT,  MVT::v4i64,   {   3,   3,   1,   2 } }, // vblendvpd
3546     { ISD::SELECT,  MVT::v8i32,   {   3,   3,   1,   2 } }, // vblendvps
3547     { ISD::SELECT,  MVT::v16i16,  {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
3548     { ISD::SELECT,  MVT::v32i8,   {   3,   3,   3,   3 } }, // vandps + vandnps + vorps
3549   };
3550 
3551   static const CostKindTblEntry SSE42CostTbl[] = {
3552     { ISD::SETCC,   MVT::v2i64,   {   1,   2,   1,   2 } },
3553   };
3554 
3555   static const CostKindTblEntry SSE41CostTbl[] = {
3556     { ISD::SETCC,   MVT::v2f64,   {   1,   5,   1,   1 } },
3557     { ISD::SETCC,   MVT::v4f32,   {   1,   5,   1,   1 } },
3558 
3559     { ISD::SELECT,  MVT::v2f64,   {   2,   2,   1,   2 } }, // blendvpd
3560     { ISD::SELECT,  MVT::f64,     {   2,   2,   1,   2 } }, // blendvpd
3561     { ISD::SELECT,  MVT::v4f32,   {   2,   2,   1,   2 } }, // blendvps
3562     { ISD::SELECT,  MVT::f32  ,   {   2,   2,   1,   2 } }, // blendvps
3563     { ISD::SELECT,  MVT::v2i64,   {   2,   2,   1,   2 } }, // pblendvb
3564     { ISD::SELECT,  MVT::v4i32,   {   2,   2,   1,   2 } }, // pblendvb
3565     { ISD::SELECT,  MVT::v8i16,   {   2,   2,   1,   2 } }, // pblendvb
3566     { ISD::SELECT,  MVT::v16i8,   {   2,   2,   1,   2 } }, // pblendvb
3567   };
3568 
3569   static const CostKindTblEntry SSE2CostTbl[] = {
3570     { ISD::SETCC,   MVT::v2f64,   {   2,   5,   1,   1 } },
3571     { ISD::SETCC,   MVT::f64,     {   1,   5,   1,   1 } },
3572 
3573     { ISD::SETCC,   MVT::v2i64,   {   5,   4,   5,   5 } }, // pcmpeqd/pcmpgtd expansion
3574     { ISD::SETCC,   MVT::v4i32,   {   1,   1,   1,   1 } },
3575     { ISD::SETCC,   MVT::v8i16,   {   1,   1,   1,   1 } },
3576     { ISD::SETCC,   MVT::v16i8,   {   1,   1,   1,   1 } },
3577 
3578     { ISD::SELECT,  MVT::v2f64,   {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
3579     { ISD::SELECT,  MVT::f64,     {   2,   2,   3,   3 } }, // andpd + andnpd + orpd
3580     { ISD::SELECT,  MVT::v2i64,   {   2,   2,   3,   3 } }, // pand + pandn + por
3581     { ISD::SELECT,  MVT::v4i32,   {   2,   2,   3,   3 } }, // pand + pandn + por
3582     { ISD::SELECT,  MVT::v8i16,   {   2,   2,   3,   3 } }, // pand + pandn + por
3583     { ISD::SELECT,  MVT::v16i8,   {   2,   2,   3,   3 } }, // pand + pandn + por
3584   };
3585 
3586   static const CostKindTblEntry SSE1CostTbl[] = {
3587     { ISD::SETCC,   MVT::v4f32,   {   2,   5,   1,   1 } },
3588     { ISD::SETCC,   MVT::f32,     {   1,   5,   1,   1 } },
3589 
3590     { ISD::SELECT,  MVT::v4f32,   {   2,   2,   3,   3 } }, // andps + andnps + orps
3591     { ISD::SELECT,  MVT::f32,     {   2,   2,   3,   3 } }, // andps + andnps + orps
3592   };
3593 
  // Look up (ISD, MTy) in the most feature-specific applicable table first.
  // A hit is scaled by LT.first (the number of legalized ops) with the
  // predicate ExtraCost added per op.
3594   if (ST->useSLMArithCosts())
3595     if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3596       if (auto KindCost = Entry->Cost[CostKind])
3597         return LT.first * (ExtraCost + *KindCost);
3598 
3599   if (ST->hasBWI())
3600     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3601       if (auto KindCost = Entry->Cost[CostKind])
3602         return LT.first * (ExtraCost + *KindCost);
3603 
3604   if (ST->hasAVX512())
3605     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3606       if (auto KindCost = Entry->Cost[CostKind])
3607         return LT.first * (ExtraCost + *KindCost);
3608 
3609   if (ST->hasAVX2())
3610     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3611       if (auto KindCost = Entry->Cost[CostKind])
3612         return LT.first * (ExtraCost + *KindCost);
3613 
3614   if (ST->hasXOP())
3615     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3616       if (auto KindCost = Entry->Cost[CostKind])
3617         return LT.first * (ExtraCost + *KindCost);
3618 
3619   if (ST->hasAVX())
3620     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3621       if (auto KindCost = Entry->Cost[CostKind])
3622         return LT.first * (ExtraCost + *KindCost);
3623 
3624   if (ST->hasSSE42())
3625     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3626       if (auto KindCost = Entry->Cost[CostKind])
3627         return LT.first * (ExtraCost + *KindCost);
3628 
3629   if (ST->hasSSE41())
3630     if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3631       if (auto KindCost = Entry->Cost[CostKind])
3632         return LT.first * (ExtraCost + *KindCost);
3633 
3634   if (ST->hasSSE2())
3635     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3636       if (auto KindCost = Entry->Cost[CostKind])
3637         return LT.first * (ExtraCost + *KindCost);
3638 
3639   if (ST->hasSSE1())
3640     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3641       if (auto KindCost = Entry->Cost[CostKind])
3642         return LT.first * (ExtraCost + *KindCost);
3643 
3644   // Assume a 3cy latency for fp select ops.
3645   if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3646     if (ValTy->getScalarType()->isFloatingPointTy())
3647       return 3;
3648 
  // No table entry matched: defer to the target-independent implementation.
3649   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3650                                    Op1Info, Op2Info, I);
3651 }
3652
3654
3658 // Costs should match the codegen from:
3659 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3660 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3661 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3662 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3663 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3664
3665 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3666 // specialized in these tables yet.
3667 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3668 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3669 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3670 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3671 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3672 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3673 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3674 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3675 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3676 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3677 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3678 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3679 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3680 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3681 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3682 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3683 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3684 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3685 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3686 };
3687 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3688 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3689 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3690 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3691 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3692 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3693 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3694 };
3695 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3696 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3697 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3698 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3699 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3700 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3701 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3702 };
3703 static const CostKindTblEntry AVX512CDCostTbl[] = {
3704 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3705 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3706 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3707 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3708 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3709 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3710 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3711 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3712 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3713 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3714 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3715 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3716
3717 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3718 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3719 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3720 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3721 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3722 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3723 };
3724 static const CostKindTblEntry AVX512BWCostTbl[] = {
3725 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3726 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3727 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3728 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3729 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3730 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3731 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3732 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3733 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3734 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3735 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3736 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3737 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3738 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3739 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3740 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3741 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3742 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3743 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3744 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3745 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3746 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3747 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3748 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3749 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3750 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3751 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3752 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3753 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3754 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3755 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3756 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3757 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3758 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3759 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3760 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3761 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3762 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3763 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3764 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3765 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3766 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3767 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3768 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3769 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3770 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3771 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3772 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3773 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3774 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3775 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3776 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3777 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3778 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3779 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3780 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3781 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3782 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3783 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3784 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3785 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3786 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3787 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3788 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3789 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3790 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3791 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3792 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3793 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3794 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3795 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3796 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3797 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3798 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3799 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3800 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3801 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3802 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3803 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3804 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3805 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3806 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3807 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3808 };
3809 static const CostKindTblEntry AVX512CostTbl[] = {
3810 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3811 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3812 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3813 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3814 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3815 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3816 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3817 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3818 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3819 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3820 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3821 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3822 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3823 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3824 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3825 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3826 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3827 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3828 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3829 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3830 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3831 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3832 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3833 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3834 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3835 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3836 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3837 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3838 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3839 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3840 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3841 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3842 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3843 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3844 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3845 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3846 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3847 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3848 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3849 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3850 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3851 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3852 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3853 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3854 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3855 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3856 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3857 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3858 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3859 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3860 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3861 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3862 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3863 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3864 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3865 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3866 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3867 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3868 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3869 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3870 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3871 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3872 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3873 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3874 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3875 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3876 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3877 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3878 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3879 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3880 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3881 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3882 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3883 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3884 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3885 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3886 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3887 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3888 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3889 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3890 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3891 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3892 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3893 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3894 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3895 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3896 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3897 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3898 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3899 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3900 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3901 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3902 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3903 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3904 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3905 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3906 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3907 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3908 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3909 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3910 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3911 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3912 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3913 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3914 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3915 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3916 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3917 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3918 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3919 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3920 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3921 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3922 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3923 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3924 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3925 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3926 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3927 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3928 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3929 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3930 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3931 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3932 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3933 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3934 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3935 };
// Costs for targets with AMD XOP. Each entry maps (ISD opcode, MVT) to the
// four TargetCostKind values { RecipThroughput, Latency, CodeSize,
// SizeAndLatency }. 256-bit types are handled as 2 x 128-bit ops plus
// extract/insert, hence the higher costs for the v4i64/v8i32/v16i16/v32i8
// rows.
3936 static const CostKindTblEntry XOPCostTbl[] = {
3937 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3938 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3939 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3940 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3941 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3942 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3943 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3944 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3945 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3946 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3947 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3948 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3949 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
// ROTR needs the extra negate of the amount, which is why its 128-bit
// CodeSize/SizeAndLatency costs exceed ROTL's.
3950 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3951 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3952 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3953 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3954 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3955 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3956 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3957 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3958 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3959 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3960 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3961 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3962 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3963 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3964 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3965 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
// X86ISD::VROTLI is a rotate by uniform constant immediate.
3966 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3967 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3968 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3969 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3970 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3971 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3972 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3973 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3974 };
// Costs for targets with AVX2 (Haswell / Ryzen class per the file header).
// Entry format: (ISD opcode, MVT) -> { RecipThroughput, Latency, CodeSize,
// SizeAndLatency }. Native 256-bit integer ops are available, so ymm rows
// are mostly as cheap as xmm ones (unlike the AVX1 table below).
3975 static const CostKindTblEntry AVX2CostTbl[] = {
3976 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3977 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3978 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3979 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3980 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3981 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3982 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3983 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3984 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3985 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3986 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3987 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3988 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3989 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3990 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3991 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3992 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3993 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3994 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3995 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3996 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3997 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3998 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3999 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
4000 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
4001 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
4002 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
4003 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
4004 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
4005 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
4006 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
4007 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
4008 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
4009 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
4010 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
4011 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
4012 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
4013 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
4014 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
4015 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
4016 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
4017 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
4018 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
4019 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
4020 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4021 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4022 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4023 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4024 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4025 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4026 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4027 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4028 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4029 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4030 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4031 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4032 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4033 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4034 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4035 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4036 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4037 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4038 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4039 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4040 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4041 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4042 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4043 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4044 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4045 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4046 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4047 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4048 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4049 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4050 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4051 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4052 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4053 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4054 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4055 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4056 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4057 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4058 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4059 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4060 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4061 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4062 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4063 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4064 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4065 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4066 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4067 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4068 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4069 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4070 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4071 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4072 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4073 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4074 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4075 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4076 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4077 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4078 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4079 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4080 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4081 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4082 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4083 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4084 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4085 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4086 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4087 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4088 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4089 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4090 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4091 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4092 };
// Costs for targets with AVX1 only (Sandy Bridge / Jaguar / Bulldozer class
// per the file header). 256-bit integer ops are not native, so most ymm
// rows are costed as 2 x 128-bit ops plus extract/insert.
// Entry format: (ISD opcode, MVT) -> { RecipThroughput, Latency, CodeSize,
// SizeAndLatency }. Lookups return the FIRST matching entry, so each
// (opcode, type) pair must appear at most once.
4094 static const CostKindTblEntry AVX1CostTbl[] = {
4095 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4096 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4097 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4098 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4099 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4101 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4103 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4105 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4107 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4108 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4109 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4110 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4111 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4112 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4113 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4115 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4116 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4117 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4119 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4120 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4121 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4122 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4123 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4125 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4127 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4129 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4131 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4132 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4133 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4134 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4135 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4136 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4137 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4138 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4139 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4140 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4141 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4142 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4143 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4144 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4145 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4149 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4150 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4153 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4154 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4155 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4156 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4157 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4158 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4159 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4160 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4161 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4162 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4163 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4164 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4165 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4166 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4167 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4169 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4170 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4172 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4173 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4174 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4175 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4176 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4177 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4178 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4179 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4180 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4181 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4182 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4183 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4184 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4185 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4186 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4187 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4188 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4189 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4190 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
// NOTE: a second, conflicting { ISD::USUBSAT, MVT::v8i32, { 3, 3, 5, 6 } }
// entry used to follow here; it was unreachable (first match wins in the
// cost-table lookup) and has been removed.
4191 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4192 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4193 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4194 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4195 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4196 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4197 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4198 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4199 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4200 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4201 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4202 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4203 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4204 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4205 };
// Costs for targets with GFNI: bit reversal (and byte rotate-by-immediate)
// can be done with a single gf2p8affineqb, as noted per row. Entry format:
// (opcode, MVT) -> { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
// Scalar (i8..i64) rows include the cost of moving to/from vector regs.
4206 static const CostKindTblEntry GFNICostTbl[] = {
4207 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4208 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4209 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4210 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4211 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4212 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4213 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4214 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4215 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4216 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4217 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4218 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4219 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4220 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4221 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4222 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4223 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4224 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4225 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4226 };
// Goldmont-specific overrides: its sqrt units are much slower than the
// generic SSE42 numbers below. { RecipThroughput, Latency, CodeSize,
// SizeAndLatency } per entry.
4227 static const CostKindTblEntry GLMCostTbl[] = {
4228 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4229 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4230 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4231 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4232 };
// Silvermont-specific overrides: slow sqrt and slow vector byte-swap
// relative to the generic SSE tables. { RecipThroughput, Latency,
// CodeSize, SizeAndLatency } per entry.
4233 static const CostKindTblEntry SLMCostTbl[] = {
4234 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4235 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4236 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4237 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4238 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4239 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4240 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4241 };
// Costs for SSE4.2 targets (Nehalem-class per the file header).
// { RecipThroughput, Latency, CodeSize, SizeAndLatency } per entry.
4242 static const CostKindTblEntry SSE42CostTbl[] = {
4243 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4244 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4245 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4246 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4247 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4248 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4249 };
// Costs for SSE4.1 targets (Penryn-class per the file header): native
// 32-bit/8-bit min/max and blends become available here.
// { RecipThroughput, Latency, CodeSize, SizeAndLatency } per entry.
4250 static const CostKindTblEntry SSE41CostTbl[] = {
4251 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4252 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4253 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4254 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4255 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4256 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4257 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4258 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4259 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4260 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4261 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4262 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4263 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4264 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4265 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4266 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4267 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4268 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4269 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4270 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4271 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4272 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4273 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4274 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4275 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4276 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4277 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4278 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4279 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4280 };
// Costs for SSSE3 targets: PSHUFB enables cheaper byte-swap and
// table-lookup-based bit manipulation vs. the plain SSE2 table below.
// { RecipThroughput, Latency, CodeSize, SizeAndLatency } per entry.
4281 static const CostKindTblEntry SSSE3CostTbl[] = {
4282 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4283 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4284 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4285 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4286 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4287 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4288 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4289 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4290 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4291 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4292 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4293 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4294 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4295 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4296 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4297 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4298 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4299 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4300 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4301 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4302 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4303 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4304 };
// Baseline SSE2 vector costs — the fallback when no newer feature-level
// table matched. { RecipThroughput, Latency, CodeSize, SizeAndLatency }
// per (opcode, type) entry.
4305 static const CostKindTblEntry SSE2CostTbl[] = {
4306 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4307 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4308 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4309 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4310 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4311 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4312 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4313 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4314 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4315 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4316 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4317 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4318 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4319 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4320 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4321 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4322 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4323 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4324 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4325 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4326 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4327 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4328 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4329 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4330 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4331 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4332 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4333 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4334 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4335 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4336 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4337 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4338 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4339 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4340 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4341 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4342 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4343 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4344 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4345 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4346 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4347 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4348 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4349 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4350 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4351 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4352 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4353 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4354 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4355 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4356 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4357 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4358 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4359 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4360 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4361 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4362 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4363 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4364 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4365 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4366 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4367 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4368 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4369 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4370 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4371 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4372 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4373 };
// SSE1-only float costs (f32 types only — SSE1 has no f64 support).
// { RecipThroughput, Latency, CodeSize, SizeAndLatency } per entry.
4374 static const CostKindTblEntry SSE1CostTbl[] = {
4375 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4376 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4377 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4378 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4379 };
// BMI (TZCNT) scalar cttz cost, i64 row for 64-bit targets only.
4380 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4381 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4382 };
// BMI (TZCNT) scalar cttz costs for types available on both 32- and
// 64-bit targets; sub-i32 types pay an extra move/extend.
4383 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4384 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4385 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4386 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4387 };
// LZCNT scalar ctlz cost, i64 row for 64-bit targets only.
4388 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4389 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4390 };
// LZCNT scalar ctlz costs for types available on both 32- and 64-bit
// targets; sub-i32 types pay an extra move/extend.
4391 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4392 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4393 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4394 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4395 };
// POPCNT scalar ctpop cost, i64 row for 64-bit targets only.
4396 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4397 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4398 };
// POPCNT scalar ctpop costs for 32/64-bit targets; i16/i8 are costed as a
// popcnt on the zero-extended value, per the row comments.
4399 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4400 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4401 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4402 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4403 };
// Baseline scalar i64 costs, valid only on 64-bit targets (32-bit mode
// must legalize i64 differently). { RecipThroughput, Latency, CodeSize,
// SizeAndLatency } per entry; expansions noted per row.
4404 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4405 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4406 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4407 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4408 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4409 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4410 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4411 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4412 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4413 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4414 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4415 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4416 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4417 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4418 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4419 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4420 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4421 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4422 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4423 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4424 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4425 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4426 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4427 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4428 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4429 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4430 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4431 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4432 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4433 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4434 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4435 };
// Baseline scalar costs for all x86 targets (32- and 64-bit), the final
// fallback table. { RecipThroughput, Latency, CodeSize, SizeAndLatency }
// per entry; expansion sequences noted per row.
4436 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4437 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4438 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4439 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4440 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4441 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4442 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4443 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4444 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4445 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4446 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4447 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4448 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4449 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4450 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4451 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4452 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4453 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4454 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4455 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4456 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4457 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4458 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4459 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4460 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4461 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4462 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4463 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4464 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4465 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4466 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4467 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4468 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4469 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4470 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4471 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4472 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4473 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4474 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4475 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4476 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4477 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4478 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4479 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4480 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4481 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4482 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4483 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4484 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4485 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4486 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4487 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4488 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4489 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4490 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4491 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4492 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4493 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4494 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4495 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4496 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4497 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4498 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4499 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4500 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4501 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4502 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4503 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4504 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4505 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4506 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4507 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4508 };
4509
4510 Type *RetTy = ICA.getReturnType();
4511 Type *OpTy = RetTy;
4512 Intrinsic::ID IID = ICA.getID();
4513 unsigned ISD = ISD::DELETED_NODE;
4514 switch (IID) {
4515 default:
4516 break;
4517 case Intrinsic::abs:
4518 ISD = ISD::ABS;
4519 break;
4520 case Intrinsic::bitreverse:
4522 break;
4523 case Intrinsic::bswap:
4524 ISD = ISD::BSWAP;
4525 break;
4526 case Intrinsic::ctlz:
4527 ISD = ISD::CTLZ;
4528 break;
4529 case Intrinsic::ctpop:
4530 ISD = ISD::CTPOP;
4531 break;
4532 case Intrinsic::cttz:
4533 ISD = ISD::CTTZ;
4534 break;
4535 case Intrinsic::fshl:
4536 ISD = ISD::FSHL;
4537 if (!ICA.isTypeBasedOnly()) {
4538 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4539 if (Args[0] == Args[1]) {
4540 ISD = ISD::ROTL;
4541 // Handle uniform constant rotation amounts.
4542 // TODO: Handle funnel-shift cases.
4543 const APInt *Amt;
4544 if (Args[2] &&
4547 }
4548 }
4549 break;
4550 case Intrinsic::fshr:
4551 // FSHR has same costs so don't duplicate.
4552 ISD = ISD::FSHL;
4553 if (!ICA.isTypeBasedOnly()) {
4554 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4555 if (Args[0] == Args[1]) {
4556 ISD = ISD::ROTR;
4557 // Handle uniform constant rotation amount.
4558 // TODO: Handle funnel-shift cases.
4559 const APInt *Amt;
4560 if (Args[2] &&
4563 }
4564 }
4565 break;
4566 case Intrinsic::lrint:
4567 case Intrinsic::llrint: {
4568 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4569 // have the same costs as the CVTTP2SI (fptosi) instructions
4570 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4571 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4573 }
4574 case Intrinsic::maxnum:
4575 case Intrinsic::minnum:
4576 // FMINNUM has same costs so don't duplicate.
4577 ISD = ISD::FMAXNUM;
4578 break;
4579 case Intrinsic::sadd_sat:
4580 ISD = ISD::SADDSAT;
4581 break;
4582 case Intrinsic::smax:
4583 ISD = ISD::SMAX;
4584 break;
4585 case Intrinsic::smin:
4586 ISD = ISD::SMIN;
4587 break;
4588 case Intrinsic::ssub_sat:
4589 ISD = ISD::SSUBSAT;
4590 break;
4591 case Intrinsic::uadd_sat:
4592 ISD = ISD::UADDSAT;
4593 break;
4594 case Intrinsic::umax:
4595 ISD = ISD::UMAX;
4596 break;
4597 case Intrinsic::umin:
4598 ISD = ISD::UMIN;
4599 break;
4600 case Intrinsic::usub_sat:
4601 ISD = ISD::USUBSAT;
4602 break;
4603 case Intrinsic::sqrt:
4604 ISD = ISD::FSQRT;
4605 break;
4606 case Intrinsic::sadd_with_overflow:
4607 case Intrinsic::ssub_with_overflow:
4608 // SSUBO has same costs so don't duplicate.
4609 ISD = ISD::SADDO;
4610 OpTy = RetTy->getContainedType(0);
4611 break;
4612 case Intrinsic::uadd_with_overflow:
4613 case Intrinsic::usub_with_overflow:
4614 // USUBO has same costs so don't duplicate.
4615 ISD = ISD::UADDO;
4616 OpTy = RetTy->getContainedType(0);
4617 break;
4618 case Intrinsic::smul_with_overflow:
4619 ISD = ISD::SMULO;
4620 OpTy = RetTy->getContainedType(0);
4621 break;
4622 case Intrinsic::umul_with_overflow:
4623 ISD = ISD::UMULO;
4624 OpTy = RetTy->getContainedType(0);
4625 break;
4626 }
4627
4628 if (ISD != ISD::DELETED_NODE) {
4629 auto adjustTableCost = [&](int ISD, unsigned Cost,
4630 std::pair<InstructionCost, MVT> LT,
4632 InstructionCost LegalizationCost = LT.first;
4633 MVT MTy = LT.second;
4634
4635 // If there are no NANs to deal with, then these are reduced to a
4636 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4637 // assume is used in the non-fast case.
4638 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4639 if (FMF.noNaNs())
4640 return LegalizationCost * 1;
4641 }
4642
4643 // For cases where some ops can be folded into a load/store, assume free.
4644 if (MTy.isScalarInteger()) {
4645 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4646 if (const Instruction *II = ICA.getInst()) {
4647 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4648 return TTI::TCC_Free;
4649 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4650 if (LI->hasOneUse())
4651 return TTI::TCC_Free;
4652 }
4653 }
4654 }
4655 }
4656
4657 return LegalizationCost * (int)Cost;
4658 };
4659
4660 // Legalize the type.
4661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4662 MVT MTy = LT.second;
4663
4664 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4665 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4666 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4667 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4668 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4669 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4670 if (Cst->isAllOnesValue())
4672 }
4673
4674 // FSQRT is a single instruction.
4676 return LT.first;
4677
4678 if (ST->useGLMDivSqrtCosts())
4679 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4680 if (auto KindCost = Entry->Cost[CostKind])
4681 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4682
4683 if (ST->useSLMArithCosts())
4684 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4685 if (auto KindCost = Entry->Cost[CostKind])
4686 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4687
4688 if (ST->hasVBMI2())
4689 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4690 if (auto KindCost = Entry->Cost[CostKind])
4691 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4692
4693 if (ST->hasBITALG())
4694 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4695 if (auto KindCost = Entry->Cost[CostKind])
4696 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4697
4698 if (ST->hasVPOPCNTDQ())
4699 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4700 if (auto KindCost = Entry->Cost[CostKind])
4701 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4702
4703 if (ST->hasGFNI())
4704 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4705 if (auto KindCost = Entry->Cost[CostKind])
4706 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4707
4708 if (ST->hasCDI())
4709 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4710 if (auto KindCost = Entry->Cost[CostKind])
4711 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4712
4713 if (ST->hasBWI())
4714 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4715 if (auto KindCost = Entry->Cost[CostKind])
4716 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4717
4718 if (ST->hasAVX512())
4719 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4720 if (auto KindCost = Entry->Cost[CostKind])
4721 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4722
4723 if (ST->hasXOP())
4724 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4725 if (auto KindCost = Entry->Cost[CostKind])
4726 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4727
4728 if (ST->hasAVX2())
4729 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4730 if (auto KindCost = Entry->Cost[CostKind])
4731 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4732
4733 if (ST->hasAVX())
4734 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4735 if (auto KindCost = Entry->Cost[CostKind])
4736 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4737
4738 if (ST->hasSSE42())
4739 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4740 if (auto KindCost = Entry->Cost[CostKind])
4741 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4742
4743 if (ST->hasSSE41())
4744 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4745 if (auto KindCost = Entry->Cost[CostKind])
4746 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4747
4748 if (ST->hasSSSE3())
4749 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4750 if (auto KindCost = Entry->Cost[CostKind])
4751 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4752
4753 if (ST->hasSSE2())
4754 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4755 if (auto KindCost = Entry->Cost[CostKind])
4756 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4757
4758 if (ST->hasSSE1())
4759 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4760 if (auto KindCost = Entry->Cost[CostKind])
4761 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4762
4763 if (ST->hasBMI()) {
4764 if (ST->is64Bit())
4765 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4766 if (auto KindCost = Entry->Cost[CostKind])
4767 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4768
4769 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4770 if (auto KindCost = Entry->Cost[CostKind])
4771 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4772 }
4773
4774 if (ST->hasLZCNT()) {
4775 if (ST->is64Bit())
4776 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4777 if (auto KindCost = Entry->Cost[CostKind])
4778 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4779
4780 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4781 if (auto KindCost = Entry->Cost[CostKind])
4782 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4783 }
4784
4785 if (ST->hasPOPCNT()) {
4786 if (ST->is64Bit())
4787 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4788 if (auto KindCost = Entry->Cost[CostKind])
4789 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4790
4791 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4792 if (auto KindCost = Entry->Cost[CostKind])
4793 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4794 }
4795
4796 if (ST->is64Bit())
4797 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4798 if (auto KindCost = Entry->Cost[CostKind])
4799 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4800
4801 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4802 if (auto KindCost = Entry->Cost[CostKind])
4803 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4804
4805 // Without arg data, we need to compute the expanded costs of custom lowered
4806 // intrinsics to prevent use of the (very low) default costs.
4807 if (ICA.isTypeBasedOnly() &&
4808 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4809 Type *CondTy = RetTy->getWithNewBitWidth(1);
4811 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4812 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4813 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4814 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4815 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4816 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4818 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4820 return Cost;
4821 }
4822 }
4823
4825}
4826
// X86TTIImpl::getVectorInstrCost — cost of a single InsertElement /
// ExtractElement on X86. Models pinsr/pextr/insertps/movd/movq, MOVMSK for
// vXi1 extracts, stack spill+reload for non-immediate indices, and the extra
// subvector moves needed on >128-bit (YMM/ZMM) vectors.
// NOTE(review): this listing is a doc-extraction; the function header line
// (4827) and a few interior lines (e.g. 4840, 4927-4931) are absent. The code
// below is preserved verbatim; gaps are marked with hedged notes.
4828 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4829 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Silvermont-specific extract costs: pextr is notably slow on SLM, and i64
// extraction is the worst case.
4830 static const CostTblEntry SLMCostTbl[] = {
4831 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4832 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4833 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4834 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4835 };
4836
4837 assert(Val->isVectorTy() && "This must be a vector type");
4838 auto *VT = cast<VectorType>(Val);
4839 if (VT->isScalableTy())
// (line 4840 missing in this listing — presumably bails out with an invalid
// cost for scalable vectors, which X86 does not support; confirm upstream.)
4841
4842 Type *ScalarType = Val->getScalarType();
// Accumulates the cost of moving 128-bit subvectors in/out when the element
// lives in an upper lane of a YMM/ZMM register.
4843 InstructionCost RegisterFileMoveCost = 0;
4844
4845 // Non-immediate extraction/insertion can be handled as a sequence of
4846 // aliased loads+stores via the stack.
4847 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4848 Opcode == Instruction::InsertElement)) {
4849 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4850 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4851
4852 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4853 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4854 Align VecAlign = DL.getPrefTypeAlign(Val);
4855 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4856
4857 // Extract - store vector to stack, load scalar.
4858 if (Opcode == Instruction::ExtractElement) {
4859 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4860 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4861 CostKind);
4862 }
4863 // Insert - store vector to stack, store scalar, load vector.
4864 if (Opcode == Instruction::InsertElement) {
4865 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4866 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4867 CostKind) +
4868 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4869 }
4870 }
4871
// Known (immediate) index: model the actual instruction sequence.
4872 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4873 Opcode == Instruction::InsertElement)) {
4874 // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
4875 if (Opcode == Instruction::ExtractElement &&
4876 ScalarType->getScalarSizeInBits() == 1 &&
4877 cast<FixedVectorType>(Val)->getNumElements() > 1)
4878 return 1;
4879
4880 // Legalize the type.
4881 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4882
4883 // This type is legalized to a scalar type.
4884 if (!LT.second.isVector())
4885 return TTI::TCC_Free;
4886
4887 // The type may be split. Normalize the index to the new type.
4888 unsigned SizeInBits = LT.second.getSizeInBits();
4889 unsigned NumElts = LT.second.getVectorNumElements();
4890 unsigned SubNumElts = NumElts;
4891 Index = Index % NumElts;
4892
4893 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4894 // For inserts, we also need to insert the subvector back.
4895 if (SizeInBits > 128) {
4896 assert((SizeInBits % 128) == 0 && "Illegal vector");
4897 unsigned NumSubVecs = SizeInBits / 128;
4898 SubNumElts = NumElts / NumSubVecs;
4899 if (SubNumElts <= Index) {
// Insert pays twice: extract the 128-bit lane, then re-insert it.
4900 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4901 Index %= SubNumElts;
4902 }
4903 }
4904
4905 MVT MScalarTy = LT.second.getScalarType();
4906 auto IsCheapPInsrPExtrInsertPS = [&]() {
4907 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4908 // Inserting f32 into index0 is just movss.
4909 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4910 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4911 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4912 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4913 Opcode == Instruction::InsertElement) ||
4914 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4915 Opcode == Instruction::InsertElement);
4916 };
4917
4918 if (Index == 0) {
4919 // Floating point scalars are already located in index #0.
4920 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4921 // true for all.
4922 if (ScalarType->isFloatingPointTy() &&
4923 (Opcode != Instruction::InsertElement || !Op0 ||
4924 isa<UndefValue>(Op0)))
4925 return RegisterFileMoveCost;
4926
4927 if (Opcode == Instruction::InsertElement &&
// (lines 4928 and 4930 missing in this listing — presumably a condition on
// the insertion context/source; confirm against upstream before relying on
// the exact predicate here.)
4929 // Consider the gather cost to be cheap.
4931 return RegisterFileMoveCost;
4932 if (!IsCheapPInsrPExtrInsertPS()) {
4933 // mov constant-to-GPR + movd/movq GPR -> XMM.
4934 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4935 return 2 + RegisterFileMoveCost;
4936 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4937 return 1 + RegisterFileMoveCost;
4938 }
4939 }
4940
4941 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4942 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4943 return 1 + RegisterFileMoveCost;
4944 }
4945
4946 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4947 assert(ISD && "Unexpected vector opcode");
// Silvermont override: pextr is slow there (see SLMCostTbl above).
4948 if (ST->useSLMArithCosts())
4949 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4950 return Entry->Cost + RegisterFileMoveCost;
4951
4952 // Consider cheap cases.
4953 if (IsCheapPInsrPExtrInsertPS())
4954 return 1 + RegisterFileMoveCost;
4955
4956 // For extractions we just need to shuffle the element to index 0, which
4957 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4958 // the elements to its destination. In both cases we must handle the
4959 // subvector move(s).
4960 // If the vector type is already less than 128-bits then don't reduce it.
4961 // TODO: Under what circumstances should we shuffle using the full width?
4962 InstructionCost ShuffleCost = 1;
4963 if (Opcode == Instruction::InsertElement) {
4964 auto *SubTy = cast<VectorType>(Val);
4965 EVT VT = TLI->getValueType(DL, Val);
4966 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4967 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4968 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4969 CostKind, 0, SubTy);
4970 }
// FP elements avoid the extra XMM<->GPR transfer; integers pay 1 for it.
4971 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4972 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4973 }
4974
// Fall back to the target-independent model, plus any subvector move cost
// accumulated above.
4975 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
4976 VIC) +
4977 RegisterFileMoveCost;
4978}
4979
// X86TTIImpl::getScalarizationOverhead — cost of inserting and/or extracting
// the demanded elements of fixed vector Ty. Models cheap BUILD_VECTOR-style
// initialization (for !ForPoisonSrc inserts), per-128-bit-lane
// extracti128/inserti128 traffic on wide vectors, and MOVMSK for vXi1
// extracts.
// NOTE(review): doc-extraction listing; the signature/return-type line (4980)
// and several interior lines (4991, 5014-5016, 5066, 5084-5085, 5102, 5129,
// 5144-5147) are absent. Code below is preserved verbatim with hedged notes
// at the gaps.
4981 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4982 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4983 TTI::VectorInstrContext VIC) const {
4984 assert(DemandedElts.getBitWidth() ==
4985 cast<FixedVectorType>(Ty)->getNumElements() &&
4986 "Vector size mismatch");
4987
4988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4989 MVT MScalarTy = LT.second.getScalarType();
4990 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
// (line 4991 missing — presumably declares the running `Cost` accumulator
// used throughout; confirm upstream.)
4992
// Wide vectors are processed in 128-bit lanes below.
4993 constexpr unsigned LaneBitWidth = 128;
4994 assert((LegalVectorBitWidth < LaneBitWidth ||
4995 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4996 "Illegal vector");
4997
4998 const int NumLegalVectors = LT.first.getValue();
4999 assert(NumLegalVectors >= 0 && "Negative cost!");
5000
5001 // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
5002 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
5003 // a special heuristic regarding poison input which is passed here in
5004 // ForPoisonSrc.
5005 if (Insert && !ForPoisonSrc) {
5006 // This is nearly identical to BaseT::getScalarizationOverhead(), except
5007 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
5008 // Constant::getNullValue()), which makes the X86TTIImpl
5009 // getVectorInstrCost() return 0 instead of 1.
5010 for (unsigned I : seq(DemandedElts.getBitWidth())) {
5011 if (!DemandedElts[I])
5012 continue;
5013 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
// (lines 5014 and 5016 missing — remaining getVectorInstrCost arguments,
// presumably nullptr Op0 per the comment above plus VIC; confirm upstream.)
5015 VL.empty() ? nullptr : VL[I],
5017 }
5018 return Cost;
5019 }
5020
5021 if (Insert) {
// Same "fast insertion" predicate as getVectorInstrCost's cheap-pinsr check.
5022 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
5023 (MScalarTy.isInteger() && ST->hasSSE41()) ||
5024 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
5025 // For types we can insert directly, insertion into 128-bit sub vectors is
5026 // cheap, followed by a cheap chain of concatenations.
5027 if (LegalVectorBitWidth <= LaneBitWidth) {
5028 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5029 /*Extract*/ false, CostKind);
5030 } else {
5031 // In each 128-lane, if at least one index is demanded but not all
5032 // indices are demanded and this 128-lane is not the first 128-lane of
5033 // the legalized-vector, then this 128-lane needs a extracti128; If in
5034 // each 128-lane, there is at least one demanded index, this 128-lane
5035 // needs a inserti128.
5036
5037 // The following cases will help you build a better understanding:
5038 // Assume we insert several elements into a v8i32 vector in avx2,
5039 // Case#1: inserting into 1th index needs vpinsrd + inserti128.
5040 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
5041 // inserti128.
5042 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
5043 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5044 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5045 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5046 unsigned NumLegalElts =
5047 LT.second.getVectorNumElements() * NumLegalVectors;
5048 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5049 "Vector has been legalized to smaller element count");
5050 assert((NumLegalElts % NumLanesTotal) == 0 &&
5051 "Unexpected elts per lane");
5052 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5053
// Widen the demanded mask to the legalized element count, then walk lanes.
5054 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5055 auto *LaneTy =
5056 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5057
5058 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5059 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5060 NumEltsPerLane, NumEltsPerLane * I);
5061 if (LaneEltMask.isZero())
5062 continue;
5063 // FIXME: we don't need to extract if all non-demanded elements
5064 // are legalization-inserted padding.
5065 if (!LaneEltMask.isAllOnes())
// (line 5066 missing — presumably adds a subvector-extract shuffle cost
// here; confirm upstream.)
5067 CostKind, I * NumEltsPerLane, LaneTy);
5068 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5069 /*Extract*/ false, CostKind);
5070 }
5071
// Re-insert each affected lane, except lane 0 of a legal vector whose
// lanes were ALL affected (building the whole vector covers it).
5072 APInt AffectedLanes =
5073 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5074 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5075 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5076 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5077 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5078 unsigned I = NumLegalLanes * LegalVec + Lane;
5079 // No need to insert unaffected lane; or lane 0 of each legal vector
5080 // iff ALL lanes of that vector were affected and will be inserted.
5081 if (!AffectedLanes[I] ||
5082 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5083 continue;
// (lines 5084-5085 partially missing — presumably adds a
// subvector-insert shuffle cost; confirm upstream.)
5085 CostKind, I * NumEltsPerLane, LaneTy);
5086 }
5087 }
5088 }
5089 } else if (LT.second.isVector()) {
5090 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5091 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5092 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5093 // considered cheap.
5094 if (Ty->isIntOrIntVectorTy())
5095 Cost += DemandedElts.popcount();
5096
5097 // Get the smaller of the legalized or original pow2-extended number of
5098 // vector elements, which represents the number of unpacks we'll end up
5099 // performing.
5100 unsigned NumElts = LT.second.getVectorNumElements();
5101 unsigned Pow2Elts =
// (line 5102 missing — presumably rounds the source element count up to a
// power of two; confirm upstream.)
5103 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5104 }
5105 }
5106
5107 if (Extract) {
5108 // vXi1 can be efficiently extracted with MOVMSK.
5109 // TODO: AVX512 predicate mask handling.
5110 // NOTE: This doesn't work well for roundtrip scalarization.
5111 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5112 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
// One MOVMSK handles up to 32 elements with AVX2 (vpmovmskb), else 16.
5113 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5114 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5115 return MOVMSKCost;
5116 }
5117
5118 if (LT.second.isVector()) {
5119 unsigned NumLegalElts =
5120 LT.second.getVectorNumElements() * NumLegalVectors;
5121 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5122 "Vector has been legalized to smaller element count");
5123
5124 // If we're extracting elements from a 128-bit subvector lane,
5125 // we only need to extract each lane once, not for every element.
5126 if (LegalVectorBitWidth > LaneBitWidth) {
5127 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5128 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5129 assert((NumLegalElts % NumLanesTotal) == 0 &&
5130 "Unexpected elts per lane");
5131 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5132
5133 // Add cost for each demanded 128-bit subvector extraction.
5134 // Luckily this is a lot easier than for insertion.
5135 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5136 auto *LaneTy =
5137 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5138
5139 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5140 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5141 NumEltsPerLane, I * NumEltsPerLane);
5142 if (LaneEltMask.isZero())
5143 continue;
// (lines 5144 and 5146 missing — presumably a subvector-extract shuffle
// cost plus the per-lane scalarization overhead accumulation; confirm
// upstream.)
5145 I * NumEltsPerLane, LaneTy);
5147 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5148 }
5149
5150 return Cost;
5151 }
5152 }
5153
5154 // Fallback to default extraction.
5155 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5156 Extract, CostKind);
5157 }
5158
5159 return Cost;
5160}
5161
// X86TTIImpl::getReplicationShuffleCost — cost of replicating each of the VF
// source elements ReplicationFactor times (a <VF x T> -> <VF*RF x T>
// "replication" shuffle). Handled natively only for AVX512; element widths
// without a native shuffle are promoted (anyext + shuffle + trunc), and i1
// always promotes since i1 elements cannot be shuffled directly.
// NOTE(review): doc-extraction listing; the return-type line (5162), the
// CostKind parameter line (5165), and a few interior lines (5233, 5237) are
// absent. Code below is preserved verbatim with hedged notes at the gaps.
5163 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5164 int VF, const APInt &DemandedDstElts,
5166 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5167 // We don't differentiate element types here, only element bit width.
5168 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5169
// Shared fallback to the generic (target-independent) cost model.
5170 auto bailout = [&]() {
5171 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5172 DemandedDstElts, CostKind);
5173 };
5174
5175 // For now, only deal with AVX512 cases.
5176 if (!ST->hasAVX512())
5177 return bailout();
5178
5179 // Do we have a native shuffle for this element type, or should we promote?
5180 unsigned PromEltTyBits = EltTyBits;
5181 switch (EltTyBits) {
5182 case 32:
5183 case 64:
5184 break; // AVX512F.
5185 case 16:
5186 if (!ST->hasBWI())
5187 PromEltTyBits = 32; // promote to i32, AVX512F.
5188 break; // AVX512BW
5189 case 8:
5190 if (!ST->hasVBMI())
5191 PromEltTyBits = 32; // promote to i32, AVX512F.
5192 break; // AVX512VBMI
5193 case 1:
5194 // There is no support for shuffling i1 elements. We *must* promote.
5195 if (ST->hasBWI()) {
5196 if (ST->hasVBMI())
5197 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5198 else
5199 PromEltTyBits = 16; // promote to i16, AVX512BW.
5200 break;
5201 }
5202 PromEltTyBits = 32; // promote to i32, AVX512F.
5203 break;
5204 default:
5205 return bailout();
5206 }
5207 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5208
5209 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5210 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5211
5212 int NumDstElements = VF * ReplicationFactor;
5213 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5214 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5215
5216 // Legalize the types.
5217 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5218 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5219 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5220 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5221 // They should have legalized into vector types.
5222 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5223 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5224 return bailout();
5225
5226 if (PromEltTyBits != EltTyBits) {
5227 // If we have to perform the shuffle with wider elt type than our data type,
5228 // then we will first need to anyext (we don't care about the new bits)
5229 // the source elements, and then truncate Dst elements.
5230 InstructionCost PromotionCost;
5231 PromotionCost += getCastInstrCost(
5232 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
// (line 5233 missing — presumably the CastContextHint/CostKind arguments to
// this getCastInstrCost call; confirm upstream.)
5234 PromotionCost +=
5235 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5236 /*Src=*/PromDstVecTy,
// (line 5237 missing — presumably the CastContextHint/CostKind arguments;
// confirm upstream.)
// Recurse with the promoted element type; recursion terminates because the
// promoted width always has a native shuffle case above.
5238 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5239 ReplicationFactor, VF,
5240 DemandedDstElts, CostKind);
5241 }
5242
5243 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5244 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5245 "We expect that the legalization doesn't affect the element width, "
5246 "doesn't coalesce/split elements.");
5247
5248 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5249 unsigned NumDstVectors =
5250 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5251
5252 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5253
5254 // Not all the produced Dst elements may be demanded. In our case,
5255 // given that a single Dst vector is formed by a single shuffle,
5256 // if all elements that will form a single Dst vector aren't demanded,
5257 // then we won't need to do that shuffle, so adjust the cost accordingly.
5258 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5259 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5260 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5261
5262 InstructionCost SingleShuffleCost =
5263 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5264 /*Mask=*/{}, CostKind,
5265 /*Index=*/0, /*SubTp=*/nullptr);
5266 return NumDstVectorsDemanded * SingleShuffleCost;
5267}
5268
5270 Align Alignment,
5271 unsigned AddressSpace,
5273 TTI::OperandValueInfo OpInfo,
5274 const Instruction *I) const {
5275 // TODO: Handle other cost kinds.
5277 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5278 // Store instruction with index and scale costs 2 Uops.
5279 // Check the preceding GEP to identify non-const indices.
5280 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5281 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5282 return TTI::TCC_Basic * 2;
5283 }
5284 }
5285 return TTI::TCC_Basic;
5286 }
5287
5288 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5289 "Invalid Opcode");
5290 // Type legalization can't handle structs
5291 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5292 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5293 CostKind, OpInfo, I);
5294
5295 // Legalize the type.
5296 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5297
5298 auto *VTy = dyn_cast<FixedVectorType>(Src);
5299
5301
5302 // Add a cost for constant load to vector.
5303 if (Opcode == Instruction::Store && OpInfo.isConstant())
5304 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5305 /*AddressSpace=*/0, CostKind, OpInfo);
5306
5307 // Handle the simple case of non-vectors.
5308 // NOTE: this assumes that legalization never creates vector from scalars!
5309 if (!VTy || !LT.second.isVector()) {
5310 // Each load/store unit costs 1.
5311 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5312 }
5313
5314 bool IsLoad = Opcode == Instruction::Load;
5315
5316 Type *EltTy = VTy->getElementType();
5317
5318 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5319
5320 // Source of truth: how many elements were there in the original IR vector?
5321 const unsigned SrcNumElt = VTy->getNumElements();
5322
5323 // How far have we gotten?
5324 int NumEltRemaining = SrcNumElt;
5325 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5326 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5327
5328 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5329
5330 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5331 const unsigned XMMBits = 128;
5332 if (XMMBits % EltTyBits != 0)
5333 // Vector size must be a multiple of the element size. I.e. no padding.
5334 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5335 CostKind, OpInfo, I);
5336 const int NumEltPerXMM = XMMBits / EltTyBits;
5337
5338 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5339
5340 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5341 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5342 // How many elements would a single op deal with at once?
5343 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5344 // Vector size must be a multiple of the element size. I.e. no padding.
5345 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5346 CostKind, OpInfo, I);
5347 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5348
5349 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5350 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5351 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5352 "Unless we haven't halved the op size yet, "
5353 "we have less than two op's sized units of work left.");
5354
5355 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5356 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5357 : XMMVecTy;
5358
5359 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5360 "After halving sizes, the vector elt count is no longer a multiple "
5361 "of number of elements per operation?");
5362 auto *CoalescedVecTy =
5363 CurrNumEltPerOp == 1
5364 ? CurrVecTy
5366 IntegerType::get(Src->getContext(),
5367 EltTyBits * CurrNumEltPerOp),
5368 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5369 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5370 DL.getTypeSizeInBits(CurrVecTy) &&
5371 "coalesciing elements doesn't change vector width.");
5372
5373 while (NumEltRemaining > 0) {
5374 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
5375
5376 // Can we use this vector size, as per the remaining element count?
5377 // Iff the vector is naturally aligned, we can do a wide load regardless.
5378 if (NumEltRemaining < CurrNumEltPerOp &&
5379 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5380 break; // Try smalled vector size.
5381
5382 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5383 // as a proxy for a double-pumped AVX memory interface such as on
5384 // Sandybridge.
5385 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5386 // will be scalarized.
5387 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5388 Cost += 2;
5389 else if (CurrOpSizeBytes < 4)
5390 Cost += 2;
5391 else
5392 Cost += 1;
5393
5394 // If we're loading a uniform value, then we don't need to split the load,
5395 // loading just a single (widest) vector can be reused by all splits.
5396 if (IsLoad && OpInfo.isUniform())
5397 return Cost;
5398
5399 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5400
5401 // If we have fully processed the previous reg, we need to replenish it.
5402 if (SubVecEltsLeft == 0) {
5403 SubVecEltsLeft += CurrVecTy->getNumElements();
5404 // And that's free only for the 0'th subvector of a legalized vector.
5405 if (!Is0thSubVec)
5406 Cost +=
5409 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5410 }
5411
5412 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5413 // for smaller widths (32/16/8) we have to insert/extract them separately.
5414 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5415 // but let's pretend that it is also true for 16/8 bit wide ops...)
5416 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5417 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5418 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5419 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5420 APInt DemandedElts =
5421 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5422 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5423 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5424 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5425 !IsLoad, CostKind);
5426 }
5427
5428 SubVecEltsLeft -= CurrNumEltPerOp;
5429 NumEltRemaining -= CurrNumEltPerOp;
5430 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5431 }
5432 }
5433
5434 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5435
5436 return Cost;
5437}
5438
// Dispatch a memory-intrinsic cost query to the specialized X86 handler:
// gathers/scatters and masked loads/stores each have their own cost model.
// NOTE(review): the function signature (listing lines ~5439-5441) and the
// fallthrough line 5450 are elided in this listing; the fallthrough
// presumably defers to the base implementation — confirm in full source.
5442 switch (MICA.getID()) {
5443 case Intrinsic::masked_scatter:
5444 case Intrinsic::masked_gather:
5445 return getGatherScatterOpCost(MICA, CostKind);
5446 case Intrinsic::masked_load:
5447 case Intrinsic::masked_store:
5448 return getMaskedMemoryOpCost(MICA, CostKind);
5449 }
5451}
5452
// Cost of a masked load/store. Scalar types fall back to the regular memory
// op cost; illegal masked ops are costed as full scalarization (branch + cmp
// per element); otherwise the legalized vector op is costed, with extra
// shuffles when the mask/data need promotion or widening.
// NOTE(review): the signature (listing ~5453-5455) and the declarations on
// elided lines 5477, 5481, 5484 and 5495 (MaskSplitCost / predicate arg /
// ValueSplitCost / Cost) are missing from this listing — confirm in source.
5456 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
5457 : Instruction::Store;
5458 Type *SrcTy = MICA.getDataType();
5459 Align Alignment = MICA.getAlignment();
5460 unsigned AddressSpace = MICA.getAddressSpace();
5461
5462 bool IsLoad = (Instruction::Load == Opcode);
5463 bool IsStore = (Instruction::Store == Opcode);
5464
5465 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5466 if (!SrcVTy)
5467 // To calculate scalar take the regular cost, without mask
5468 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5469
5470 unsigned NumElem = SrcVTy->getNumElements();
// An i8 element per lane models the mask values.
5471 auto *MaskTy =
5472 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
// If the target cannot do this masked op natively, cost it as a fully
// scalarized sequence: per-element compare + branch + scalar memory op.
5473 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5474 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5475 // Scalarization
5476 APInt DemandedElts = APInt::getAllOnes(NumElem);
5478 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5479 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5480 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5482 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5483 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5485 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5486 InstructionCost MemopCost =
5487 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5488 Alignment, AddressSpace, CostKind);
5489 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5490 }
5491
5492 // Legalize the type.
5493 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5494 auto VT = TLI->getValueType(DL, SrcVTy);
5496 MVT Ty = LT.second;
5497 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5498 // APX masked load/store for scalar is cheap.
5499 return Cost + LT.first;
5500
// Legalization promoted the element type but kept the element count: the
// data needs extend/truncate and the mask needs a matching shuffle.
5501 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5502 LT.second.getVectorNumElements() == NumElem)
5503 // Promotion requires extend/truncate for data and a shuffle for mask.
5504 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5505 0, nullptr) +
5506 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5507 0, nullptr);
5508
// Legalization widened the vector: the mask must be widened and the new
// lanes zero-filled, modeled as an insert_subvector shuffle.
5509 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5510 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5511 (unsigned)LT.first.getValue() *
5512 Ty.getVectorNumElements());
5513 // Expanding requires fill mask with zeroes
5514 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5515 CostKind, 0, MaskTy);
5516 }
5517
5518 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5519 if (!ST->hasAVX512())
5520 return Cost + LT.first * (IsLoad ? 2 : 8);
5521
5522 // AVX-512 masked load/store is cheaper
5523 return Cost + LT.first;
5524}
5525
// Cost of a chain of pointers sharing one base. With a known stride, every
// pointer difference is a compile-time constant that X86 addressing modes
// fold into the displacement, so only the base GEP itself costs anything.
// NOTE(review): the signature head/tail (listing lines 5526 and 5529) are
// elided in this listing.
5527 ArrayRef<const Value *> Ptrs, const Value *Base,
5528 const TTI::PointersChainInfo &Info, Type *AccessTy,
5530 if (Info.isSameBase() && Info.isKnownStride()) {
5531 // If all the pointers have known stride all the differences are translated
5532 // into constants. X86 memory addressing allows encoding it into
5533 // displacement. So we just need to take the base GEP cost.
5534 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5535 SmallVector<const Value *> Indices(BaseGEP->indices());
5536 return getGEPCost(BaseGEP->getSourceElementType(),
5537 BaseGEP->getPointerOperand(), Indices, nullptr,
5538 CostKind);
5539 }
// Non-GEP base: nothing to compute beyond the already-free base value.
5540 return TTI::TCC_Free;
5541 }
// Different bases or unknown stride: defer to the generic model.
5542 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5543}
5544
// Cost of computing a (possibly vector of) address(es). Non-strided vector
// address computation on pre-AVX2 targets is penalized heavily; strided but
// non-constant strides cost one extra ADD.
// NOTE(review): the signature head (listing lines 5545-5546, including the
// PtrTy/SE parameters) is elided in this listing.
5547 const SCEV *Ptr,
5549 // Address computations in vectorized code with non-consecutive addresses will
5550 // likely result in more instructions compared to scalar code where the
5551 // computation can more often be merged into the index mode. The resulting
5552 // extra micro-ops can significantly decrease throughput.
5553 const unsigned NumVectorInstToHideOverhead = 10;
5554
5555 // Cost modeling of Strided Access Computation is hidden by the indexing
5556 // modes of X86 regardless of the stride value. We don't believe that there
5557 // is a difference between constant strided access in general and constant
5558 // strided value which is less than or equal to 64.
5559 // Even in the case of (loop invariant) stride whose value is not known at
5560 // compile time, the address computation will not incur more than one extra
5561 // ADD instruction.
5562 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5563 // TODO: AVX2 is the current cut-off because we don't have correct
5564 // interleaving costs for prior ISA's.
5565 if (!BaseT::isStridedAccess(Ptr))
5566 return NumVectorInstToHideOverhead;
5567 if (!BaseT::getConstantStrideStep(SE, Ptr))
5568 return 1;
5569 }
5570
5571 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5572}
5573
// Cost of a horizontal arithmetic reduction (add/fadd/mul/and/or/...) over a
// fixed vector. Strategy: consult per-ISA measured tables first (before and
// after legalization), special-case vXi8 multiplies and vXi1 bool
// reductions, then fall back to an explicit log2 shuffle+op ladder.
// NOTE(review): the signature (listing ~5574-5575), an early-out condition
// (5577-5578), and argument lines 5649, 5730 and 5732 are elided in this
// listing — confirm against the full source.
5576 std::optional<FastMathFlags> FMF,
5579 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5580
5581 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5582 // and make it as the cost.
5583
5584 static const CostTblEntry SLMCostTbl[] = {
5585 { ISD::FADD, MVT::v2f64, 3 },
5586 { ISD::ADD, MVT::v2i64, 5 },
5587 };
5588
5589 static const CostTblEntry SSE2CostTbl[] = {
5590 { ISD::FADD, MVT::v2f64, 2 },
5591 { ISD::FADD, MVT::v2f32, 2 },
5592 { ISD::FADD, MVT::v4f32, 4 },
5593 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5594 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5595 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5596 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5597 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5598 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5599 { ISD::ADD, MVT::v2i8, 2 },
5600 { ISD::ADD, MVT::v4i8, 2 },
5601 { ISD::ADD, MVT::v8i8, 2 },
5602 { ISD::ADD, MVT::v16i8, 3 },
5603 };
5604
5605 static const CostTblEntry AVX1CostTbl[] = {
5606 { ISD::FADD, MVT::v4f64, 3 },
5607 { ISD::FADD, MVT::v4f32, 3 },
5608 { ISD::FADD, MVT::v8f32, 4 },
5609 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5610 { ISD::ADD, MVT::v4i64, 3 },
5611 { ISD::ADD, MVT::v8i32, 5 },
5612 { ISD::ADD, MVT::v16i16, 5 },
5613 { ISD::ADD, MVT::v32i8, 4 },
5614 };
5615
5616 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5617 assert(ISD && "Invalid opcode");
5618
5619 // Before legalizing the type, give a chance to look up illegal narrow types
5620 // in the table.
5621 // FIXME: Is there a better way to do this?
5622 EVT VT = TLI->getValueType(DL, ValTy);
5623 if (VT.isSimple()) {
5624 MVT MTy = VT.getSimpleVT();
5625 if (ST->useSLMArithCosts())
5626 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5627 return Entry->Cost;
5628
5629 if (ST->hasAVX())
5630 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5631 return Entry->Cost;
5632
5633 if (ST->hasSSE2())
5634 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5635 return Entry->Cost;
5636 }
5637
5638 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5639
5640 MVT MTy = LT.second;
5641
5642 auto *ValVTy = cast<FixedVectorType>(ValTy);
5643
5644 // Special case: vXi8 mul reductions are performed as vXi16.
5645 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5646 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5647 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5648 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5650 CostKind) +
5651 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5652 }
5653
// If legalization splits the vector, LT.first - 1 element-wise ops combine
// the split halves before the final single-register reduction.
5654 InstructionCost ArithmeticCost = 0;
5655 if (LT.first != 1 && MTy.isVector() &&
5656 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5657 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5658 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5659 MTy.getVectorNumElements());
5660 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5661 ArithmeticCost *= LT.first - 1;
5662 }
5663
5664 if (ST->useSLMArithCosts())
5665 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5666 return ArithmeticCost + Entry->Cost;
5667
5668 if (ST->hasAVX())
5669 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5670 return ArithmeticCost + Entry->Cost;
5671
5672 if (ST->hasSSE2())
5673 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5674 return ArithmeticCost + Entry->Cost;
5675
5676 // FIXME: These assume a naive kshift+binop lowering, which is probably
5677 // conservative in most cases.
5678 static const CostTblEntry AVX512BoolReduction[] = {
5679 { ISD::AND, MVT::v2i1, 3 },
5680 { ISD::AND, MVT::v4i1, 5 },
5681 { ISD::AND, MVT::v8i1, 7 },
5682 { ISD::AND, MVT::v16i1, 9 },
5683 { ISD::AND, MVT::v32i1, 11 },
5684 { ISD::AND, MVT::v64i1, 13 },
5685 { ISD::OR, MVT::v2i1, 3 },
5686 { ISD::OR, MVT::v4i1, 5 },
5687 { ISD::OR, MVT::v8i1, 7 },
5688 { ISD::OR, MVT::v16i1, 9 },
5689 { ISD::OR, MVT::v32i1, 11 },
5690 { ISD::OR, MVT::v64i1, 13 },
5691 };
5692
5693 static const CostTblEntry AVX2BoolReduction[] = {
5694 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5695 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5696 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5697 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5698 };
5699
5700 static const CostTblEntry AVX1BoolReduction[] = {
5701 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5702 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5703 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5704 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5705 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5706 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5707 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5708 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5709 };
5710
5711 static const CostTblEntry SSE2BoolReduction[] = {
5712 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5713 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5714 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5715 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5716 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5717 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5718 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5719 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5720 };
5721
5722 // Handle bool allof/anyof patterns.
5723 if (ValVTy->getElementType()->isIntegerTy(1)) {
5724 if (ISD == ISD::ADD) {
5725 // vXi1 addition reduction will bitcast to scalar and perform a popcount.
5726 auto *IntTy = IntegerType::getIntNTy(ValVTy->getContext(),
5727 ValVTy->getNumElements());
5728 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy});
5729 return getCastInstrCost(Instruction::BitCast, IntTy, ValVTy,
5731 CostKind) +
5733 }
5734
5735 InstructionCost ArithmeticCost = 0;
5736 if (LT.first != 1 && MTy.isVector() &&
5737 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5738 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5739 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5740 MTy.getVectorNumElements());
5741 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5742 ArithmeticCost *= LT.first - 1;
5743 }
5744
5745 if (ST->hasAVX512())
5746 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5747 return ArithmeticCost + Entry->Cost;
5748 if (ST->hasAVX2())
5749 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5750 return ArithmeticCost + Entry->Cost;
5751 if (ST->hasAVX())
5752 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5753 return ArithmeticCost + Entry->Cost;
5754 if (ST->hasSSE2())
5755 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5756 return ArithmeticCost + Entry->Cost;
5757
5758 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5759 }
5760
5761 unsigned NumVecElts = ValVTy->getNumElements();
5762 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5763
5764 // Special case power of 2 reductions where the scalar type isn't changed
5765 // by type legalization.
5766 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5767 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5768
5769 InstructionCost ReductionCost = 0;
5770
5771 auto *Ty = ValVTy;
5772 if (LT.first != 1 && MTy.isVector() &&
5773 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5774 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5775 Ty = FixedVectorType::get(ValVTy->getElementType(),
5776 MTy.getVectorNumElements());
5777 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5778 ReductionCost *= LT.first - 1;
5779 NumVecElts = MTy.getVectorNumElements();
5780 }
5781
// Explicit pairwise ladder: each iteration halves the element count, using
// subvector extract (>128 bits), a permute (128/64 bits) or an immediate
// shift (<64 bits) to bring the upper half down, plus one binop per level.
5782 // Now handle reduction with the legal type, taking into account size changes
5783 // at each level.
5784 while (NumVecElts > 1) {
5785 // Determine the size of the remaining vector we need to reduce.
5786 unsigned Size = NumVecElts * ScalarSize;
5787 NumVecElts /= 2;
5788 // If we're reducing from 256/512 bits, use an extract_subvector.
5789 if (Size > 128) {
5790 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5791 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5792 CostKind, NumVecElts, SubTy);
5793 Ty = SubTy;
5794 } else if (Size == 128) {
5795 // Reducing from 128 bits is a permute of v2f64/v2i64.
5796 FixedVectorType *ShufTy;
5797 if (ValVTy->isFloatingPointTy())
5798 ShufTy =
5799 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5800 else
5801 ShufTy =
5802 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5803 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5804 {}, CostKind, 0, nullptr);
5805 } else if (Size == 64) {
5806 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5807 FixedVectorType *ShufTy;
5808 if (ValVTy->isFloatingPointTy())
5809 ShufTy =
5810 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5811 else
5812 ShufTy =
5813 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5814 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5815 {}, CostKind, 0, nullptr);
5816 } else {
5817 // Reducing from smaller size is a shift by immediate.
5818 auto *ShiftTy = FixedVectorType::get(
5819 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5820 ReductionCost += getArithmeticInstrCost(
5821 Instruction::LShr, ShiftTy, CostKind,
5824 }
5825
5826 // Add the arithmetic op for this level.
5827 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5828 }
5829
5830 // Add the final extract element to the cost.
5831 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5832 CostKind, 0, nullptr, nullptr,
5834}
5835
// Cost of a single (element-wise) min/max operation, modeled as the
// corresponding two-operand intrinsic call on type Ty.
// NOTE(review): the signature head (listing lines 5836-5837, carrying the
// IID/Ty/CostKind parameters) is elided in this listing.
5838 FastMathFlags FMF) const {
5839 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5840 return getIntrinsicInstrCost(ICA, CostKind);
5841}
5842
// Cost of a horizontal min/max reduction. Maps the intrinsic ID onto an ISD
// opcode, consults measured per-ISA tables (before and after legalization),
// and otherwise falls back to a log2 shuffle + min/max ladder ending in an
// element extract.
// NOTE(review): the signature head (listing ~5843-5844) and argument lines
// 5845-5846, 6002-6003 and 6013 are elided in this listing — confirm in the
// full source.
5847 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5848
5849 MVT MTy = LT.second;
5850
// Map the min/max intrinsic family to the matching ISD node; the table
// entries below only distinguish signed vs unsigned (and fp variants).
5851 int ISD;
5852 if (ValTy->isIntOrIntVectorTy()) {
5853 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5854 : ISD::SMIN;
5855 } else {
5856 assert(ValTy->isFPOrFPVectorTy() &&
5857 "Expected float point or integer vector type.");
5858 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5859 ? ISD::FMINNUM
5860 : ISD::FMINIMUM;
5861 }
5862
5863 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
5864 // and make it as the cost.
5865
5866 static const CostTblEntry SSE2CostTbl[] = {
5867 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5868 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5869 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5870 };
5871
5872 static const CostTblEntry SSE41CostTbl[] = {
5873 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5874 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5875 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5876 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5877 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5878 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5879 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5880 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5881 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5882 {ISD::SMIN, MVT::v16i8, 6},
5883 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5884 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5885 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5886 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5887 };
5888
5889 static const CostTblEntry AVX1CostTbl[] = {
5890 {ISD::SMIN, MVT::v16i16, 6},
5891 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5892 {ISD::SMIN, MVT::v32i8, 8},
5893 {ISD::UMIN, MVT::v32i8, 8},
5894 };
5895
5896 static const CostTblEntry AVX512BWCostTbl[] = {
5897 {ISD::SMIN, MVT::v32i16, 8},
5898 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5899 {ISD::SMIN, MVT::v64i8, 10},
5900 {ISD::UMIN, MVT::v64i8, 10},
5901 };
5902
5903 // Before legalizing the type, give a chance to look up illegal narrow types
5904 // in the table.
5905 // FIXME: Is there a better way to do this?
5906 EVT VT = TLI->getValueType(DL, ValTy);
5907 if (VT.isSimple()) {
5908 MVT MTy = VT.getSimpleVT();
5909 if (ST->hasBWI())
5910 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5911 return Entry->Cost;
5912
5913 if (ST->hasAVX())
5914 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5915 return Entry->Cost;
5916
5917 if (ST->hasSSE41())
5918 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5919 return Entry->Cost;
5920
5921 if (ST->hasSSE2())
5922 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5923 return Entry->Cost;
5924 }
5925
5926 auto *ValVTy = cast<FixedVectorType>(ValTy);
5927 unsigned NumVecElts = ValVTy->getNumElements();
5928
// If legalization splits the vector, LT.first - 1 element-wise min/max ops
// combine the split parts before the single-register reduction below.
5929 auto *Ty = ValVTy;
5930 InstructionCost MinMaxCost = 0;
5931 if (LT.first != 1 && MTy.isVector() &&
5932 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5933 // Type needs to be split. We need LT.first - 1 operations ops.
5934 Ty = FixedVectorType::get(ValVTy->getElementType(),
5935 MTy.getVectorNumElements());
5936 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5937 MinMaxCost *= LT.first - 1;
5938 NumVecElts = MTy.getVectorNumElements();
5939 }
5940
5941 if (ST->hasBWI())
5942 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5943 return MinMaxCost + Entry->Cost;
5944
5945 if (ST->hasAVX())
5946 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5947 return MinMaxCost + Entry->Cost;
5948
5949 if (ST->hasSSE41())
5950 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5951 return MinMaxCost + Entry->Cost;
5952
5953 if (ST->hasSSE2())
5954 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5955 return MinMaxCost + Entry->Cost;
5956
5957 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5958
5959 // Special case power of 2 reductions where the scalar type isn't changed
5960 // by type legalization.
5961 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5962 ScalarSize != MTy.getScalarSizeInBits())
5963 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5964
// Pairwise ladder identical in shape to getArithmeticReductionCost: halve
// the live element count each level via extract/permute/shift, then apply
// one min/max per level and extract the final lane.
5965 // Now handle reduction with the legal type, taking into account size changes
5966 // at each level.
5967 while (NumVecElts > 1) {
5968 // Determine the size of the remaining vector we need to reduce.
5969 unsigned Size = NumVecElts * ScalarSize;
5970 NumVecElts /= 2;
5971 // If we're reducing from 256/512 bits, use an extract_subvector.
5972 if (Size > 128) {
5973 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5974 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5975 CostKind, NumVecElts, SubTy);
5976 Ty = SubTy;
5977 } else if (Size == 128) {
5978 // Reducing from 128 bits is a permute of v2f64/v2i64.
5979 VectorType *ShufTy;
5980 if (ValTy->isFloatingPointTy())
5981 ShufTy =
5982 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5983 else
5984 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5985 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5986 CostKind, 0, nullptr);
5987 } else if (Size == 64) {
5988 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5989 FixedVectorType *ShufTy;
5990 if (ValTy->isFloatingPointTy())
5991 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5992 else
5993 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5994 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5995 CostKind, 0, nullptr);
5996 } else {
5997 // Reducing from smaller size is a shift by immediate.
5998 auto *ShiftTy = FixedVectorType::get(
5999 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
6000 MinMaxCost += getArithmeticInstrCost(
6001 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
6004 }
6005
6006 // Add the arithmetic op for this level.
6007 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
6008 }
6009
6010 // Add the final extract element to the cost.
6011 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
6012 CostKind, 0, nullptr, nullptr,
6014}
6015
6016/// Calculate the cost of materializing a 64-bit value. This helper
6017/// method might only calculate a fraction of a larger immediate. Therefore it
6018/// is valid to return a cost of ZERO.
// NOTE(review): the signature line (listing 6019) is elided in this listing.
// Zero folds into instructions (or xor reg,reg) — free.
6020 if (Val == 0)
6021 return TTI::TCC_Free;
6022
// Fits a sign-extended 32-bit immediate operand — one instruction.
6023 if (isInt<32>(Val))
6024 return TTI::TCC_Basic;
6025
// Full 64-bit immediate: needs movabs (or equivalent two-step sequence).
6026 return 2 * TTI::TCC_Basic;
6027}
6028
// Cost of materializing an arbitrary-width integer immediate: split into
// sign-extended 64-bit chunks and sum the per-chunk materialization costs.
// NOTE(review): the signature (listing ~6029-6030) and the declaration of
// the Cost accumulator (listing 6054) are elided in this listing.
6031 assert(Ty->isIntegerTy());
6032
6033 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6034 if (BitSize == 0)
6035 return ~0U;
6036
6037 // Never hoist constants larger than 128bit, because this might lead to
6038 // incorrect code generation or assertions in codegen.
6039 // Fixme: Create a cost model for types larger than i128 once the codegen
6040 // issues have been fixed.
6041 if (BitSize > 128)
6042 return TTI::TCC_Free;
6043
6044 if (Imm == 0)
6045 return TTI::TCC_Free;
6046
6047 // Sign-extend all constants to a multiple of 64-bit.
6048 APInt ImmVal = Imm;
6049 if (BitSize % 64 != 0)
6050 ImmVal = Imm.sext(alignTo(BitSize, 64));
6051
6052 // Split the constant into 64-bit chunks and calculate the cost for each
6053 // chunk.
6055 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
6056 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
6057 int64_t Val = Tmp.getSExtValue();
6058 Cost += getIntImmCost(Val);
6059 }
6060 // We need at least one instruction to materialize the constant.
6061 return std::max<InstructionCost>(1, Cost);
6062}
6063
// Cost of an immediate when used as operand Idx of instruction Opcode.
// Returns TCC_Free for operand positions X86 can encode directly (so
// ConstantHoisting leaves those immediates in place) and falls back to the
// raw materialization cost otherwise.
// NOTE(review): the signature head (listing 6064) and the Cost computation
// line (listing 6167) inside the final Idx == ImmIdx branch are elided in
// this listing — confirm against the full source.
6065 const APInt &Imm, Type *Ty,
6067 Instruction *Inst) const {
6068 assert(Ty->isIntegerTy());
6069
6070 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6071 unsigned ImmBitWidth = Imm.getBitWidth();
6072
6073 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6074 // here, so that constant hoisting will ignore this constant.
6075 if (BitSize == 0)
6076 return TTI::TCC_Free;
6077
// ImmIdx records which operand index can be an encoded immediate for this
// opcode; ~0U means "no operand position accepts an immediate".
6078 unsigned ImmIdx = ~0U;
6079 switch (Opcode) {
6080 default:
6081 return TTI::TCC_Free;
6082 case Instruction::GetElementPtr:
6083 // Always hoist the base address of a GetElementPtr. This prevents the
6084 // creation of new constants for every base constant that gets constant
6085 // folded with the offset.
6086 if (Idx == 0)
6087 return 2 * TTI::TCC_Basic;
6088 return TTI::TCC_Free;
6089 case Instruction::Store:
6090 ImmIdx = 0;
6091 break;
6092 case Instruction::ICmp:
6093 // This is an imperfect hack to prevent constant hoisting of
6094 // compares that might be trying to check if a 64-bit value fits in
6095 // 32-bits. The backend can optimize these cases using a right shift by 32.
6096 // There are other predicates and immediates the backend can use shifts for.
6097 if (Idx == 1 && ImmBitWidth == 64) {
6098 uint64_t ImmVal = Imm.getZExtValue();
6099 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6100 return TTI::TCC_Free;
6101
6102 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6103 if (Cmp->isEquality()) {
6104 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6105 if (Known.countMinTrailingZeros() >= 32)
6106 return TTI::TCC_Free;
6107 }
6108 }
6109 }
6110 ImmIdx = 1;
6111 break;
6112 case Instruction::And:
6113 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6114 // by using a 32-bit operation with implicit zero extension. Detect such
6115 // immediates here as the normal path expects bit 31 to be sign extended.
6116 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6117 return TTI::TCC_Free;
6118 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6119 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6120 Imm.isMask())
6121 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6122 ImmIdx = 1;
6123 break;
6124 case Instruction::Add:
6125 case Instruction::Sub:
6126 // For add/sub, we can use the opposite instruction for INT32_MIN.
6127 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6128 return TTI::TCC_Free;
6129 ImmIdx = 1;
6130 break;
6131 case Instruction::UDiv:
6132 case Instruction::SDiv:
6133 case Instruction::URem:
6134 case Instruction::SRem:
6135 // Division by constant is typically expanded later into a different
6136 // instruction sequence. This completely changes the constants.
6137 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6138 return TTI::TCC_Free;
6139 case Instruction::Mul:
6140 case Instruction::Or:
6141 case Instruction::Xor:
6142 ImmIdx = 1;
6143 break;
6144 // Always return TCC_Free for the shift value of a shift instruction.
6145 case Instruction::Shl:
6146 case Instruction::LShr:
6147 case Instruction::AShr:
6148 if (Idx == 1)
6149 return TTI::TCC_Free;
6150 break;
6151 case Instruction::Trunc:
6152 case Instruction::ZExt:
6153 case Instruction::SExt:
6154 case Instruction::IntToPtr:
6155 case Instruction::PtrToInt:
6156 case Instruction::BitCast:
6157 case Instruction::PHI:
6158 case Instruction::Call:
6159 case Instruction::Select:
6160 case Instruction::Ret:
6161 case Instruction::Load:
6162 break;
6163 }
6164
// The immediate sits in an encodable operand slot: it is free when it does
// not cost more than the trivial one-instruction-per-64-bit-chunk baseline.
6165 if (Idx == ImmIdx) {
6166 uint64_t NumConstants = divideCeil(BitSize, 64);
6168 return (Cost <= NumConstants * TTI::TCC_Basic)
6169 ? static_cast<int>(TTI::TCC_Free)
6170 : Cost;
6171 }
6172
6173 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6174}
6175
// Cost of an immediate used as operand Idx of intrinsic IID. Immediates
// that the lowering can encode directly (small overflow-op operands,
// stackmap/patchpoint arguments) are reported free so they are not hoisted.
// NOTE(review): the signature head (listing ~6176-6177) is elided in this
// listing.
6178 const APInt &Imm, Type *Ty,
6180 assert(Ty->isIntegerTy());
6181
6182 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6183 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6184 // here, so that constant hoisting will ignore this constant.
6185 if (BitSize == 0)
6186 return TTI::TCC_Free;
6187
6188 switch (IID) {
6189 default:
6190 return TTI::TCC_Free;
6191 case Intrinsic::sadd_with_overflow:
6192 case Intrinsic::uadd_with_overflow:
6193 case Intrinsic::ssub_with_overflow:
6194 case Intrinsic::usub_with_overflow:
6195 case Intrinsic::smul_with_overflow:
6196 case Intrinsic::umul_with_overflow:
// The RHS of an overflow op encodes like the plain arithmetic immediate.
6197 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6198 return TTI::TCC_Free;
6199 break;
6200 case Intrinsic::experimental_stackmap:
6201 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6202 return TTI::TCC_Free;
6203 break;
6204 case Intrinsic::experimental_patchpoint_void:
6205 case Intrinsic::experimental_patchpoint:
6206 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6207 return TTI::TCC_Free;
6208 break;
6209 }
// Otherwise fall back to the plain materialization cost of the immediate.
6210 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6211}
6212
                                           const Instruction *I) const {
  // NOTE(review): a guard line appears to be missing from this excerpt —
  // presumably the first return is conditional on the cost kind, which would
  // explain the two consecutive returns below. Confirm against full source.
  return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
  // Branches are assumed to be predicted.
  return TTI::TCC_Free;
}
6221
6222int X86TTIImpl::getGatherOverhead() const {
6223 // Some CPUs have more overhead for gather. The specified overhead is relative
6224 // to the Load operation. "2" is the number provided by Intel architects. This
6225 // parameter is used for cost estimation of Gather Op and comparison with
6226 // other alternatives.
6227 // TODO: Remove the explicit hasAVX512()?, That would mean we would only
6228 // enable gather with a -march.
6229 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6230 return 2;
6231
6232 return 1024;
6233}
6234
6235int X86TTIImpl::getScatterOverhead() const {
6236 if (ST->hasAVX512())
6237 return 2;
6238
6239 return 1024;
6240}
6241
// Return an average cost of Gather / Scatter instruction, maybe improved later.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            // NOTE(review): a parameter line
                                            // (the CostKind) is missing from
                                            // this excerpt.
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) const {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct base pointers cannot use a narrowed index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      if (isa<Constant>(GEP->getOperand(I)))
        continue;
      Type *IndxTy = GEP->getOperand(I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A non-sign-extended 64-bit index, or a second variable index, keeps
      // the full pointer-sized index.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
  // The worse of the two legalization factors determines how many pieces the
  // operation is split into.
  InstructionCost::CostType SplitFactor =
      std::max(IdxsLT.first, SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // If we didn't split, this will be a single gather/scatter instruction.
  // NOTE(review): a guard line (presumably on the cost kind) is missing from
  // this excerpt just before this return — confirm against full source.
  return 1;

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
                                                       : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           Alignment, AddressSpace, CostKind);
}
6312
/// Calculate the cost of Gather / Scatter operation
// NOTE(review): the function signature lines are missing from this excerpt —
// the body reads a MaskedMemoryIntrinsicCostAttributes-style argument `MICA`
// and a `CostKind`; confirm against full source.
  bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
                MICA.getID() == Intrinsic::vp_gather;
  unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
  Type *SrcVTy = MICA.getDataType();
  const Value *Ptr = MICA.getPointer();
  Align Alignment = MICA.getAlignment();
  // Fall back when the gather/scatter is not legal for this type.
  // NOTE(review): interior condition lines (presumably force-scalarization
  // checks) and the fallback return are missing from this excerpt.
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
                             Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
                              Align(Alignment)))))

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  // The pointer may be a scalar pointer or a vector of pointers; in the
  // latter case take the address space from the element pointer type.
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        cast<VectorType>(Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();
  return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
                         AddressSpace);
}
6343
                                   const TargetTransformInfo::LSRCost &C2) const {
  // X86 specific here are "instruction number 1st priority".
  // Lexicographic comparison: fewer instructions wins outright, then fewer
  // registers, and so on down the tie-breaker list.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
6352
  // Cmp/branch fusion is worthwhile if the subtarget supports either macro
  // fusion or branch fusion.
  return ST->hasMacroFusion() || ST->hasBranchFusion();
}
6356
6357static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6358 if (!ST->hasAVX())
6359 return false;
6360
6361 if (ScalarTy->isPointerTy())
6362 return true;
6363
6364 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6365 return true;
6366
6367 if (ScalarTy->isHalfTy() && ST->hasBWI())
6368 return true;
6369
6370 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6371 return true;
6372
6373 if (!ScalarTy->isIntegerTy())
6374 return false;
6375
6376 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6377 return IntWidth == 32 || IntWidth == 64 ||
6378 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6379}
6380
                                    unsigned AddressSpace,
                                    TTI::MaskKind MaskKind) const {
  Type *ScalarTy = DataTy->getScalarType();

  // The backend can't handle a single element vector w/o CFCMOV.
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return ST->hasCF() &&
           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);

  // Otherwise legality depends only on the element type and the subtarget.
  return isLegalMaskedLoadStore(ScalarTy, ST);
}
6394
                                     unsigned AddressSpace,
                                     TTI::MaskKind MaskKind) const {
  Type *ScalarTy = DataTy->getScalarType();

  // The backend can't handle a single element vector w/o CFCMOV.
  if (isa<VectorType>(DataTy) &&
      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
    return ST->hasCF() &&
           hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);

  // Otherwise legality depends only on the element type and the subtarget.
  return isLegalMaskedLoadStore(ScalarTy, ST);
}
6408
6409bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6410 unsigned DataSize = DL.getTypeStoreSize(DataType);
6411 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6412 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6413 // (the equivalent stores only require AVX).
6414 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6415 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6416
6417 return false;
6418}
6419
6420bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6421 unsigned DataSize = DL.getTypeStoreSize(DataType);
6422
6423 // SSE4A supports nontemporal stores of float and double at arbitrary
6424 // alignment.
6425 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6426 return true;
6427
6428 // Besides the SSE4A subtarget exception above, only aligned stores are
6429 // available nontemporaly on any other subtarget. And only stores with a size
6430 // of 4..32 bytes (powers of 2, only) are permitted.
6431 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6433 return false;
6434
6435 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6436 // loads require AVX2).
6437 if (DataSize == 32)
6438 return ST->hasAVX();
6439 if (DataSize == 16)
6440 return ST->hasSSE1();
6441 return true;
6442}
6443
                                      ElementCount NumElements) const {
  // movddup: a fixed 2 x double broadcast load is legal with SSE3.
  return ST->hasSSE3() && !NumElements.isScalable() &&
         NumElements.getFixedValue() == 2 &&
         ElementTy == Type::getDoubleTy(ElementTy->getContext());
}
6451
6452bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6453 if (!isa<VectorType>(DataTy))
6454 return false;
6455
6456 if (!ST->hasAVX512())
6457 return false;
6458
6459 // The backend can't handle a single element vector.
6460 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6461 return false;
6462
6463 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6464
6465 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6466 return true;
6467
6468 if (!ScalarTy->isIntegerTy())
6469 return false;
6470
6471 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6472 return IntWidth == 32 || IntWidth == 64 ||
6473 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6474}
6475
                                             Align Alignment) const {
  // Compress-store legality mirrors expand-load legality exactly.
  return isLegalMaskedExpandLoad(DataTy, Alignment);
}
6480
6481bool X86TTIImpl::supportsGather() const {
6482 // Some CPUs have better gather performance than others.
6483 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
6484 // enable gather with a -march.
6485 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6486}
6487
                                     Align Alignment) const {
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
  // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
  // it to 8 elements, but zeroing upper bits of the mask vector will add more
  // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
  // Check, maybe the gather/scatter instruction is better in the VariableMask
  // case.
  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
  // Scalarize 1-element vectors always, plus 2-element (and non-VLX 4-element)
  // vectors on AVX-512.
  return NumElts == 1 ||
         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
}
6500
                                              Align Alignment) const {
  Type *ScalarTy = DataTy->getScalarType();
  // Pointer elements are always supported.
  if (ScalarTy->isPointerTy())
    return true;

  // As are f32 and f64 elements.
  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  // Integer elements must be exactly 32 or 64 bits wide.
  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 32 || IntWidth == 64;
}
6516
6517bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6518 if (!supportsGather() || !ST->preferGather())
6519 return false;
6520 return isLegalMaskedGatherScatter(DataTy, Alignment);
6521}
6522
6523bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6524 unsigned Opcode1,
6525 const SmallBitVector &OpcodeMask) const {
6526 // ADDSUBPS 4xf32 SSE3
6527 // VADDSUBPS 4xf32 AVX
6528 // VADDSUBPS 8xf32 AVX2
6529 // ADDSUBPD 2xf64 SSE3
6530 // VADDSUBPD 2xf64 AVX
6531 // VADDSUBPD 4xf64 AVX2
6532
6533 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6534 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6535 if (!isPowerOf2_32(NumElements))
6536 return false;
6537 // Check the opcode pattern. We apply the mask on the opcode arguments and
6538 // then check if it is what we expect.
6539 for (int Lane : seq<int>(0, NumElements)) {
6540 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6541 // We expect FSub for even lanes and FAdd for odd lanes.
6542 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6543 return false;
6544 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6545 return false;
6546 }
6547 // Now check that the pattern is supported by the target ISA.
6548 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6549 if (ElemTy->isFloatTy())
6550 return ST->hasSSE3() && NumElements % 4 == 0;
6551 if (ElemTy->isDoubleTy())
6552 return ST->hasSSE3() && NumElements % 2 == 0;
6553 return false;
6554}
6555
6556bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6557 // AVX2 doesn't support scatter
6558 if (!ST->hasAVX512() || !ST->preferScatter())
6559 return false;
6560 return isLegalMaskedGatherScatter(DataType, Alignment);
6561}
6562
6563bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6564 EVT VT = TLI->getValueType(DL, DataType);
6565 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6566}
6567
  // FDIV is always expensive, even if it has a very low uop count.
  // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
  if (I->getOpcode() == Instruction::FDiv)
    return true;

  // NOTE(review): the non-FDiv fallthrough (presumably a delegation to the
  // base implementation) and the closing brace are missing from this excerpt
  // — confirm against the full source.

6577bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6578
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  for (const Instruction &I : instructions(Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(&I)) {
      // Having more target features is fine for inline ASM and intrinsics.
      if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
        continue;

      // Collect the call's argument and return types for the ABI check.
      // NOTE(review): the declaration of `Types` (a container of Type*)
      // appears to be missing from this excerpt — confirm in full source.
      for (Value *Arg : CB->args())
        Types.push_back(Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Types, IsSimpleTy))
        continue;

      // Do a precise compatibility check.
      if (!areTypesABICompatible(Caller, Callee, Types))
        return false;
    }
  }
  return true;
}
6626
                                        const Function *Callee,
                                        ArrayRef<Type *> Types) const {
  // The generic/base compatibility check must pass first.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // If we get here, we know the target features match. If one function
  // considers 512-bit vectors legal and the other does not, consider them
  // incompatible.
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // NOTE(review): the right-hand side of this comparison (presumably the
  // callee's useAVX512Regs()) is missing from this excerpt — confirm in full
  // source.
  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
    return true;

  // Consider the arguments compatible if they aren't vectors or aggregates.
  // FIXME: Look at the size of vectors.
  // FIXME: Look at the element types of aggregates to see if there are vectors.
  return llvm::none_of(Types,
      [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
}
6648
X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  // NOTE(review): the declaration of `Options` (the MemCmpExpansionOptions
  // result object) is missing from this excerpt — confirm in full source.
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = 2;
  // All GPR and vector loads can be unaligned.
  Options.AllowOverlappingLoads = true;
  if (IsZeroCmp) {
    // Only enable vector loads for equality comparison. Right now the vector
    // version is not as fast for three way compare (see #33329).
    const unsigned PreferredWidth = ST->getPreferVectorWidth();
    if (PreferredWidth >= 512 && ST->hasAVX512())
      Options.LoadSizes.push_back(64);
    if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
    if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
  }
  if (ST->is64Bit()) {
    // 8-byte GPR loads are only available in 64-bit mode.
    Options.LoadSizes.push_back(8);
  }
  Options.LoadSizes.push_back(4);
  Options.LoadSizes.push_back(2);
  Options.LoadSizes.push_back(1);
  return Options;
}
6673
  // Delegates to supportsGather(); the enclosing function's signature is not
  // visible in this excerpt — confirm its identity in the full source.
  return supportsGather();
}
6677
  // Unconditionally disabled for this target; the enclosing function's
  // signature is not visible in this excerpt.
  return false;
}
6681
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  // (Enclosing function's signature is not visible in this excerpt.)
  return !(ST->isAtom());
}
6688
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
// NOTE(review): the function-name line of this signature is missing from this
// excerpt — confirm against the full source.
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond,
    bool UseMaskForGaps) const {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp) {
    // Masked accesses are priced through the masked-load/store intrinsics.
    unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
                                               : Intrinsic::masked_store;
    MemOpCost = getMaskedMemoryOpCost(
        {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
  } else
    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
                                CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT =
      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark the lanes actually demanded by the requested Indices.
    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op");
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(VecTy->getContext());

    // NOTE(review): the false-arm of this ternary (presumably an
    // all-elements-demanded APInt) is missing from this excerpt — confirm in
    // full source.
    MaskCost = getReplicationShuffleCost(
        I1Type, Factor, VF,
        UseMaskForGaps ? DemandedLoadStoreElts
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                          VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
                     CostKind, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  // NOTE(review): the `InstructionCost Cost =` line introducing this final
  // expression appears to be missing from this excerpt — confirm in full
  // source.
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
6850
6852 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6853 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6854 bool UseMaskForCond, bool UseMaskForGaps) const {
6855 auto *VecTy = cast<FixedVectorType>(BaseTy);
6856
6857 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6858 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6859 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6860 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6861 return true;
6862 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6863 return ST->hasBWI();
6864 if (EltTy->isBFloatTy())
6865 return ST->hasBF16();
6866 return false;
6867 };
6868 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6870 Opcode, VecTy, Factor, Indices, Alignment,
6871 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6872
6873 if (UseMaskForCond || UseMaskForGaps)
6874 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6875 Alignment, AddressSpace, CostKind,
6876 UseMaskForCond, UseMaskForGaps);
6877
6878 // Get estimation for interleaved load/store operations for SSE-AVX2.
6879 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6880 // computing the cost using a generic formula as a function of generic
6881 // shuffles. We therefore use a lookup table instead, filled according to
6882 // the instruction sequences that codegen currently generates.
6883
6884 // VecTy for interleave memop is <VF*Factor x Elt>.
6885 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6886 // VecTy = <12 x i32>.
6887 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6888
6889 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6890 // the VF=2, while v2i128 is an unsupported MVT vector type
6891 // (see MachineValueType.h::getVectorVT()).
6892 if (!LegalVT.isVector())
6893 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6894 Alignment, AddressSpace, CostKind);
6895
6896 unsigned VF = VecTy->getNumElements() / Factor;
6897 Type *ScalarTy = VecTy->getElementType();
6898 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6899 if (!ScalarTy->isIntegerTy())
6900 ScalarTy =
6901 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6902
6903 // Get the cost of all the memory operations.
6904 // FIXME: discount dead loads.
6905 InstructionCost MemOpCosts =
6906 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6907
6908 auto *VT = FixedVectorType::get(ScalarTy, VF);
6909 EVT ETy = TLI->getValueType(DL, VT);
6910 if (!ETy.isSimple())
6911 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6912 Alignment, AddressSpace, CostKind);
6913
6914 // TODO: Complete for other data-types and strides.
6915 // Each combination of Stride, element bit width and VF results in a different
6916 // sequence; The cost tables are therefore accessed with:
6917 // Factor (stride) and VectorType=VFxiN.
6918 // The Cost accounts only for the shuffle sequence;
6919 // The cost of the loads/stores is accounted for separately.
6920 //
6921 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6922 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6923 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6924 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6925 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6926 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6927
6928 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6929 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6930 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6931
6932 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6933 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6934 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6935
6936 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6937 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6938 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6939 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6940
6941 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6942 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6943 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6944 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6945 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6946
6947 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6948 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6949 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6950 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6951 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6952
6953 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6954 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6955 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6956 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6957 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6958
6959 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6960 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6961 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6962 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6963
6964 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6965 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6966 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6967 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6968 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6969
6970 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6971 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6972 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6973 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6974 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6975
6976 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6977 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6978 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6979 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6980 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6981
6982 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6983 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6984 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6985 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6986
6987 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6988 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6989 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6990 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6991 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6992
6993 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6994 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6995 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6996 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6997 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6998
6999 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
7000 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
7001 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
7002 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
7003
7004 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
7005 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
7006 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
7007
7008 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
7009 };
7010
7011 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
7012 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
7013 };
7014
7015 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
7016 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
7017 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
7018
7019 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
7020 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
7021
7022 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
7023 };
7024
7025 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
7026 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
7027 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
7028
7029 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
7030 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
7031 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
7032
7033 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
7034 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
7035 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
7036 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
7037
7038 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
7039 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
7040 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
7041 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
7042 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
7043
7044 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
7045 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
7046 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
7047 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
7048 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
7049
7050 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
7051 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
7052 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
7053 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
7054 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
7055
7056 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
7057 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
7058 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
7059 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
7060 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
7061
7062 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
7063 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
7064 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
7065 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7066
7067 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7068 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7069 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7070 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7071 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7072
7073 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7074 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7075 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7076 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7077 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7078
7079 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7080 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7081 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7082 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7083 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7084
7085 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7086 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7087 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7088 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7089
7090 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7091 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7092 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7093 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7094 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7095
7096 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7097 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7098 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7099 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7100 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7101
7102 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7103 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7104 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7105 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7106
7107 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7108 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7109 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7110 };
7111
7112 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7113 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7114 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7115 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7116
7117 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7118 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7119
7120 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7121 };
7122
7123 if (Opcode == Instruction::Load) {
7124 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7125 MemOpCosts](const CostTblEntry *Entry) {
7126 // NOTE: this is just an approximation!
7127 // It can over/under -estimate the cost!
7128 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7129 };
7130
7131 if (ST->hasAVX2())
7132 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7133 ETy.getSimpleVT()))
7134 return GetDiscountedCost(Entry);
7135
7136 if (ST->hasSSSE3())
7137 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7138 ETy.getSimpleVT()))
7139 return GetDiscountedCost(Entry);
7140
7141 if (ST->hasSSE2())
7142 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7143 ETy.getSimpleVT()))
7144 return GetDiscountedCost(Entry);
7145 } else {
7146 assert(Opcode == Instruction::Store &&
7147 "Expected Store Instruction at this point");
7148 assert((!Indices.size() || Indices.size() == Factor) &&
7149 "Interleaved store only supports fully-interleaved groups.");
7150 if (ST->hasAVX2())
7151 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7152 ETy.getSimpleVT()))
7153 return MemOpCosts + Entry->Cost;
7154
7155 if (ST->hasSSE2())
7156 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7157 ETy.getSimpleVT()))
7158 return MemOpCosts + Entry->Cost;
7159 }
7160
7161 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7162 Alignment, AddressSpace, CostKind,
7163 UseMaskForCond, UseMaskForGaps);
7164}
7165
7167 StackOffset BaseOffset,
7168 bool HasBaseReg, int64_t Scale,
7169 unsigned AddrSpace) const {
7170 // Scaling factors are not free at all.
7171 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7172 // will take 2 allocations in the out of order engine instead of 1
7173 // for plain addressing mode, i.e. inst (reg1).
7174 // E.g.,
7175 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7176 // Requires two allocations (one for the load, one for the computation)
7177 // whereas:
7178 // vaddps (%rsi), %ymm0, %ymm1
7179 // Requires just 1 allocation, i.e., freeing allocations for other operations
7180 // and having less micro operations to execute.
7181 //
7182 // For some X86 architectures, this is even worse because for instance for
7183 // stores, the complex addressing mode forces the instruction to use the
7184 // "load" ports instead of the dedicated "store" port.
7185 // E.g., on Haswell:
7186 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7187 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7189 AM.BaseGV = BaseGV;
7190 AM.BaseOffs = BaseOffset.getFixed();
7191 AM.HasBaseReg = HasBaseReg;
7192 AM.Scale = Scale;
7193 AM.ScalableOffset = BaseOffset.getScalable();
7194 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7195 // Scale represents reg2 * scale, thus account for 1
7196 // as soon as we use a second register.
7197 return AM.Scale != 0;
7199}
7200
7202 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7203 return 14;
7204}
7205
7207 unsigned Bits = Ty->getScalarSizeInBits();
7208
7209 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7210 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7211 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7212 return false;
7213
7214 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7215 // shifts just as cheap as scalar ones.
7216 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7217 return false;
7218
7219 // AVX512BW has shifts such as vpsllvw.
7220 if (ST->hasBWI() && Bits == 16)
7221 return false;
7222
7223 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7224 // fully general vector.
7225 return true;
7226}
7227
7228unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7229 Type *ScalarValTy) const {
7230 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7231 return 4;
7232 }
7233 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7234}
7235
7237 SmallVectorImpl<Use *> &Ops) const {
7238 using namespace llvm::PatternMatch;
7239
7240 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7241 if (!VTy)
7242 return false;
7243
7244 if (I->getOpcode() == Instruction::Mul &&
7245 VTy->getElementType()->isIntegerTy(64)) {
7246 for (auto &Op : I->operands()) {
7247 // Make sure we are not already sinking this operand
7248 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7249 continue;
7250
7251 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7252 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7253 if (ST->hasSSE41() &&
7254 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7255 m_SpecificInt(32)))) {
7256 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7257 Ops.push_back(&Op);
7258 } else if (ST->hasSSE2() &&
7259 match(Op.get(),
7260 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7261 Ops.push_back(&Op);
7262 }
7263 }
7264
7265 return !Ops.empty();
7266 }
7267
7268 // A uniform shift amount in a vector shift or funnel shift may be much
7269 // cheaper than a generic variable vector shift, so make that pattern visible
7270 // to SDAG by sinking the shuffle instruction next to the shift.
7271 int ShiftAmountOpNum = -1;
7272 if (I->isShift())
7273 ShiftAmountOpNum = 1;
7274 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7275 if (II->getIntrinsicID() == Intrinsic::fshl ||
7276 II->getIntrinsicID() == Intrinsic::fshr)
7277 ShiftAmountOpNum = 2;
7278 }
7279
7280 if (ShiftAmountOpNum == -1)
7281 return false;
7282
7283 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7284 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7285 isVectorShiftByScalarCheap(I->getType())) {
7286 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7287 return true;
7288 }
7289
7290 return false;
7291}
7292
7294 bool HasEGPR = ST->hasEGPR();
7295 const TargetMachine &TM = getTLI()->getTargetMachine();
7296
7297 for (User *U : F.users()) {
7299 if (!CB || CB->getCalledOperand() != &F)
7300 continue;
7301 Function *CallerFunc = CB->getFunction();
7302 if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
7303 return false;
7304 }
7305
7306 return true;
7307}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getCalledOperand() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
const FeatureBitset & getFeatureBits() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:381
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
bool useFastCCForInternalCall(Function &F) const override
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool prefersVectorizedAddressing() const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Type) const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
Calculate the cost of Gather / Scatter operation.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3020
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
ap_match< APInt > m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:258
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true when the memcmp result is only compared against zero.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55