//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
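// Example (illustrative only; a hypothetical entry, not from this file):
// each table entry below packs all four cost kinds in the order documented
// above, so for an fdiv-like entry such as
//
//   { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }
//
// the fields read: reciprocal throughput 14, latency 14, code size 1
// instruction, size+latency 1 micro-op.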

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;

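// Usage sketch (illustrative; the values are hypothetical): operator[]
// yields std::nullopt for any kind left at the ~0U sentinel, letting
// callers fall back to the generic cost implementation:
//
//   CostKindCosts C;
//   C.RecipThroughputCost = 2;
//   C.LatencyCost = 6;
//   if (std::optional<unsigned> Cost = C[TargetTransformInfo::TCK_Latency])
//     /* *Cost == 6 */;
//   // C[TargetTransformInfo::TCK_CodeSize] == std::nullopt (unset).
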
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    if (!Vector && ST->hasEGPR())
      return 32;
    return 16;
  }
  return 8;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroll pass unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }

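  // Illustrative decomposition (hypothetical <8 x i8> mul): the cost
  // returned above is
  //   cost(zext <8 x i8> to <8 x i16>) + cost(trunc <8 x i16> to <8 x i8>)
  //     + cost(mul <8 x i16>)
  // i.e. the promoted vXi16 multiply plus the extend/pack overhead.
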
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }

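    // Illustrative case (hypothetical IR): for
    //   %m = mul <4 x i32> (zext <4 x i16> %a to <4 x i32>), <i32 42, ...>
    // both operands fit in i15, so the mul is costed as a v8i16 multiply,
    // matching the PMADDWD lowering.
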
    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

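  // Illustrative case (hypothetical IR): for
  //   %m = mul <2 x i64> (zext <2 x i32> %a to <2 x i64>),
  //                      (zext <2 x i32> %b to <2 x i64>)
  // both operands are unsigned-i32-representable, so the multiply maps to
  // a single pmuludq and is costed via the X86ISD::PMULUDQ entries below.
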
  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }

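  // Illustrative lowering assumed by the costing above (scalar notation):
  //   x * 8  ->  x << 3
  //   x * -8 ->  0 - (x << 3)   ; hence the extra Sub cost for -pow2.
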
  // On X86, vector signed division by constant powers-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }

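  // Illustrative expansion assumed by the costing above, for a vXi32 sdiv
  // by 4 (k = 2):
  //   %sgn  = ashr %x, 31        ; AShr (sign splat)
  //   %bias = lshr %sgn, 30      ; LShr (rounding bias for negative x)
  //   %tmp  = add %x, %bias      ; Add
  //   %res  = ashr %tmp, 2       ; AShr
  // which is the 2*AShr + LShr + Add sum used above; SREM adds the
  // Mul + Sub for X - (X/C)*C.
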
  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }

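  // Illustrative: x u/ 16 -> lshr x, 4 and x u% 16 -> and x, 15, so a
  // single shift (UDIV) or mask (UREM) instruction covers each case.
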
  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

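  // How these table lookups work (illustrative, using the GFNI table
  // above): CostTableLookup scans for a matching (ISD, MVT) pair and
  // operator[] selects the per-kind cost, e.g.
  //   CostTableLookup(GFNIUniformConstCostTable, ISD::SRA, MVT::v16i8)
  // yields { 1, 6, 1, 2 }: throughput 1, latency 6, code size 1, uops 2.
  // The result is scaled by LT.first, the number of legal-type operations
  // the original type splits into.
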
  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we declare them
    // as custom so we can detect the cases where the shift amount is a
    // scalar splat.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

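  // Illustrative: XOP's vpshl*/vpsha* take a signed per-element shift count
  // where negative means shift right, so x >> C lowers to a shift by -C;
  // for constant C the negation folds away, costing the same as SHL.
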
  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered
    // into a vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // slm muldq throughput is 2 and addq throughput is 4,
    // thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
    // 2x4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1293 static const CostKindTblEntry SSE42CostTable[] = {
1294 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1295 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1296 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1297 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1298
1299 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1300 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303
1304 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1308
1309 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1313
1314 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1315 };
1316
1317 if (ST->hasSSE42())
1318 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1319 if (auto KindCost = Entry->Cost[CostKind])
1320 return LT.first * *KindCost;
1321
1322 static const CostKindTblEntry SSE41CostTable[] = {
1323 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1324 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1325 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1326
1327 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1328 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1329 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1330 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1331
1332 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1333 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1334 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1335 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1336
1337 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1338 };
1339
1340 if (ST->hasSSE41())
1341 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1342 if (auto KindCost = Entry->Cost[CostKind])
1343 return LT.first * *KindCost;
1344
1345 static const CostKindTblEntry SSSE3CostTable[] = {
1346 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1347 };
1348
1349 if (ST->hasSSSE3())
1350 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1351 if (auto KindCost = Entry->Cost[CostKind])
1352 return LT.first * *KindCost;
1353
1354 static const CostKindTblEntry SSE2CostTable[] = {
1355 // We don't correctly identify costs of casts because they are marked as
1356 // custom.
1357 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1358 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1359 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1360 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1361
1362 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1363 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1364 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1365 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1366
1367 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1368 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1369 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1370 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1371
1372 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1373 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1374 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1375 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1376
1377 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1378 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1379 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1380 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1381
1382 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1383 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1384 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1385 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1386
1387 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1388 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1389
1390 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1391 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1392 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1393 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1394
1395 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1396
1397 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1398 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1399 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1400 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1401
1402 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1403 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406
1407 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410
1411 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 };
1418
1419 if (ST->hasSSE2())
1420 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1421 if (auto KindCost = Entry->Cost[CostKind])
1422 return LT.first * *KindCost;
1423
1424 static const CostKindTblEntry SSE1CostTable[] = {
1425 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1426 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1427
1428 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1429 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1430
1431 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1432 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1433
1434 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1435 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1436
1437 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1439 };
1440
1441 if (ST->hasSSE1())
1442 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1443 if (auto KindCost = Entry->Cost[CostKind])
1444 return LT.first * *KindCost;
1445
1446 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1447 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1448 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1449 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1450 };
1451
1452 if (ST->is64Bit())
1453 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1454 if (auto KindCost = Entry->Cost[CostKind])
1455 return LT.first * *KindCost;
1456
1457 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1458 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1459 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1460 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1461
1462 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1463 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1464 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1465
1466 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1467 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1468 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1469
1470 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1471 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1472 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1473 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1474 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1475 };
1476
1477 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1478 if (auto KindCost = Entry->Cost[CostKind])
1479 return LT.first * *KindCost;
1480
1481  // It is not a good idea to vectorize division. We have to scalarize it and
1482  // in the process we will often end up having to spill regular registers.
1483  // The overhead of division is going to dominate most kernels anyway, so try
1484  // hard to prevent vectorization of division - it is generally a bad idea.
1485  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
1486  // for each lane.
1487 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1488 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1489 ISD == ISD::UREM)) {
1490    InstructionCost ScalarCost =
1491        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1492                               Op1Info.getNoProps(), Op2Info.getNoProps());
1493 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1494 }
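  // A worked instance of the heuristic above (editorial illustration, not
  // part of the original source): for a <4 x i32> sdiv whose scalar cost is C
  // and whose type legalizes in one step (LT.first == 1), the returned
  // throughput cost is 20 * 1 * 4 * C = 80 * C - steep enough that the
  // vectorizers will almost always keep the division scalar.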
1495
1496 // Handle some basic single instruction code size cases.
1497 if (CostKind == TTI::TCK_CodeSize) {
1498 switch (ISD) {
1499 case ISD::FADD:
1500 case ISD::FSUB:
1501 case ISD::FMUL:
1502 case ISD::FDIV:
1503 case ISD::FNEG:
1504 case ISD::AND:
1505 case ISD::OR:
1506 case ISD::XOR:
1507 return LT.first;
1508 break;
1509 }
1510 }
1511
1512 // Fallback to the default implementation.
1513 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1514 Args, CxtI);
1515}
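// --- Editorial sketch (not part of the original file). Every feature table
// above is consulted with the same three-step pattern; `lookupKindCost` is a
// hypothetical helper name, while CostKindTblEntry, CostTableLookup and
// CostKindCosts::operator[] are the real definitions used in this file.
static std::optional<InstructionCost>
lookupKindCost(ArrayRef<CostKindTblEntry> Table, int ISD, MVT VT,
               TTI::TargetCostKind CostKind, InstructionCost SplitCount) {
  // Scan the static table for a row matching (ISD opcode, legalized MVT).
  if (const auto *Entry = CostTableLookup(Table, ISD, VT))
    // Cost[CostKind] yields std::nullopt when the row carries no data for
    // this cost kind (encoded internally as ~0U).
    if (auto KindCost = Entry->Cost[CostKind])
      return SplitCount * *KindCost; // scale by the type-split count
  return std::nullopt; // caller falls through to the next feature table
}
// Tables are tried from the newest feature level down, so the first match
// for the legalized type wins. --- End editorial sketch.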
1516
1517 InstructionCost X86TTIImpl::getAltInstrCost(
1518     VectorType *VecTy, unsigned Opcode0,
1519     unsigned Opcode1, const SmallBitVector &OpcodeMask,
1520     TTI::TargetCostKind CostKind) const {
1521  if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1522    return TTI::TCC_Basic;
1523  return InstructionCost::getInvalid();
1524}
1525
1526 InstructionCost X86TTIImpl::getShuffleCost(
1527     TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
1528     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
1529     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1530 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1531 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1532 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
1533
1534 Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
1535
1536 // Recognize a basic concat_vector shuffle.
1537 if (Kind == TTI::SK_PermuteTwoSrc &&
1538 Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
1539      ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1540    return getShuffleCost(TTI::SK_InsertSubvector,
1541                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
1542                          CostKind, Mask.size() / 2, BaseTp);
1543
1544 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1545 if (Kind == TTI::SK_Transpose)
1546 Kind = TTI::SK_PermuteTwoSrc;
1547
1548 if (Kind == TTI::SK_Broadcast) {
1549    // For Broadcasts we are splatting the first element from the first input
1550    // register, so we only need to reference that input and all the output
1551    // registers are the same.
1552 LT.first = 1;
1553
1554 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1555 using namespace PatternMatch;
1556 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1557 (ST->hasAVX2() ||
1558 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1559 return TTI::TCC_Free;
1560 }
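  // Editorial example (not part of the original file): the free-broadcast
  // case above corresponds to splatting a one-use scalar load, e.g. in IR:
  //
  //   %s = load float, ptr %p                                  ; single use
  //   %v = insertelement <8 x float> poison, float %s, i64 0
  //   %b = shufflevector <8 x float> %v, <8 x float> poison,
  //                      <8 x i32> zeroinitializer
  //
  // On AVX this folds into a single memory-operand vbroadcastss, so the
  // shuffle itself is costed as TTI::TCC_Free.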
1561
1562 // Treat <X x bfloat> shuffles as <X x half>.
1563 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1564 LT.second = LT.second.changeVectorElementType(MVT::f16);
1565
1566 // Subvector extractions are free if they start at the beginning of a
1567 // vector and cheap if the subvectors are aligned.
1568 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1569 int NumElts = LT.second.getVectorNumElements();
1570 if ((Index % NumElts) == 0)
1571 return 0;
1572 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1573 if (SubLT.second.isVector()) {
1574 int NumSubElts = SubLT.second.getVectorNumElements();
1575 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1576 return SubLT.first;
1577 // Handle some cases for widening legalization. For now we only handle
1578 // cases where the original subvector was naturally aligned and evenly
1579 // fit in its legalized subvector type.
1580 // FIXME: Remove some of the alignment restrictions.
1581 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1582 // vectors.
1583 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1584 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1585 (NumSubElts % OrigSubElts) == 0 &&
1586 LT.second.getVectorElementType() ==
1587 SubLT.second.getVectorElementType() &&
1588          LT.second.getVectorElementType().getSizeInBits() ==
1589              SubLT.second.getVectorElementType().getSizeInBits()) {
1590        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1591 "Unexpected number of elements!");
1592 auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
1593 LT.second.getVectorNumElements());
1594 auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
1595 SubLT.second.getVectorNumElements());
1596 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1597 InstructionCost ExtractCost =
1598 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
1599 CostKind, ExtractIndex, SubTy);
1600
1601      // If the original size is 32 bits or more, we can use pshufd. Otherwise,
1602      // if we have SSSE3, we can use pshufb.
1603 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1604 return ExtractCost + 1; // pshufd or pshufb
1605
1606 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1607 "Unexpected vector size");
1608
1609 return ExtractCost + 2; // worst case pshufhw + pshufd
1610 }
1611 }
1612    // If the extract subvector is not optimal, treat it as a single-op shuffle.
1613    Kind = TTI::SK_PermuteSingleSrc;
1614  }
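  // Editorial example (not part of the original file): extracting
  // <4 x float> from a legal <8 x float> at element 4 hits the aligned path
  // above and costs only SubLT.first (a 128-bit lane extract); the same
  // extract at element 2 is misaligned and is retried as a single-source
  // permute.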
1615
1616 // Subvector insertions are cheap if the subvectors are aligned.
1617 // Note that in general, the insertion starting at the beginning of a vector
1618 // isn't free, because we need to preserve the rest of the wide vector.
1619 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1620 int NumElts = LT.second.getVectorNumElements();
1621 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1622 if (SubLT.second.isVector()) {
1623 int NumSubElts = SubLT.second.getVectorNumElements();
1624 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1625 return SubLT.first;
1626 }
1627
1628 // If the insertion isn't aligned, treat it like a 2-op shuffle.
1629 Kind = TTI::SK_PermuteTwoSrc;
1630 }
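  // Editorial example (not part of the original file): inserting <4 x i32>
  // into <8 x i32> at element 4 is an aligned lane insert costing
  // SubLT.first, while an insert at element 2 straddles lanes and is
  // re-costed as a two-source permute.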
1631
1632 // Handle some common (illegal) sub-vector types as they are often very cheap
1633 // to shuffle even on targets without PSHUFB.
1634 EVT VT = TLI->getValueType(DL, BaseTp);
1635 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1636 !ST->hasSSSE3()) {
1637 static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
1638 {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
1639 {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
1640 {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
1641 {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
1642 {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
1643
1644 {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
1645 {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
1646 {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
1647 {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
1648
1649 {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
1650 {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
1651 {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
1652 {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
1653
1654 {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
1655 {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
1656 {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
1657 {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
1658 {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
1659
1660 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
1661 {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
1662 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
1663 {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
1664 {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
1665 };
1666
1667 if (ST->hasSSE2())
1668 if (const auto *Entry =
1669 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1670 return Entry->Cost;
1671 }
1672
1673  // We are going to permute multiple sources and the result will be in multiple
1674  // destinations. We provide an accurate cost only for splits where the element
1675  // type remains the same.
1676 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
1677 MVT LegalVT = LT.second;
1678 if (LegalVT.isVector() &&
1679        LegalVT.getVectorElementType().getSizeInBits() ==
1680            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
1681        LegalVT.getVectorNumElements() <
1682 cast<FixedVectorType>(BaseTp)->getNumElements()) {
1683 unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
1684 unsigned LegalVTSize = LegalVT.getStoreSize();
1685 // Number of source vectors after legalization:
1686 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1687 // Number of destination vectors after legalization:
1688 InstructionCost NumOfDests = LT.first;
1689
1690 auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
1691 LegalVT.getVectorNumElements());
1692
1693 if (!Mask.empty() && NumOfDests.isValid()) {
1694          // Try to perform a better estimation of the permutation.
1695          // 1. Split the source/destination vectors into real registers.
1696          // 2. Do the mask analysis to identify which real registers are
1697          // permuted. If more than one source register is used to build a
1698          // destination register, the cost for this destination register is
1699          // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1700          // source register is used, build the mask and calculate the cost as
1701          // the cost of PermuteSingleSrc.
1702          // Also, for the single-register permute we try to identify whether the
1703          // destination register is just a copy of the source register or a
1704          // copy of the previous destination register (the cost is
1705          // TTI::TCC_Basic). If the source register is just reused, the cost of
1706          // this operation is 0.
1707        NumOfDests =
1708            getTypeLegalizationCost(
1709                FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
1710                .first;
1711 unsigned E = *NumOfDests.getValue();
1712 unsigned NormalizedVF =
1713 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1714 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1715 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1716 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1717 copy(Mask, NormalizedMask.begin());
1718 unsigned PrevSrcReg = 0;
1719        ArrayRef<int> PrevRegMask;
1720        InstructionCost Cost = 0;
1721        processShuffleMasks(
1722            NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1723 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1724 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1725 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1726 // Check if the previous register can be just copied to the next
1727 // one.
1728 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1729                    PrevRegMask != RegMask)
1730                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1731                                         RegMask, CostKind, 0, nullptr);
1732 else
1733                  // Just a copy of previous destination register.
1734                  Cost += TTI::TCC_Basic;
1735                return;
1736 }
1737 if (SrcReg != DestReg &&
1738 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1739                // Just a copy of the source register.
1740                Cost += TTI::TCC_Basic;
1741              }
1742 PrevSrcReg = SrcReg;
1743 PrevRegMask = RegMask;
1744 },
1745 [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
1746 unsigned /*Unused*/,
1747 unsigned /*Unused*/) {
1748 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
1749 CostKind, 0, nullptr);
1750 });
1751 return Cost;
1752 }
1753
1754 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1755 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1756 std::nullopt, CostKind, 0, nullptr);
1757 }
1758
1759 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
1760 }
1761
1762 // For 2-input shuffles, we must account for splitting the 2 inputs into many.
1763 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
1764 // We assume that source and destination have the same vector type.
1765 InstructionCost NumOfDests = LT.first;
1766 InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
1767 LT.first = NumOfDests * NumOfShufflesPerDest;
1768 }
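  // Editorial arithmetic (not part of the original file): if LT.first == 2
  // (one split), then NumOfDests = 2 and NumOfShufflesPerDest = 2 * 2 - 1 = 3,
  // so LT.first becomes 6 legal-width shuffles before the per-type table cost
  // below is applied.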
1769
1770 static const CostTblEntry AVX512VBMIShuffleTbl[] = {
1771 {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
1772 {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
1773
1774 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
1775 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
1776
1777 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
1778 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
1779 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
1780 };
1781
1782 if (ST->hasVBMI())
1783 if (const auto *Entry =
1784 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1785 return LT.first * Entry->Cost;
1786
1787 static const CostTblEntry AVX512BWShuffleTbl[] = {
1788 {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
1789 {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
1790 {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
1791
1792 {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
1793 {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
1794 {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
1795 {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
1796
1797 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
1798 {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
1799 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
1800 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
1801 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
1802
1803 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
1804 {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
1805 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
1806 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
1807 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
1808
1809 {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
1810 {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
1811
1812 {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
1813 {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
1814 {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
1815 };
1816
1817 if (ST->hasBWI())
1818 if (const auto *Entry =
1819 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1820 return LT.first * Entry->Cost;
1821
1822 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1823 {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
1824 {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
1825 {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
1826 {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
1827 {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
1828 {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
1829 {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
1830
1831 {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1832 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1833 {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1834 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1835 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1836 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1837 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1838
1839 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1840 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1841 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1842 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1843 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1844 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1845 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1846 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1847 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1848 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1849 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1850
1851 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1852 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1853 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1854 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1855 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1856 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1857 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1858 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1859 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1860 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1861 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1862 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1863 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1864
1865 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1866 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1867 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1868 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1869 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1870 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1871 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1872 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1873 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1874 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1875 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1876 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1877
1878 // FIXME: This just applies the type legalization cost rules above
1879 // assuming these completely split.
1880 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1881 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1882 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1883 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1884 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1885 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1886
1887 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1888 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1889 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1890 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1891 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1892 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1893 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1894 };
1895
1896 if (ST->hasAVX512())
1897 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1898 if (auto KindCost = Entry->Cost[CostKind])
1899 return LT.first * *KindCost;
1900
1901 static const CostTblEntry AVX2ShuffleTbl[] = {
1902 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1903 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1904 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1905 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1906 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1907 {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
1908 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1909
1910 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1911 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1912 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1913 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1914 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1915 {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
1916 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1917
1918 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1919 {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
1920 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1921
1922 {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
1923 {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
1924 {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
1925 {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
1926 {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr
1927
1928 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1929 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1930 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1933 // + vpblendvb
1934 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
1935 // + vpblendvb
1936 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1937 // + vpblendvb
1938
1939 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1940 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1941 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1942 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1943 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1944 // + vpblendvb
1945 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
1946 // + vpblendvb
1947 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1948 // + vpblendvb
1949 };
1950
1951 if (ST->hasAVX2())
1952 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1953 return LT.first * Entry->Cost;
1954
1955 static const CostTblEntry XOPShuffleTbl[] = {
1956 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1957 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1958 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1959 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1960 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1961 // + vinsertf128
1962 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1963 // + vinsertf128
1964
1965 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1966 // + vinsertf128
1967 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1968 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1969 // + vinsertf128
1970 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1971 };
1972
1973 if (ST->hasXOP())
1974 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1975 return LT.first * Entry->Cost;
1976
1977 static const CostTblEntry AVX1ShuffleTbl[] = {
1978 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1979 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1980 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1981 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1982 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1983 {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
1984 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1985
1986 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1987 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1988 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1989 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1990 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1991 // + vinsertf128
1992 {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
1993 // + vinsertf128
1994 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1995 // + vinsertf128
1996
1997 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1998 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1999 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
2000 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
2001 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
2002 {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
2003 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
2004
2005 {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
2006 {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
2007 {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2008 {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2009 {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2010 {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2011 {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2012
2013 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
2014 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
2015 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2016 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2017 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
2018 // + 2*por + vinsertf128
2019 {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
2020 // + 2*por + vinsertf128
2021 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
2022 // + 2*por + vinsertf128
2023
2024 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
2025 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
2026 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
2027 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
2028 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
2029 // + 4*por + vinsertf128
2030 {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
2031 // + 4*por + vinsertf128
2032 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
2033 // + 4*por + vinsertf128
2034 };
2035
2036 if (ST->hasAVX())
2037 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2038 return LT.first * Entry->Cost;
2039
2040 static const CostTblEntry SSE41ShuffleTbl[] = {
2041 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
2042 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2043 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
2044 {TTI::SK_Select, MVT::v4f32, 1}, // blendps
2045 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
2046 {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
2047 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
2048 };
2049
2050 if (ST->hasSSE41())
2051 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2052 return LT.first * Entry->Cost;
2053
2054 static const CostTblEntry SSSE3ShuffleTbl[] = {
2055 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
2056 {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
2057 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
2058
2059 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
2060 {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
2061 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
2062
2063 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
2064 {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
2065 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
2066
2067 {TTI::SK_Splice, MVT::v4i32, 1}, // palignr
2068 {TTI::SK_Splice, MVT::v4f32, 1}, // palignr
2069 {TTI::SK_Splice, MVT::v8i16, 1}, // palignr
2070 {TTI::SK_Splice, MVT::v8f16, 1}, // palignr
2071 {TTI::SK_Splice, MVT::v16i8, 1}, // palignr
2072
2073 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
2074 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
2075 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
2076
2077 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
2078 {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
2079 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
2080 };
2081
2082 if (ST->hasSSSE3())
2083 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2084 return LT.first * Entry->Cost;
2085
2086 static const CostTblEntry SSE2ShuffleTbl[] = {
2087 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
2088 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
2089 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
2090 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
2091 {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
2092 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
2093
2094 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
2095 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
2096 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
2097 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
2098 {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
2099 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
2100 // + 2*pshufd + 2*unpck + packus
2101
2102 {TTI::SK_Select, MVT::v2i64, 1}, // movsd
2103 {TTI::SK_Select, MVT::v2f64, 1}, // movsd
2104 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
2105 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
2106 {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
2107 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
2108
2109 {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
2110 {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
2111 {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
2112      {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
2113      {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
2114      {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por
2115
2116 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
2117 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
2118 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
2119 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
2120 // + pshufd/unpck
2121 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
2122 // + pshufd/unpck
2123 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
2124 // + 2*pshufd + 2*unpck + 2*packus
2125
2126 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
2127 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
2128 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
2129 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
2130 { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
2131 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
2132 };
2133
2134 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2135 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2136 };
2137
2138 if (ST->hasSSE2()) {
2139 bool IsLoad =
2140 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2141 if (ST->hasSSE3() && IsLoad)
2142 if (const auto *Entry =
2143              CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2144        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
2145                                    LT.second.getVectorElementCount()) &&
2146 "Table entry missing from isLegalBroadcastLoad()");
2147 return LT.first * Entry->Cost;
2148 }
2149
2150 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2151 return LT.first * Entry->Cost;
2152 }
2153
2154 static const CostTblEntry SSE1ShuffleTbl[] = {
2155 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
2156 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
2157 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
2158 { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
2159 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
2160 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
2161 };
2162
2163 if (ST->hasSSE1())
2164 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2165 return LT.first * Entry->Cost;
2166
2167 return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
2168}
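// --- Editorial usage sketch (not part of the original file): how a client
// typically reaches this routine through the public TargetTransformInfo
// facade. `TTI` and `Ctx` are assumed to exist in the caller.
//
//   auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
//   InstructionCost C = TTI.getShuffleCost(
//       TargetTransformInfo::SK_Broadcast, VecTy, /*Mask=*/{},
//       TargetTransformInfo::TCK_RecipThroughput, /*Index=*/0,
//       /*SubTp=*/nullptr);
//
// On an AVX2 subtarget this lands on the AVX2ShuffleTbl entry above and is
// costed as a single vbroadcastps. --- End editorial sketch.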
2169
2170 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2171                                             Type *Src,
2172                                             TTI::CastContextHint CCH,
2173                                             TTI::TargetCostKind CostKind,
2174                                             const Instruction *I) {
2175 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2176 assert(ISD && "Invalid opcode");
2177
2178 // The cost tables include both specific, custom (non-legal) src/dst type
2179 // conversions and generic, legalized types. We test for customs first, before
2180 // falling back to legalization.
2181 // FIXME: Need a better design of the cost table to handle non-simple types of
2182 // potential massive combinations (elem_num x src_type x dst_type).
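  // Editorial note (not part of the original file): as a concrete instance of
  // the ordering described above, a sign_extend of <32 x i8> to <32 x i16> is
  // matched directly in the AVX512BW table below ({ 1, 1, 1, 1 }, a single
  // vpmovsxbw) before any generic legalization-based costing is consulted.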
2183 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2184 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2185 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2186
2187 // Mask sign extend has an instruction.
2188 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2189 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2190 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2191 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2192 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2193 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2194 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2195 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2196 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2197 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2198 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2199 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2200 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2201 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2202 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2203 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2204 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2205
2206 // Mask zero extend is a sext + shift.
2207 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2208 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2209 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2210 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2211 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2212 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2213 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2214 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2215 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2216 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2217 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2218 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2219 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2220 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2221 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2222 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2223 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2224
2225 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2226 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2227 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2228 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2229 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2230 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2231 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2232 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2233 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2234 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2235 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2236 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2237 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2238 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2239 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2240 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2241 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2242
2243 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2244 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2245 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2246 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2247 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2248 };
2249
2250 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2251 // Mask sign extend has an instruction.
2252 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2253 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2254 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2255 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2256 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2257 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2258 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2259 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2260
2261 // Mask zero extend is a sext + shift.
2262 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1, } },
2263 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1, } },
2264 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1, } },
2265 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1, } },
2266 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1, } },
2267 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1, } },
2268 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1, } },
2269 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1, } },
2270
2271 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2272 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2273 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2274 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2275 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2276 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2277 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2278 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2279
2280 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2281 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2282
2283 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2284 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2285
2286 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2287 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2288
2289 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2290 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2291 };
2292
2293 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2294 // 256-bit wide vectors.
2295
2296 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2297 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2298 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2299 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2300 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2301
2302 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2303 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2304 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2305 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2306 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2307 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2308 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2309 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2310 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2311 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2312 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2313 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2314 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2315 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2316 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2317 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2318 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2319 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2320 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2321 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2322 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2323 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2324 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2325 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2326 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2327 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2328 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2329 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2330 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2331 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2332 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2333 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2334 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2335 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
2336
2337 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2338 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2339 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2340
2341 // Sign extend is zmm vpternlogd+vptruncdb.
2342 // Zero extend is zmm broadcast load+vptruncdw.
2343 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2345 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2347 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2349 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2351
2352 // Sign extend is zmm vpternlogd+vptruncdw.
2353 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2354 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2356 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2358 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2360 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2361 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2362
2363 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2364 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2365 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2366 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2367 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2368 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2369 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2370 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2371 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2372 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2373
2374 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2375 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2376 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2377 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2378
2379 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2380 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2381 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2382 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2383 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2384 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2385 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2386 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2387 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2388 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2389
2390 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2391 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2392
2393 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2394 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2395 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2396 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2397 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2398 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2399 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2400 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2401
2402 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2403 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2404 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2405 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2406 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2407 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2408 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2409 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2410 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2411 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2412
2413 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2414 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2415 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2416 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2417 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2418 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2419 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2420 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2421 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2422 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2423 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2424
2425 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2426 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2428 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2429 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2430 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2431 };
2432
2433 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
2434 // Mask sign extend has an instruction.
2435 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2436 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2437 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2438 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2439 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2440 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2441 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2442 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2443 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2444 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2445 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2446 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2447 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2448 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2449 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2450 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2451 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2452
2453 // Mask zero extend is a sext + shift.
2454 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2455 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2456 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2457 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2458 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2459 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2460 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2461 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2462 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2463 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2464 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2465 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2466 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2467 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2468 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2469 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2470 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2471
2472 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2473 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2474 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2475 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2476 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2477 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2478 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2480 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2481 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2482 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2483 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2484 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2485 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2486 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2487 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2488 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2489
2490 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2491 };
2492
2493 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2494 // Mask sign extend has an instruction.
2495 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2497 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2499 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2501 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2502 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2503
2504 // Mask zero extend is a sext + shift.
2505 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2506 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2507 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2508 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2509 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2510 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2511 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2512 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2513
2514 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2515 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2516 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2517 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2518 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2519 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2520 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2521 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2522
2523 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2524 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2525 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2526 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2527
2528 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2529 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2530 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2531 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2532
2533 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2534 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2535 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2536 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2537
2538 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2539 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2540 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2541 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2542 };
2543
2544 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2545 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2546 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2547 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2548 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2549 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2550 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2551 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2552 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2553 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2554 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2555 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2556 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2557 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2558 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2559 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2560 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2561 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2562 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2563
2564 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2565 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2566 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2567 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2568 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2569 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2570 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2571 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2572 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2573 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2574
2575 // sign extend is vpcmpeq+maskedmove+vpmovdw
2576 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2577 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2578 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2580 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2582 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2584 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2585
2586 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2587 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2588 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2589 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2590 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2591 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2592 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2593 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2594
2595 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2596 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2597 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2598 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2599
2600 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2602 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2604 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2606 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2608 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2610 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2611 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2612
2613 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2614 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2615 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2616 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2617
2618 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2619 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2620 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2621 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2622 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2623 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2624 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2625 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2626 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2627 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2628 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2629 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2630 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2631
2632 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2633 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2634 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2635
2636 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2637 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2638 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2639 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2640 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2641 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2642 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2643 };
2644
2645 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2646 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2648 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2650 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2652
2653 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2654 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2655 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2656 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2657 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2658 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2659 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2660 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2661 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2662 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2663 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2664 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2665 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2666 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2667
2668 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2669
2670 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2671 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2672 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2673 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2674 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2675 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2676 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2677 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2678 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2679 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2680 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2681 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2682
2683 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2684 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2685
2686 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2687 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2688 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2689 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2690
2691 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2692 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2693 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2694 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2695 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2696 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2697 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2698 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2699
2700 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2701 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2702 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2703 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2704 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2705 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2706 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2707
2708 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2709 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2710 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2711 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2712 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2713 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2714 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2715 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2716 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2717 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2718 };
2719
2720 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2721 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2725 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2726 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2727
2728 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2729 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2732 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2733 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2734 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2735 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2736 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2737 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2738 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2739 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2740
2741 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2742 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2743 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2744 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2745 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2746
2747 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2748 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2749 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2750 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2751 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2752 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2753 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2754 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2755
2756 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2757 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2758 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2759 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2760 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2761 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2762 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2763 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2764 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2765 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2766 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2767 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2768
2769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2771 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2772 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2773 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2774 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2775 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2776 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2777 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2778 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2779 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2780 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2781 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2782 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2783 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2784 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2785 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2786
2787 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2788 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2789 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2790 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2791 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2792 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2793 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2794 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2795 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2796 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2797 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2798
2799 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2800 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2801 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2802 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2803 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2805 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2807 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2808 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2809 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2810 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2811 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2812
2813 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2814 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2815 };
2816
2817 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2818 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2819 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2820 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2821 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2822 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2823 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2824 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2825 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2826 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2827 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2828 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2829 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2830
2831 // These truncates end up widening elements.
2832 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVXZBQ
2833 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVXZWQ
2834 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVXZBD
2835
2836 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2837 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2838 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2839
2840 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2847 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2848 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2849 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2850 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2851
2852 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2858 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2859 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2860 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2861 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2862 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
2863 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
2864 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
2865 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
2866
2867 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2868 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2869 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2870 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2871 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2872 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2873 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2874 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2875 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2876 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2877
2878 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
2879 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2880 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
2881 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2882 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
2883 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
2884 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
2885 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
2886 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2887 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2888 };
2889
2890 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
2891 // These are somewhat magic numbers justified by comparing the
2892 // output of llvm-mca for our various supported scheduler models
2893 // and basing it off the worst case scenario.
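// Each 4-entry cost tuple below holds one cost per TargetCostKind, in the
// order { RecipThroughput, Latency, CodeSize, SizeAndLatency }, and is
// indexed via Entry->Cost[CostKind] in the lookups further down.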
2894 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2895 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2896 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
2906
2907 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
2920
2921 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2922 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2923 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2924 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
2925 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2926 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2927 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2931
2932 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
2933 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
2934 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
2935 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
2936 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
2937 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
2938 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
2939 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
2942
2943 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2944 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
2945 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
2946 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
2947 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2948 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
2949 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
2950 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
2951 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2952 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
2953 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2954 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
2955
2956 // These truncates are really widening elements.
2957 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
2958 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
2959 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
2960 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
2961 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
2962 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
2963
2964 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
2965 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
2966 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
2967 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
2968 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
2969 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
2970 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2971 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
2972 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
2973 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
2974 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
2975 };
2976
2977 // Attempt to map directly to (simple) MVT types to let us match custom entries.
2978 EVT SrcTy = TLI->getValueType(DL, Src);
2979 EVT DstTy = TLI->getValueType(DL, Dst);
2980
2981 // The function getSimpleVT only handles simple value types.
2982 if (SrcTy.isSimple() && DstTy.isSimple()) {
2983 MVT SimpleSrcTy = SrcTy.getSimpleVT();
2984 MVT SimpleDstTy = DstTy.getSimpleVT();
2985
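// Note: lookup order matters. Tables for the most specific feature set are
// consulted first (AVX512BW/DQ before AVX512F, AVX2 before AVX, SSE4.1
// before SSE2), so the first matching entry wins.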
2986 if (ST->useAVX512Regs()) {
2987 if (ST->hasBWI())
2988 if (const auto *Entry = ConvertCostTableLookup(
2989 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2990 if (auto KindCost = Entry->Cost[CostKind])
2991 return *KindCost;
2992
2993 if (ST->hasDQI())
2994 if (const auto *Entry = ConvertCostTableLookup(
2995 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
2996 if (auto KindCost = Entry->Cost[CostKind])
2997 return *KindCost;
2998
2999 if (ST->hasAVX512())
3000 if (const auto *Entry = ConvertCostTableLookup(
3001 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3002 if (auto KindCost = Entry->Cost[CostKind])
3003 return *KindCost;
3004 }
3005
3006 if (ST->hasBWI())
3007 if (const auto *Entry = ConvertCostTableLookup(
3008 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3009 if (auto KindCost = Entry->Cost[CostKind])
3010 return *KindCost;
3011
3012 if (ST->hasDQI())
3013 if (const auto *Entry = ConvertCostTableLookup(
3014 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3015 if (auto KindCost = Entry->Cost[CostKind])
3016 return *KindCost;
3017
3018 if (ST->hasAVX512())
3019 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3020 SimpleDstTy, SimpleSrcTy))
3021 if (auto KindCost = Entry->Cost[CostKind])
3022 return *KindCost;
3023
3024 if (ST->hasAVX2()) {
3025 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3026 SimpleDstTy, SimpleSrcTy))
3027 if (auto KindCost = Entry->Cost[CostKind])
3028 return *KindCost;
3029 }
3030
3031 if (ST->hasAVX()) {
3032 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3033 SimpleDstTy, SimpleSrcTy))
3034 if (auto KindCost = Entry->Cost[CostKind])
3035 return *KindCost;
3036 }
3037
3038 if (ST->hasSSE41()) {
3039 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3040 SimpleDstTy, SimpleSrcTy))
3041 if (auto KindCost = Entry->Cost[CostKind])
3042 return *KindCost;
3043 }
3044
3045 if (ST->hasSSE2()) {
3046 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3047 SimpleDstTy, SimpleSrcTy))
3048 if (auto KindCost = Entry->Cost[CostKind])
3049 return *KindCost;
3050 }
3051 }
3052
3053 // Fall back to legalized types.
3054 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3055 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3056
3057 // If we're truncating to the same legalized type, just assume it's free.
3058 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3059 return TTI::TCC_Free;
3060
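// Each lookup below scales the matched per-type cost by
// std::max(LTSrc.first, LTDest.first): if legalization splits either type
// into N registers, the conversion is assumed to repeat N times.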
3061 if (ST->useAVX512Regs()) {
3062 if (ST->hasBWI())
3063 if (const auto *Entry = ConvertCostTableLookup(
3064 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3065 if (auto KindCost = Entry->Cost[CostKind])
3066 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3067
3068 if (ST->hasDQI())
3069 if (const auto *Entry = ConvertCostTableLookup(
3070 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3071 if (auto KindCost = Entry->Cost[CostKind])
3072 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3073
3074 if (ST->hasAVX512())
3075 if (const auto *Entry = ConvertCostTableLookup(
3076 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3077 if (auto KindCost = Entry->Cost[CostKind])
3078 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3079 }
3080
3081 if (ST->hasBWI())
3082 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3083 LTDest.second, LTSrc.second))
3084 if (auto KindCost = Entry->Cost[CostKind])
3085 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3086
3087 if (ST->hasDQI())
3088 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3089 LTDest.second, LTSrc.second))
3090 if (auto KindCost = Entry->Cost[CostKind])
3091 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3092
3093 if (ST->hasAVX512())
3094 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3095 LTDest.second, LTSrc.second))
3096 if (auto KindCost = Entry->Cost[CostKind])
3097 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3098
3099 if (ST->hasAVX2())
3100 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3101 LTDest.second, LTSrc.second))
3102 if (auto KindCost = Entry->Cost[CostKind])
3103 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3104
3105 if (ST->hasAVX())
3106 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3107 LTDest.second, LTSrc.second))
3108 if (auto KindCost = Entry->Cost[CostKind])
3109 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3110
3111 if (ST->hasSSE41())
3112 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3113 LTDest.second, LTSrc.second))
3114 if (auto KindCost = Entry->Cost[CostKind])
3115 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3116
3117 if (ST->hasSSE2())
3118 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3119 LTDest.second, LTSrc.second))
3120 if (auto KindCost = Entry->Cost[CostKind])
3121 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3122
3123 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source
3124 // to i32 first and then use sitofp.
3125 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3126 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3127 Type *ExtSrc = Src->getWithNewBitWidth(32);
3128 unsigned ExtOpc =
3129 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3130
3131 // For scalar loads the extend would be free.
3132 InstructionCost ExtCost = 0;
3133 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3134 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3135
3136 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3137 TTI::CastContextHint::None, CostKind);
3138 }
3139
3140 // Fallback: for fptosi/fptoui to i8/i16 we convert to i32 first and then
3141 // truncate the result.
3142 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3143 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3144 Type *TruncDst = Dst->getWithNewBitWidth(32);
3145 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3146 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3147 TTI::CastContextHint::None, CostKind);
3148 }
3149
3150 // TODO: Allow non-throughput costs that aren't binary.
3151 auto AdjustCost = [&CostKind](InstructionCost Cost,
3152 InstructionCost N = 1) -> InstructionCost {
3153 if (CostKind != TTI::TCK_RecipThroughput)
3154 return Cost == 0 ? 0 : N;
3155 return Cost * N;
3156 };
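// For TCK_RecipThroughput the base cost is scaled by N; for all other cost
// kinds the answer collapses to a binary result of 0 (free) or N. The call
// below uses the default N of 1.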
3157 return AdjustCost(
3158 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3159}
3160
3161 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3162 Type *CondTy,
3163 CmpInst::Predicate VecPred,
3164 TTI::TargetCostKind CostKind,
3165 const Instruction *I) {
3166 // Early out if this type isn't scalar/vector integer/float.
3167 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3168 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3169 I);
3170
3171 // Legalize the type.
3172 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3173
3174 MVT MTy = LT.second;
3175
3176 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3177 assert(ISD && "Invalid opcode");
3178
3179 InstructionCost ExtraCost = 0;
3180 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3181 // Some vector comparison predicates cost extra instructions.
3182 // TODO: Adjust ExtraCost based on CostKind?
3183 // TODO: Should we invert this and assume worst case cmp costs
3184 // and reduce for particular predicates?
3185 if (MTy.isVector() &&
3186 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3187 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3188 ST->hasBWI())) {
3189 // Fallback to I if a specific predicate wasn't specified.
3190 CmpInst::Predicate Pred = VecPred;
3191 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3192 Pred == CmpInst::BAD_FCMP_PREDICATE))
3193 Pred = cast<CmpInst>(I)->getPredicate();
3194
3195 bool CmpWithConstant = false;
3196 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3197 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3198
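// A constant RHS often lets the expansion fold the predicate adjustment
// into the constant itself, hence the cheaper CmpWithConstant costs below.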
3199 switch (Pred) {
3200 case CmpInst::Predicate::ICMP_NE:
3201 // xor(cmpeq(x,y),-1)
3202 ExtraCost = CmpWithConstant ? 0 : 1;
3203 break;
3204 case CmpInst::Predicate::ICMP_SGE:
3205 case CmpInst::Predicate::ICMP_SLE:
3206 // xor(cmpgt(x,y),-1)
3207 ExtraCost = CmpWithConstant ? 0 : 1;
3208 break;
3209 case CmpInst::Predicate::ICMP_ULT:
3210 case CmpInst::Predicate::ICMP_UGT:
3211 // cmpgt(xor(x,signbit),xor(y,signbit))
3212 // xor(cmpeq(pmaxu(x,y),x),-1)
3213 ExtraCost = CmpWithConstant ? 1 : 2;
3214 break;
3215 case CmpInst::Predicate::ICMP_ULE:
3216 case CmpInst::Predicate::ICMP_UGE:
3217 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3218 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3219 // cmpeq(psubus(x,y),0)
3220 // cmpeq(pminu(x,y),x)
3221 ExtraCost = 1;
3222 } else {
3223 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3224 ExtraCost = CmpWithConstant ? 2 : 3;
3225 }
3226 break;
3227 case CmpInst::Predicate::FCMP_ONE:
3228 case CmpInst::Predicate::FCMP_UEQ:
3229 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3230 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3231 if (CondTy && !ST->hasAVX())
3232 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3233 CmpInst::Predicate::FCMP_UNO, CostKind) +
3234 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3235 CmpInst::Predicate::FCMP_OEQ, CostKind) +
3236 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3237
3238 break;
3239 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3240 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3241 // Assume worst case scenario and add the maximum extra cost.
3242 ExtraCost = 3;
3243 break;
3244 default:
3245 break;
3246 }
3247 }
3248 }
3249
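// As with the conversion tables, the most specific table is tried first:
// the SLM (Silvermont) overrides, then the feature-level cascade from
// AVX-512 down to SSE1.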
3250 static const CostKindTblEntry SLMCostTbl[] = {
3251 // slm pcmpeq/pcmpgt throughput is 2
3252 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3253 // slm pblendvb/blendvpd/blendvps throughput is 4
3254 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3255 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3256 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3257 { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3258 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3259 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3260 };
3261
3262 static const CostKindTblEntry AVX512BWCostTbl[] = {
3263 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3264 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3265 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3266 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3267
3268 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3269 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3270 };
3271
3272 static const CostKindTblEntry AVX512CostTbl[] = {
3273 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3274 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3275 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3276 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3277
3278 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3279 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3280 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3281 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3282 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3283 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3284 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3285
3286 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3287 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3288 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3289 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3290 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3291 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3292 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3293 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3294 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3295 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3296 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3297 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3298 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3299 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3300
3301 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3302 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3303 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3304 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3305 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3306 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3307 };
3308
3309 static const CostKindTblEntry AVX2CostTbl[] = {
3310 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3311 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3312 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3313 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3314 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3315 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3316
3317 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3318 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3319 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3320 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3321
3322 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3323 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3324 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3325 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3326 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3327 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3328 };
3329
3330 static const CostKindTblEntry XOPCostTbl[] = {
3331 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3332 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3333 };
3334
3335 static const CostKindTblEntry AVX1CostTbl[] = {
3336 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3337 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3338 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3339 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3340 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3341 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3342
3343 // AVX1 does not support 8-wide integer compare.
3344 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3345 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3346 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3347 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3348
3349 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3350 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3351 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3352 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3353 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3354 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3355 };
3356
3357 static const CostKindTblEntry SSE42CostTbl[] = {
3358 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3359 };
3360
3361 static const CostKindTblEntry SSE41CostTbl[] = {
3362 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3363 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3364
3365 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3366 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3367 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3368 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3369 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3370 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3371 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3372 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3373 };
3374
3375 static const CostKindTblEntry SSE2CostTbl[] = {
3376 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3377 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3378
3379 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3380 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3381 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3382 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3383
3384 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3385 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3386 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3387 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3388 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3389 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3390 };
3391
3392 static const CostKindTblEntry SSE1CostTbl[] = {
3393 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3394 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3395
3396 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3397 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3398 };
3399
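// Final cost = LT.first (the legalization split count) * (the table cost
// for the requested cost kind + any predicate expansion cost computed
// above as ExtraCost).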
3400 if (ST->useSLMArithCosts())
3401 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3402 if (auto KindCost = Entry->Cost[CostKind])
3403 return LT.first * (ExtraCost + *KindCost);
3404
3405 if (ST->hasBWI())
3406 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3407 if (auto KindCost = Entry->Cost[CostKind])
3408 return LT.first * (ExtraCost + *KindCost);
3409
3410 if (ST->hasAVX512())
3411 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3412 if (auto KindCost = Entry->Cost[CostKind])
3413 return LT.first * (ExtraCost + *KindCost);
3414
3415 if (ST->hasAVX2())
3416 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3417 if (auto KindCost = Entry->Cost[CostKind])
3418 return LT.first * (ExtraCost + *KindCost);
3419
3420 if (ST->hasXOP())
3421 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3422 if (auto KindCost = Entry->Cost[CostKind])
3423 return LT.first * (ExtraCost + *KindCost);
3424
3425 if (ST->hasAVX())
3426 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3427 if (auto KindCost = Entry->Cost[CostKind])
3428 return LT.first * (ExtraCost + *KindCost);
3429
3430 if (ST->hasSSE42())
3431 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3432 if (auto KindCost = Entry->Cost[CostKind])
3433 return LT.first * (ExtraCost + *KindCost);
3434
3435 if (ST->hasSSE41())
3436 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3437 if (auto KindCost = Entry->Cost[CostKind])
3438 return LT.first * (ExtraCost + *KindCost);
3439
3440 if (ST->hasSSE2())
3441 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3442 if (auto KindCost = Entry->Cost[CostKind])
3443 return LT.first * (ExtraCost + *KindCost);
3444
3445 if (ST->hasSSE1())
3446 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3447 if (auto KindCost = Entry->Cost[CostKind])
3448 return LT.first * (ExtraCost + *KindCost);
3449
3450 // Assume a 3cy latency for fp select ops.
3451 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3452 if (ValTy->getScalarType()->isFloatingPointTy())
3453 return 3;
3454
3455 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3456}
3457
3459
3460 InstructionCost
3461 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3462 TTI::TargetCostKind CostKind) {
3463 // Costs should match the codegen from:
3464 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3465 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3466 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3467 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3468 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3469
3470 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3471 // specialized in these tables yet.
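// AVX512-VBMI2 adds concatenate-and-shift instructions (VPSHLD/VPSHRD and
// their variable forms), which is why the funnel shifts and rotates below
// are single-instruction.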
3472 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3473 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3474 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3475 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3476 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3477 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3478 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3479 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3480 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3481 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3482 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3483 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3484 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3485 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3486 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3487 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3488 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3489 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3490 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3491 };
3492 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3493 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3494 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3495 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3496 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3497 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3498 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3499 };
3500 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3501 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3502 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3503 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3504 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3505 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3506 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3507 };
3508 static const CostKindTblEntry AVX512CDCostTbl[] = {
3509 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3510 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3511 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3512 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3513 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3514 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3515 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3516 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3517 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3518 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3519 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3520 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3521
3522 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3523 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3524 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3525 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3526 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3527 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3528 };
3529 static const CostKindTblEntry AVX512BWCostTbl[] = {
3530 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3531 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3532 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3533 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3534 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3535 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3536 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3537 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3538 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3539 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3540 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3541 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3542 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3543 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3544 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3545 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3546 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3547 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3548 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3549 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3550 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3551 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3552 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3553 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3554 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3555 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3556 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3557 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3558 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3559 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3560 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3561 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3562 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3563 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3564 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3565 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3566 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3567 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3568 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3569 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3570 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3571 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3572 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3573 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3574 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3575 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3576 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3577 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3578 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3579 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3580 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3581 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3582 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3583 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3584 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3585 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3586 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3587 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3588 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3589 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3590 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3591 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3592 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3593 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3594 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3595 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3596 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3597 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3598 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3599 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3600 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3601 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3602 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3603 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3604 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3605 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3606 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3607 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3608 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3609 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3610 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3611 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3612 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3613 };
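// Without VPOPCNTDQ/BITALG there is no native vector popcount, so the CTPOP
// rows above model the usual VPSHUFB nibble-LUT expansion, which BWI can at
// least run on full 512-bit registers. Roughly:
//   lo  = v & 0x0f;  hi = (v >> 4) & 0x0f;
//   cnt = pshufb(LUT, lo) + pshufb(LUT, hi);  // per-byte popcount
//   // i16/i32/i64 elements then accumulate the byte counts (e.g. vpsadbw).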
3614 static const CostKindTblEntry AVX512CostTbl[] = {
3615 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3616 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3617 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3618 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3619 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3620 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3621 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3622 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3623 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3624 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3625 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3626 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3627 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3628 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3629 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3630 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3631 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3632 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3633 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3634 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3635 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3636 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3637 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3638 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3639 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3640 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3641 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3642 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3643 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3644 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3645 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3646 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3647 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3648 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3649 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3650 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3651 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3652 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3653 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3654 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3655 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3656 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3657 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3658 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3659 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3660 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3661 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3662 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3663 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3664 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3665 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3666 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3667 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3668 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3669 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3670 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3671 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3672 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3673 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3674 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3675 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3676 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3677 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3678 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3679 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3680 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3681 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3682 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3683 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3684 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3685 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3686 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3687 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3688 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3689 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3690 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3691 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3692 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3693 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3694 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3695 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3696 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3697 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3698 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3699 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3700 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3701 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3702 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3703 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3704 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3705 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3706 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3707 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3708 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3709 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3710 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3711 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3712 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3713 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3714 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3715 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3716 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3717 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3718 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3719 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3720 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3721 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3722 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3723 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3724 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3725 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3726 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3727 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3728 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3729 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3730 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3731 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3732 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3733 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3734 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3735 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3736 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3737 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3738 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3739 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3740 };
3741 static const CostKindTblEntry XOPCostTbl[] = {
3742 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3743 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3744 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3745 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3746 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3747 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3748 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3749 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3750 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3751 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3752 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3753 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3754 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3755 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3756 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3757 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3758 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3759 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3760 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3761 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3762 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3763 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3764 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3765 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3766 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3767 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3768 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3769 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3770 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3771 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3772 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3773 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3774 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3775 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3776 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3777 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3778 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3779 };
3780 static const CostKindTblEntry AVX2CostTbl[] = {
3781 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3782 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3783 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3784 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3785 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3786 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3787 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3788 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3789 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3790 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3791 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3792 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3793 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3794 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3795 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3796 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3797 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3798 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3799 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3800 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3801 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3802 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3803 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3804 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3805 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3806 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3807 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3808 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3809 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3810 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3811 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3812 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3813 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3814 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3815 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3816 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3817 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3818 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3819 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3820 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3821 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3822 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3823 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3824 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3825 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3826 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3827 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
3828 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
3829 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
3830 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
3831 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3832 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3833 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
3834 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
3835 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3836 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3837 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3838 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
3839 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
3840 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3841 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3842 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3843 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3844 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
3845 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
3846 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
3847 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
3848 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3849 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
3850 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
3851 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
3852 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
3853 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
3854 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
3855 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3856 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3857 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
3858 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
3859 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
3860 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3861 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3862 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
3863 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
3864 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
3865 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
3866 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
3867 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
3868 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
3869 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
3870 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
3871 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
3872 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
3873 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
3874 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
3875 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
3876 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
3877 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
3878 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
3879 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
3880 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
3881 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
3882 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
3883 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
3884 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
3885 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3886 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3887 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
3888 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
3889 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3890 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
3891 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
3892 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
3893 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
3894 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
3895 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
3896 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
3897 };
3898 static const CostKindTblEntry AVX1CostTbl[] = {
3899 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3900 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
3901 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
3902 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
3903 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3904 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
3905 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3906 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
3907 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
3908 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
3909 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
3910 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
3911 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
3912 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
3913 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
3914 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
3915 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
3916 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
3917 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
3918 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
3919 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
3920 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
3921 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
3922 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
3923 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3924 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
3925 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
3926 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
3927 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3928 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
3929 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
3930 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
3931 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
3932 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
3933 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
3934 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
3935 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
3936 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
3937 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
3938 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
3939 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
3940 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
3941 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
3942 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
3943 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
3944 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3945 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3946 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3947 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
3948 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3949 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3950 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3951 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
3952 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
3953 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3954 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3955 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3956 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
3957 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
3958 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
3959 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
3960 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
3961 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
3962 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
3963 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
3964 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
3965 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
3966 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
3967 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3968 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3969 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
3970 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
3971 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
3972 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3973 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3974 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3975 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
3976 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3977 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3978 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3979 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
3980 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
3981 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3982 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3983 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3984 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
3985 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
3986 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
3987 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
3988 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
3989 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
3990 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
3991 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
3992 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
3993 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
3994   { ISD::USUBSAT,  MVT::v8i32,  {  4,  4,  7,  8 } }, // 2 x 128-bit Op + extract/insert
3996 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3997 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
3998 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
3999 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4000 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4001 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4002 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4003 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4004 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4005 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4006 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4007 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4008 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4009 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4010 };
4011 static const CostKindTblEntry GFNICostTbl[] = {
4012 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4013 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4014 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4015 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4016 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4017 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4018 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4019 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4020 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4021 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4022 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4023 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4024 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4025 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4026 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4027 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4028 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4029 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4030 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4031 };
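// GF2P8AFFINEQB multiplies each byte by an 8x8 bit-matrix over GF(2), so with
// a suitable constant matrix it reverses the bits of every byte in a single
// instruction; wider-element BITREVERSE only adds a byte shuffle, and the
// scalar rows also pay for the GPR <-> XMM moves.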
4032 static const CostKindTblEntry GLMCostTbl[] = {
4033 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4034 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4035 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4036 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4037 };
4038 static const CostKindTblEntry SLMCostTbl[] = {
4039 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4040 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4041 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4042 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4043 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4044 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4045 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4046 };
4047 static const CostKindTblEntry SSE42CostTbl[] = {
4048 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4049 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4050 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4051 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4052 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4053 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4054 };
4055 static const CostKindTblEntry SSE41CostTbl[] = {
4056 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4057 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4058 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4059 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4060 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4061 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4062 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4063 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4064 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4065 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4066 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4067 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4068 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4069 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4070 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4071 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4072 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4073 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4074 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4075 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4076 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4077 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4078 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4079 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4080 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4081 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4082 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4083 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4084 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4085 };
4086 static const CostKindTblEntry SSSE3CostTbl[] = {
4087 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4088 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4089 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4090 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4091 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4092 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4093 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4094 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4095 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4096 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4097 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4098 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4099 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4100 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4101 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4102 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4103 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4104 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4105 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4106 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4107 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4108 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4109 };
4110 static const CostKindTblEntry SSE2CostTbl[] = {
4111 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4112 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4113 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4114 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4115 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4116 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4117 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4118 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4119 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4120 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4121 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4122 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4123 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4124 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4125 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4126 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4127 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4128 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4129 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4130 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4131 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4132 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4133 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4134 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4135 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4136 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4137 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4138 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4139 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4140 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4141 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4142 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4143 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4144 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4145 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4146 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4147 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4148 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4149 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4150 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4151 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4152 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4153 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4154 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4155 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4156 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4157 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4158 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4159 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4160 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4161 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4162 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4163 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4164 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4165 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4166 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4167 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4168 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4169 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4170 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4171 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4172 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4173 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4174 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4175 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4176 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4177 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4178 };
4179 static const CostKindTblEntry SSE1CostTbl[] = {
4180 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4181 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4182 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4183 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4184 };
4185 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4186 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4187 };
4188 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4189 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4190 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4191 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4192 };
4193 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4194 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4195 };
4196 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4197 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4198 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4199 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4200 };
4201 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4202 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4203 };
4204 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4205 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4206 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4207 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4208 };
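// The i16/i8 rows are modelled as popcnt(zext()): the operand is first
// zero-extended so a single 32-bit POPCNT can be used, hence the extra
// TCK_CodeSize/TCK_SizeAndLatency unit.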
4209 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4210 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4211 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4212 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4213 { ISD::CTLZ, MVT::i64, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4214 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4215 { ISD::CTTZ, MVT::i64, { 2, 2, 5, 5 } }, // TEST+BSF+CMOV/BRANCH
4216 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4217 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4218 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4219 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4220 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4221 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4222 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4223 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4224 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4225 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4226 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4227 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4228 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4229 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4230 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4231 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4232 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4233 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4234 };
4235 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4236 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4237 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4238 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4239 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4240 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4241 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4242 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4243 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4244 { ISD::CTLZ, MVT::i32, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4245 { ISD::CTLZ, MVT::i16, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4246 { ISD::CTLZ, MVT::i8, { 3, 2, 7, 7 } }, // BSR+XOR or BSR+XOR+CMOV
4247 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4248 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4249 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4250 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4251 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4252 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4253 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4254 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4255 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4256 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4257 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4258 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4259 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4260 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4261 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4262 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4263 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4264 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4265 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4266 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4267 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4268 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4269 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4270 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4271 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4272 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4273 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4274 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4275 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4276 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4277 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4278 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4279 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4280 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4281 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4282 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4283 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4284 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4285 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4286 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4287 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4288 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4289 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4290 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4291 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4292 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4293 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4294 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4295 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4296 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4297 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4298 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4299 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4300 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4301 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4302 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4303 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4304 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4305 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4306 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4307 };
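// The lookups below consult these tables from the most feature-specific to
// the generic baseline (tuning tables first, then the AVX512 variants, XOP,
// AVX2/AVX, the SSE levels, and finally the scalar BMI/LZCNT/POPCNT and
// x86-64/x86 tables), so the first table containing a matching (ISD, MTy)
// row decides the cost.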
4308
4309 Type *RetTy = ICA.getReturnType();
4310 Type *OpTy = RetTy;
4311 Intrinsic::ID IID = ICA.getID();
4312 unsigned ISD = ISD::DELETED_NODE;
4313 switch (IID) {
4314 default:
4315 break;
4316 case Intrinsic::abs:
4317 ISD = ISD::ABS;
4318 break;
4319 case Intrinsic::bitreverse:
4320 ISD = ISD::BITREVERSE;
4321 break;
4322 case Intrinsic::bswap:
4323 ISD = ISD::BSWAP;
4324 break;
4325 case Intrinsic::ctlz:
4326 ISD = ISD::CTLZ;
4327 break;
4328 case Intrinsic::ctpop:
4329 ISD = ISD::CTPOP;
4330 break;
4331 case Intrinsic::cttz:
4332 ISD = ISD::CTTZ;
4333 break;
4334 case Intrinsic::fshl:
4335 ISD = ISD::FSHL;
4336 if (!ICA.isTypeBasedOnly()) {
4337 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4338 if (Args[0] == Args[1]) {
4339 ISD = ISD::ROTL;
4340 // Handle uniform constant rotation amounts.
4341 // TODO: Handle funnel-shift cases.
4342 const APInt *Amt;
4343           if (Args[2] &&
4344               PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4345             ISD = X86ISD::VROTLI;
4346 }
4347 }
4348 break;
4349 case Intrinsic::fshr:
4350     // FSHR has the same costs, so don't duplicate.
4351 ISD = ISD::FSHL;
4352 if (!ICA.isTypeBasedOnly()) {
4353 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4354 if (Args[0] == Args[1]) {
4355 ISD = ISD::ROTR;
4356 // Handle uniform constant rotation amount.
4357 // TODO: Handle funnel-shift cases.
4358 const APInt *Amt;
4359           if (Args[2] &&
4360               PatternMatch::match(Args[2], PatternMatch::m_APInt(Amt)))
4361             ISD = X86ISD::VROTLI;
4362 }
4363 }
4364 break;
4365 case Intrinsic::lrint:
4366 case Intrinsic::llrint:
4367 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4368     // have the same costs as the CVTTP2SI (fptosi) instructions.
4369 if (!ICA.isTypeBasedOnly()) {
4370 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4371       return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4372                               TTI::CastContextHint::None, CostKind);
4373     }
4374 break;
4375 case Intrinsic::maxnum:
4376 case Intrinsic::minnum:
4377     // FMINNUM has the same costs, so don't duplicate.
4378 ISD = ISD::FMAXNUM;
4379 break;
4380 case Intrinsic::sadd_sat:
4381 ISD = ISD::SADDSAT;
4382 break;
4383 case Intrinsic::smax:
4384 ISD = ISD::SMAX;
4385 break;
4386 case Intrinsic::smin:
4387 ISD = ISD::SMIN;
4388 break;
4389 case Intrinsic::ssub_sat:
4390 ISD = ISD::SSUBSAT;
4391 break;
4392 case Intrinsic::uadd_sat:
4393 ISD = ISD::UADDSAT;
4394 break;
4395 case Intrinsic::umax:
4396 ISD = ISD::UMAX;
4397 break;
4398 case Intrinsic::umin:
4399 ISD = ISD::UMIN;
4400 break;
4401 case Intrinsic::usub_sat:
4402 ISD = ISD::USUBSAT;
4403 break;
4404 case Intrinsic::sqrt:
4405 ISD = ISD::FSQRT;
4406 break;
4407 case Intrinsic::sadd_with_overflow:
4408 case Intrinsic::ssub_with_overflow:
4409     // SSUBO has the same costs, so don't duplicate.
4410 ISD = ISD::SADDO;
4411 OpTy = RetTy->getContainedType(0);
4412 break;
4413 case Intrinsic::uadd_with_overflow:
4414 case Intrinsic::usub_with_overflow:
4415     // USUBO has the same costs, so don't duplicate.
4416 ISD = ISD::UADDO;
4417 OpTy = RetTy->getContainedType(0);
4418 break;
4419 case Intrinsic::smul_with_overflow:
4420 ISD = ISD::SMULO;
4421 OpTy = RetTy->getContainedType(0);
4422 break;
4423 case Intrinsic::umul_with_overflow:
4424 ISD = ISD::UMULO;
4425 OpTy = RetTy->getContainedType(0);
4426 break;
4427 }
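// For example, a call such as llvm.fshl.v4i32(%x, %x, splat(5)) has matching
// shift operands and a uniform constant amount, so it is classified as
// X86ISD::VROTLI above and costed from the rotate-by-immediate rows.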
4428
4429 if (ISD != ISD::DELETED_NODE) {
4430 auto adjustTableCost = [&](int ISD, unsigned Cost,
4431                                std::pair<InstructionCost, MVT> LT,
4432                                FastMathFlags FMF) {
4433       InstructionCost LegalizationCost = LT.first;
4434 MVT MTy = LT.second;
4435
4436 // If there are no NANs to deal with, then these are reduced to a
4437 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4438 // assume is used in the non-fast case.
4439 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4440 if (FMF.noNaNs())
4441 return LegalizationCost * 1;
4442 }
4443
4444 // For cases where some ops can be folded into a load/store, assume free.
4445 if (MTy.isScalarInteger()) {
4446 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4447 if (const Instruction *II = ICA.getInst()) {
4448 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4449 return TTI::TCC_Free;
4450 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4451 if (LI->hasOneUse())
4452 return TTI::TCC_Free;
4453 }
4454 }
4455 }
4456 }
4457
4458 return LegalizationCost * (int)Cost;
4459 };
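// Note that LegalizationCost (LT.first) scales the table cost by the number
// of legal registers the type splits into; e.g. with plain AVX2, v16i32
// legalizes to 2 x v8i32, so a unit table cost is reported as 2.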
4460
4461 // Legalize the type.
4462 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4463 MVT MTy = LT.second;
4464
4465     // Without BMI/LZCNT, see if we're only looking for a *_ZERO_UNDEF cost.
4466 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4467 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4468 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4469 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4470 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4471         if (Cst->isAllOnesValue())
4472           ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4473     }
4474
4475 // FSQRT is a single instruction.
4476 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4477 return LT.first;
4478
4479 if (ST->useGLMDivSqrtCosts())
4480 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4481 if (auto KindCost = Entry->Cost[CostKind])
4482 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4483
4484 if (ST->useSLMArithCosts())
4485 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4486 if (auto KindCost = Entry->Cost[CostKind])
4487 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4488
4489 if (ST->hasVBMI2())
4490 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4491 if (auto KindCost = Entry->Cost[CostKind])
4492 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4493
4494 if (ST->hasBITALG())
4495 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4496 if (auto KindCost = Entry->Cost[CostKind])
4497 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4498
4499 if (ST->hasVPOPCNTDQ())
4500 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4501 if (auto KindCost = Entry->Cost[CostKind])
4502 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4503
4504 if (ST->hasGFNI())
4505 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4506 if (auto KindCost = Entry->Cost[CostKind])
4507 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4508
4509 if (ST->hasCDI())
4510 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4511 if (auto KindCost = Entry->Cost[CostKind])
4512 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4513
4514 if (ST->hasBWI())
4515 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4516 if (auto KindCost = Entry->Cost[CostKind])
4517 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4518
4519 if (ST->hasAVX512())
4520 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4521 if (auto KindCost = Entry->Cost[CostKind])
4522 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4523
4524 if (ST->hasXOP())
4525 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4526 if (auto KindCost = Entry->Cost[CostKind])
4527 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4528
4529 if (ST->hasAVX2())
4530 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4531 if (auto KindCost = Entry->Cost[CostKind])
4532 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4533
4534 if (ST->hasAVX())
4535 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4536 if (auto KindCost = Entry->Cost[CostKind])
4537 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4538
4539 if (ST->hasSSE42())
4540 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4541 if (auto KindCost = Entry->Cost[CostKind])
4542 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4543
4544 if (ST->hasSSE41())
4545 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4546 if (auto KindCost = Entry->Cost[CostKind])
4547 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4548
4549 if (ST->hasSSSE3())
4550 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4551 if (auto KindCost = Entry->Cost[CostKind])
4552 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4553
4554 if (ST->hasSSE2())
4555 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4556 if (auto KindCost = Entry->Cost[CostKind])
4557 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4558
4559 if (ST->hasSSE1())
4560 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4561 if (auto KindCost = Entry->Cost[CostKind])
4562 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4563
4564 if (ST->hasBMI()) {
4565 if (ST->is64Bit())
4566 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4567 if (auto KindCost = Entry->Cost[CostKind])
4568 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4569
4570 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4571 if (auto KindCost = Entry->Cost[CostKind])
4572 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4573 }
4574
4575 if (ST->hasLZCNT()) {
4576 if (ST->is64Bit())
4577 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4578 if (auto KindCost = Entry->Cost[CostKind])
4579 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4580
4581 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4582 if (auto KindCost = Entry->Cost[CostKind])
4583 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4584 }
4585
4586 if (ST->hasPOPCNT()) {
4587 if (ST->is64Bit())
4588 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4589 if (auto KindCost = Entry->Cost[CostKind])
4590 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4591
4592 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4593 if (auto KindCost = Entry->Cost[CostKind])
4594 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4595 }
4596
4597 if (ST->is64Bit())
4598 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4599 if (auto KindCost = Entry->Cost[CostKind])
4600 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4601
4602 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4603 if (auto KindCost = Entry->Cost[CostKind])
4604 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4605 }
4606
4607   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4608 }
4609
4610 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4611                                                TTI::TargetCostKind CostKind,
4612                                                unsigned Index, Value *Op0,
4613 Value *Op1) {
4614 static const CostTblEntry SLMCostTbl[] = {
4615 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4616 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4617 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4618 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4619 };
4620
4621 assert(Val->isVectorTy() && "This must be a vector type");
4622 Type *ScalarType = Val->getScalarType();
4623 InstructionCost RegisterFileMoveCost = 0;
4624
4625 // Non-immediate extraction/insertion can be handled as a sequence of
4626 // aliased loads+stores via the stack.
4627 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4628 Opcode == Instruction::InsertElement)) {
4629 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4630 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4631
4632 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4633 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4634 Align VecAlign = DL.getPrefTypeAlign(Val);
4635 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4636
4637 // Extract - store vector to stack, load scalar.
4638 if (Opcode == Instruction::ExtractElement) {
4639 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4640 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4641 CostKind);
4642 }
4643 // Insert - store vector to stack, store scalar, load vector.
4644 if (Opcode == Instruction::InsertElement) {
4645 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4646 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4647 CostKind) +
4648 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4649 }
4650 }
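// E.g. a variable-index extract from a v4f32 is costed as one 16-byte vector
// store plus one 4-byte scalar reload through the stack slot; on most
// subtargets both memory ops are unit cost, so the total comes to about 2.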
4651
4652 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4653 Opcode == Instruction::InsertElement)) {
4654     // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4655 if (Opcode == Instruction::ExtractElement &&
4656 ScalarType->getScalarSizeInBits() == 1 &&
4657 cast<FixedVectorType>(Val)->getNumElements() > 1)
4658 return 1;
4659
4660 // Legalize the type.
4661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4662
4663 // This type is legalized to a scalar type.
4664 if (!LT.second.isVector())
4665 return 0;
4666
4667 // The type may be split. Normalize the index to the new type.
4668 unsigned SizeInBits = LT.second.getSizeInBits();
4669 unsigned NumElts = LT.second.getVectorNumElements();
4670 unsigned SubNumElts = NumElts;
4671 Index = Index % NumElts;
4672
4673 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4674 // For inserts, we also need to insert the subvector back.
4675 if (SizeInBits > 128) {
4676 assert((SizeInBits % 128) == 0 && "Illegal vector");
4677 unsigned NumSubVecs = SizeInBits / 128;
4678 SubNumElts = NumElts / NumSubVecs;
4679 if (SubNumElts <= Index) {
4680 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4681 Index %= SubNumElts;
4682 }
4683 }
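// E.g. for v8i32 on AVX (two 128-bit subvectors, SubNumElts == 4), accessing
// element 5 pays one register-file move (the vextractf128) and is then costed
// as element 1 of a 128-bit half; an insert pays two moves because the
// updated half must also be re-inserted.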
4684
4685 MVT MScalarTy = LT.second.getScalarType();
4686 auto IsCheapPInsrPExtrInsertPS = [&]() {
4687 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4688 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4689 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4690 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4691 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4692 Opcode == Instruction::InsertElement);
4693 };
4694
4695 if (Index == 0) {
4696 // Floating point scalars are already located in index #0.
4697 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4698 // true for all.
4699 if (ScalarType->isFloatingPointTy() &&
4700 (Opcode != Instruction::InsertElement || !Op0 ||
4701 isa<UndefValue>(Op0)))
4702 return RegisterFileMoveCost;
4703
4704 if (Opcode == Instruction::InsertElement &&
4705 isa_and_nonnull<UndefValue>(Op0)) {
4706 // Consider the gather cost to be cheap.
4707 if (isa_and_nonnull<LoadInst>(Op1))
4708 return RegisterFileMoveCost;
4709 if (!IsCheapPInsrPExtrInsertPS()) {
4710 // mov constant-to-GPR + movd/movq GPR -> XMM.
4711 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4712 return 2 + RegisterFileMoveCost;
4713 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4714 return 1 + RegisterFileMoveCost;
4715 }
4716 }
4717
4718 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4719 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4720 return 1 + RegisterFileMoveCost;
4721 }
4722
4723 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4724 assert(ISD && "Unexpected vector opcode");
4725 if (ST->useSLMArithCosts())
4726 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4727 return Entry->Cost + RegisterFileMoveCost;
4728
4729 // Consider cheap cases.
4730 if (IsCheapPInsrPExtrInsertPS())
4731 return 1 + RegisterFileMoveCost;
4732
4733 // For extractions we just need to shuffle the element to index 0, which
4734 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4735   // the element to its destination. In both cases we must handle the
4736 // subvector move(s).
4737 // If the vector type is already less than 128-bits then don't reduce it.
4738 // TODO: Under what circumstances should we shuffle using the full width?
4739 InstructionCost ShuffleCost = 1;
4740 if (Opcode == Instruction::InsertElement) {
4741 auto *SubTy = cast<VectorType>(Val);
4742 EVT VT = TLI->getValueType(DL, Val);
4743 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4744 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4745 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt,
4746 CostKind, 0, SubTy);
4747 }
4748 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4749 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4750 }
4751
4752 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4753 RegisterFileMoveCost;
4754}
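// Editor's worked example (a sketch, not part of the source): on an AVX2
// target, extractelement from <8 x i32> at index 5 legalizes to a 256-bit
// v8i32. The index lands in the upper 128-bit subvector, so the code above
// charges RegisterFileMoveCost = 1 (vextracti128) and rebases Index to 1;
// i32 with SSE4.1 then takes the cheap pextrd path, for a total cost of 2.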
4755
4756 InstructionCost
4757 X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
4758                                      bool Insert, bool Extract,
4759                                      TTI::TargetCostKind CostKind) {
4760 assert(DemandedElts.getBitWidth() ==
4761 cast<FixedVectorType>(Ty)->getNumElements() &&
4762 "Vector size mismatch");
4763
4764 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4765 MVT MScalarTy = LT.second.getScalarType();
4766 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4767   InstructionCost Cost = 0;
4768 
4769 constexpr unsigned LaneBitWidth = 128;
4770 assert((LegalVectorBitWidth < LaneBitWidth ||
4771 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4772 "Illegal vector");
4773
4774 const int NumLegalVectors = *LT.first.getValue();
4775 assert(NumLegalVectors >= 0 && "Negative cost!");
4776
4777   // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4778 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
4779 if (Insert) {
4780 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4781 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4782 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4783       // For types we can insert directly, insertion into 128-bit subvectors is
4784 // cheap, followed by a cheap chain of concatenations.
4785 if (LegalVectorBitWidth <= LaneBitWidth) {
4786 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
4787 /*Extract*/ false, CostKind);
4788 } else {
4789 // In each 128-lane, if at least one index is demanded but not all
4790 // indices are demanded and this 128-lane is not the first 128-lane of
4791         // the legalized-vector, then this 128-lane needs an extracti128; if in
4792         // each 128-lane, there is at least one demanded index, this 128-lane
4793         // needs an inserti128.
4794
4795 // The following cases will help you build a better understanding:
4796 // Assume we insert several elements into a v8i32 vector in avx2,
4797         // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
4798 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
4799 // inserti128.
4800         // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
4801 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
4802 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4803 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4804 unsigned NumLegalElts =
4805 LT.second.getVectorNumElements() * NumLegalVectors;
4806 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4807 "Vector has been legalized to smaller element count");
4808 assert((NumLegalElts % NumLanesTotal) == 0 &&
4809 "Unexpected elts per lane");
4810 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4811
4812 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4813 auto *LaneTy =
4814 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4815
4816 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4817 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4818 NumEltsPerLane, NumEltsPerLane * I);
4819 if (LaneEltMask.isZero())
4820 continue;
4821 // FIXME: we don't need to extract if all non-demanded elements
4822 // are legalization-inserted padding.
4823 if (!LaneEltMask.isAllOnes())
4824 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4825 CostKind, I * NumEltsPerLane, LaneTy);
4826 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
4827 /*Extract*/ false, CostKind);
4828 }
4829
4830 APInt AffectedLanes =
4831 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
4832 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
4833 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
4834 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
4835 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
4836 unsigned I = NumLegalLanes * LegalVec + Lane;
4837 // No need to insert unaffected lane; or lane 0 of each legal vector
4838 // iff ALL lanes of that vector were affected and will be inserted.
4839 if (!AffectedLanes[I] ||
4840 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
4841 continue;
4842 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt,
4843 CostKind, I * NumEltsPerLane, LaneTy);
4844 }
4845 }
4846 }
4847 } else if (LT.second.isVector()) {
4848 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
4849 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
4850 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
4851 // considered cheap.
4852 if (Ty->isIntOrIntVectorTy())
4853 Cost += DemandedElts.popcount();
4854
4855 // Get the smaller of the legalized or original pow2-extended number of
4856 // vector elements, which represents the number of unpacks we'll end up
4857 // performing.
4858 unsigned NumElts = LT.second.getVectorNumElements();
4859 unsigned Pow2Elts =
4860 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
4861 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
4862 }
4863 }
4864
4865 if (Extract) {
4866 // vXi1 can be efficiently extracted with MOVMSK.
4867 // TODO: AVX512 predicate mask handling.
4868 // NOTE: This doesn't work well for roundtrip scalarization.
4869 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
4870 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
4871 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
4872 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
4873 return MOVMSKCost;
4874 }
4875
4876 if (LT.second.isVector()) {
4877 unsigned NumLegalElts =
4878 LT.second.getVectorNumElements() * NumLegalVectors;
4879 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
4880 "Vector has been legalized to smaller element count");
4881
4882 // If we're extracting elements from a 128-bit subvector lane,
4883 // we only need to extract each lane once, not for every element.
4884 if (LegalVectorBitWidth > LaneBitWidth) {
4885 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
4886 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
4887 assert((NumLegalElts % NumLanesTotal) == 0 &&
4888 "Unexpected elts per lane");
4889 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
4890
4891 // Add cost for each demanded 128-bit subvector extraction.
4892 // Luckily this is a lot easier than for insertion.
4893 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
4894 auto *LaneTy =
4895 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
4896
4897 for (unsigned I = 0; I != NumLanesTotal; ++I) {
4898 APInt LaneEltMask = WidenedDemandedElts.extractBits(
4899 NumEltsPerLane, I * NumEltsPerLane);
4900 if (LaneEltMask.isZero())
4901 continue;
4902 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
4903 CostKind, I * NumEltsPerLane, LaneTy);
4904           Cost += BaseT::getScalarizationOverhead(
4905               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
4906 }
4907
4908 return Cost;
4909 }
4910 }
4911
4912 // Fallback to default extraction.
4913 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
4914 Extract, CostKind);
4915 }
4916
4917 return Cost;
4918}
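// Editorial trace of Case#3 above (illustrative, not from the source):
// DemandedElts = 0b11110000 for v8i32 on AVX2 touches only the upper 128-bit
// lane, so the insertion loop charges four element inserts for that lane and
// the subvector pass adds one inserti128, matching the
// "4*vpinsrd + inserti128" estimate.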
4919
4920 InstructionCost
4921 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
4922                                       int VF, const APInt &DemandedDstElts,
4923                                       TTI::TargetCostKind CostKind) {
4924 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
4925 // We don't differentiate element types here, only element bit width.
4926 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
4927
4928 auto bailout = [&]() {
4929 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
4930 DemandedDstElts, CostKind);
4931 };
4932
4933 // For now, only deal with AVX512 cases.
4934 if (!ST->hasAVX512())
4935 return bailout();
4936
4937 // Do we have a native shuffle for this element type, or should we promote?
4938 unsigned PromEltTyBits = EltTyBits;
4939 switch (EltTyBits) {
4940 case 32:
4941 case 64:
4942 break; // AVX512F.
4943 case 16:
4944 if (!ST->hasBWI())
4945 PromEltTyBits = 32; // promote to i32, AVX512F.
4946 break; // AVX512BW
4947 case 8:
4948 if (!ST->hasVBMI())
4949 PromEltTyBits = 32; // promote to i32, AVX512F.
4950 break; // AVX512VBMI
4951 case 1:
4952 // There is no support for shuffling i1 elements. We *must* promote.
4953 if (ST->hasBWI()) {
4954 if (ST->hasVBMI())
4955 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
4956 else
4957 PromEltTyBits = 16; // promote to i16, AVX512BW.
4958 break;
4959 }
4960 PromEltTyBits = 32; // promote to i32, AVX512F.
4961 break;
4962 default:
4963 return bailout();
4964 }
4965 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
4966
4967 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
4968 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
4969
4970 int NumDstElements = VF * ReplicationFactor;
4971 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
4972 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
4973
4974 // Legalize the types.
4975 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
4976 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
4977 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
4978 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
4979 // They should have legalized into vector types.
4980 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
4981 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
4982 return bailout();
4983
4984 if (PromEltTyBits != EltTyBits) {
4985 // If we have to perform the shuffle with wider elt type than our data type,
4986 // then we will first need to anyext (we don't care about the new bits)
4987 // the source elements, and then truncate Dst elements.
4988 InstructionCost PromotionCost;
4989 PromotionCost += getCastInstrCost(
4990         Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4991         TargetTransformInfo::CastContextHint::None, CostKind);
4992 PromotionCost +=
4993 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4994                          /*Src=*/PromDstVecTy,
4995                          TargetTransformInfo::CastContextHint::None, CostKind);
4996 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4997 ReplicationFactor, VF,
4998 DemandedDstElts, CostKind);
4999 }
5000
5001 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5002 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5003 "We expect that the legalization doesn't affect the element width, "
5004 "doesn't coalesce/split elements.");
5005
5006 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5007 unsigned NumDstVectors =
5008 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5009
5010 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5011
5012 // Not all the produced Dst elements may be demanded. In our case,
5013 // given that a single Dst vector is formed by a single shuffle,
5014   // if none of the elements that would form a single Dst vector are demanded,
5015 // then we won't need to do that shuffle, so adjust the cost accordingly.
5016 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5017 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5018 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5019
5020 InstructionCost SingleShuffleCost = getShuffleCost(
5021 TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
5022 /*Index=*/0, /*SubTp=*/nullptr);
5023 return NumDstVectorsDemanded * SingleShuffleCost;
5024}
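// Hypothetical query (editor's sketch): replicating <8 x i1> with
// ReplicationFactor = 2 on an AVX512BW (no VBMI) target promotes to i16, so
// the returned cost is one v16i16 single-source shuffle plus the sext/trunc
// promotion casts charged in the PromEltTyBits != EltTyBits branch above.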
5025
5026 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5027                                             MaybeAlign Alignment,
5028                                             unsigned AddressSpace,
5029                                             TTI::TargetCostKind CostKind,
5030                                             TTI::OperandValueInfo OpInfo,
5031 const Instruction *I) {
5032 // TODO: Handle other cost kinds.
5033   if (CostKind != TTI::TCK_RecipThroughput) {
5034     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5035 // Store instruction with index and scale costs 2 Uops.
5036 // Check the preceding GEP to identify non-const indices.
5037 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5038 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5039 return TTI::TCC_Basic * 2;
5040 }
5041 }
5042 return TTI::TCC_Basic;
5043 }
5044
5045 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5046 "Invalid Opcode");
5047 // Type legalization can't handle structs
5048 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5049 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5050 CostKind);
5051
5052 // Legalize the type.
5053 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5054
5055 auto *VTy = dyn_cast<FixedVectorType>(Src);
5056 
5057   InstructionCost Cost = 0;
5058 
5059 // Add a cost for constant load to vector.
5060 if (Opcode == Instruction::Store && OpInfo.isConstant())
5061 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5062 /*AddressSpace=*/0, CostKind);
5063
5064 // Handle the simple case of non-vectors.
5065   // NOTE: this assumes that legalization never creates a vector from scalars!
5066 if (!VTy || !LT.second.isVector()) {
5067 // Each load/store unit costs 1.
5068 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5069 }
5070
5071 bool IsLoad = Opcode == Instruction::Load;
5072
5073 Type *EltTy = VTy->getElementType();
5074
5075 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5076
5077 // Source of truth: how many elements were there in the original IR vector?
5078 const unsigned SrcNumElt = VTy->getNumElements();
5079
5080 // How far have we gotten?
5081 int NumEltRemaining = SrcNumElt;
5082 // Note that we intentionally capture by-reference, NumEltRemaining changes.
5083 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5084
5085 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5086
5087 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5088 const unsigned XMMBits = 128;
5089 if (XMMBits % EltTyBits != 0)
5090 // Vector size must be a multiple of the element size. I.e. no padding.
5091 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5092 CostKind);
5093 const int NumEltPerXMM = XMMBits / EltTyBits;
5094
5095 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5096
5097 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5098 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5099 // How many elements would a single op deal with at once?
5100 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5101 // Vector size must be a multiple of the element size. I.e. no padding.
5102 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5103 CostKind);
5104 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5105
5106 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5107 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5108 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5109 "Unless we haven't halved the op size yet, "
5110 "we have less than two op's sized units of work left.");
5111
5112 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5113 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5114 : XMMVecTy;
5115
5116 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5117 "After halving sizes, the vector elt count is no longer a multiple "
5118 "of number of elements per operation?");
5119 auto *CoalescedVecTy =
5120 CurrNumEltPerOp == 1
5121 ? CurrVecTy
5122                 : FixedVectorType::get(
5123                       IntegerType::get(Src->getContext(),
5124 EltTyBits * CurrNumEltPerOp),
5125 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5126 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5127 DL.getTypeSizeInBits(CurrVecTy) &&
5128            "coalescing elements doesn't change vector width.");
5129
5130 while (NumEltRemaining > 0) {
5131     assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5132
5133 // Can we use this vector size, as per the remaining element count?
5134 // Iff the vector is naturally aligned, we can do a wide load regardless.
5135 if (NumEltRemaining < CurrNumEltPerOp &&
5136 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
5137 CurrOpSizeBytes != 1)
5138         break; // Try smaller vector size.
5139
5140 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5141
5142 // If we have fully processed the previous reg, we need to replenish it.
5143 if (SubVecEltsLeft == 0) {
5144 SubVecEltsLeft += CurrVecTy->getNumElements();
5145 // And that's free only for the 0'th subvector of a legalized vector.
5146 if (!Is0thSubVec)
5147           Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5148                                         : TTI::ShuffleKind::SK_ExtractSubvector,
5149                                  VTy, std::nullopt, CostKind, NumEltDone(),
5150 CurrVecTy);
5151 }
5152
5153 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5154 // for smaller widths (32/16/8) we have to insert/extract them separately.
5155 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5156 // but let's pretend that it is also true for 16/8 bit wide ops...)
5157 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5158 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5159 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5160 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5161 APInt DemandedElts =
5162 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5163 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5164 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5165 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5166 !IsLoad, CostKind);
5167 }
5168
5169 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5170 // as a proxy for a double-pumped AVX memory interface such as on
5171 // Sandybridge.
5172 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5173 // will be scalarized.
5174 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5175 Cost += 2;
5176 else if (CurrOpSizeBytes < 4)
5177 Cost += 2;
5178 else
5179 Cost += 1;
5180
5181 SubVecEltsLeft -= CurrNumEltPerOp;
5182 NumEltRemaining -= CurrNumEltPerOp;
5183 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
5184 }
5185 }
5186
5187 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5188
5189 return Cost;
5190}
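// Rough worked example (editor's note, not from the source): an align-4 load
// of <3 x i32> legalizes to v4i32 but is too short and too misaligned for a
// single 16-byte op, so the loop above emits an 8-byte op (movq-style,
// cost 1), then a 4-byte op for the tail element (cost 1) plus a
// scalarization insert into the partially filled XMM register.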
5191
5192 InstructionCost
5193 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5194                                   unsigned AddressSpace,
5195                                   TTI::TargetCostKind CostKind) {
5196 bool IsLoad = (Instruction::Load == Opcode);
5197 bool IsStore = (Instruction::Store == Opcode);
5198
5199 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5200 if (!SrcVTy)
5201     // To calculate the scalar cost, take the regular cost without the mask.
5202 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5203
5204 unsigned NumElem = SrcVTy->getNumElements();
5205 auto *MaskTy =
5206 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5207 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
5208 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
5209 // Scalarization
5210 APInt DemandedElts = APInt::getAllOnes(NumElem);
5211     InstructionCost MaskSplitCost = getScalarizationOverhead(
5212         MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5213 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5214         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5215         CmpInst::BAD_ICMP_PREDICATE, CostKind);
5216 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5217 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5218     InstructionCost ValueSplitCost = getScalarizationOverhead(
5219         SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5220 InstructionCost MemopCost =
5221 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5222 Alignment, AddressSpace, CostKind);
5223 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5224 }
5225
5226 // Legalize the type.
5227 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5228 auto VT = TLI->getValueType(DL, SrcVTy);
5229   InstructionCost Cost = 0;
5230   MVT Ty = LT.second;
5231 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5232 // APX masked load/store for scalar is cheap.
5233 return Cost + LT.first;
5234
5235 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5236 LT.second.getVectorNumElements() == NumElem)
5237 // Promotion requires extend/truncate for data and a shuffle for mask.
5238 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
5239 CostKind, 0, nullptr) +
5240 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
5241 CostKind, 0, nullptr);
5242
5243 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5244     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5245                                            Ty.getVectorNumElements());
5246     // Expanding requires filling the mask with zeroes.
5247 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
5248 CostKind, 0, MaskTy);
5249 }
5250
5251 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
5252 if (!ST->hasAVX512())
5253 return Cost + LT.first * (IsLoad ? 2 : 8);
5254
5255 // AVX-512 masked load/store is cheaper
5256 return Cost + LT.first;
5257}
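// For instance (an illustrative estimate): a masked store of <8 x float> on
// AVX1 is legal as v8f32, skips the promotion and mask-expansion fixups, and
// falls into the pre-AVX512 case above: cost = LT.first * 8.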
5258
5259 InstructionCost
5260 X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs,
5261                                  const Value *Base,
5262 const TTI::PointersChainInfo &Info,
5263 Type *AccessTy, TTI::TargetCostKind CostKind) {
5264 if (Info.isSameBase() && Info.isKnownStride()) {
5265     // If all the pointers have known stride, all the differences are translated
5266 // into constants. X86 memory addressing allows encoding it into
5267 // displacement. So we just need to take the base GEP cost.
5268 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5269 SmallVector<const Value *> Indices(BaseGEP->indices());
5270 return getGEPCost(BaseGEP->getSourceElementType(),
5271 BaseGEP->getPointerOperand(), Indices, nullptr,
5272 CostKind);
5273 }
5274 return TTI::TCC_Free;
5275 }
5276 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5277}
5278
5279 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
5280                                                       ScalarEvolution *SE,
5281 const SCEV *Ptr) {
5282 // Address computations in vectorized code with non-consecutive addresses will
5283 // likely result in more instructions compared to scalar code where the
5284 // computation can more often be merged into the index mode. The resulting
5285 // extra micro-ops can significantly decrease throughput.
5286 const unsigned NumVectorInstToHideOverhead = 10;
5287
5288 // Cost modeling of Strided Access Computation is hidden by the indexing
5289   // modes of X86 regardless of the stride value. We don't believe that there
5290   // is a difference between constant strided access in general and constant
5291   // strided value which is less than or equal to 64.
5292 // Even in the case of (loop invariant) stride whose value is not known at
5293 // compile time, the address computation will not incur more than one extra
5294 // ADD instruction.
5295 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
5296 // TODO: AVX2 is the current cut-off because we don't have correct
5297 // interleaving costs for prior ISA's.
5298     if (!BaseT::isStridedAccess(Ptr))
5299       return NumVectorInstToHideOverhead;
5300     if (!BaseT::getConstantStrideStep(SE, Ptr))
5301       return 1;
5302 }
5303
5304 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
5305}
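// e.g. (editor's note): with SSE4.2 (no AVX2), a vector access whose pointer
// SCEV is not strided returns NumVectorInstToHideOverhead (10), a stride that
// is not a compile-time constant returns 1, and a constant stride falls
// through to the base implementation.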
5306
5307 InstructionCost
5308 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5309                                        std::optional<FastMathFlags> FMF,
5310                                        TTI::TargetCostKind CostKind) {
5311   if (TTI::requiresOrderedReduction(FMF))
5312     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5313
5314   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5315   // throughput and use that as the cost.
5316
5317 static const CostTblEntry SLMCostTbl[] = {
5318 { ISD::FADD, MVT::v2f64, 3 },
5319 { ISD::ADD, MVT::v2i64, 5 },
5320 };
5321
5322 static const CostTblEntry SSE2CostTbl[] = {
5323 { ISD::FADD, MVT::v2f64, 2 },
5324 { ISD::FADD, MVT::v2f32, 2 },
5325 { ISD::FADD, MVT::v4f32, 4 },
5326 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5327 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5328 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5329 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5330 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5331 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5332 { ISD::ADD, MVT::v2i8, 2 },
5333 { ISD::ADD, MVT::v4i8, 2 },
5334 { ISD::ADD, MVT::v8i8, 2 },
5335 { ISD::ADD, MVT::v16i8, 3 },
5336 };
5337
5338 static const CostTblEntry AVX1CostTbl[] = {
5339 { ISD::FADD, MVT::v4f64, 3 },
5340 { ISD::FADD, MVT::v4f32, 3 },
5341 { ISD::FADD, MVT::v8f32, 4 },
5342 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5343 { ISD::ADD, MVT::v4i64, 3 },
5344 { ISD::ADD, MVT::v8i32, 5 },
5345 { ISD::ADD, MVT::v16i16, 5 },
5346 { ISD::ADD, MVT::v32i8, 4 },
5347 };
5348
5349 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5350 assert(ISD && "Invalid opcode");
5351
5352 // Before legalizing the type, give a chance to look up illegal narrow types
5353 // in the table.
5354 // FIXME: Is there a better way to do this?
5355 EVT VT = TLI->getValueType(DL, ValTy);
5356 if (VT.isSimple()) {
5357 MVT MTy = VT.getSimpleVT();
5358 if (ST->useSLMArithCosts())
5359 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5360 return Entry->Cost;
5361
5362 if (ST->hasAVX())
5363 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5364 return Entry->Cost;
5365
5366 if (ST->hasSSE2())
5367 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5368 return Entry->Cost;
5369 }
5370
5371 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5372
5373 MVT MTy = LT.second;
5374
5375 auto *ValVTy = cast<FixedVectorType>(ValTy);
5376
5377 // Special case: vXi8 mul reductions are performed as vXi16.
5378 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5379 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5380 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5381 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5382                             TargetTransformInfo::CastContextHint::None,
5383                             CostKind) +
5384 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5385 }
5386
5387 InstructionCost ArithmeticCost = 0;
5388 if (LT.first != 1 && MTy.isVector() &&
5389 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5390 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5391 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5392 MTy.getVectorNumElements());
5393 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5394 ArithmeticCost *= LT.first - 1;
5395 }
5396
5397 if (ST->useSLMArithCosts())
5398 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5399 return ArithmeticCost + Entry->Cost;
5400
5401 if (ST->hasAVX())
5402 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5403 return ArithmeticCost + Entry->Cost;
5404
5405 if (ST->hasSSE2())
5406 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5407 return ArithmeticCost + Entry->Cost;
5408
5409 // FIXME: These assume a naive kshift+binop lowering, which is probably
5410 // conservative in most cases.
5411 static const CostTblEntry AVX512BoolReduction[] = {
5412 { ISD::AND, MVT::v2i1, 3 },
5413 { ISD::AND, MVT::v4i1, 5 },
5414 { ISD::AND, MVT::v8i1, 7 },
5415 { ISD::AND, MVT::v16i1, 9 },
5416 { ISD::AND, MVT::v32i1, 11 },
5417 { ISD::AND, MVT::v64i1, 13 },
5418 { ISD::OR, MVT::v2i1, 3 },
5419 { ISD::OR, MVT::v4i1, 5 },
5420 { ISD::OR, MVT::v8i1, 7 },
5421 { ISD::OR, MVT::v16i1, 9 },
5422 { ISD::OR, MVT::v32i1, 11 },
5423 { ISD::OR, MVT::v64i1, 13 },
5424 };
5425
5426 static const CostTblEntry AVX2BoolReduction[] = {
5427 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5428 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5429 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5430 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5431 };
5432
5433 static const CostTblEntry AVX1BoolReduction[] = {
5434 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5435 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5436 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5437 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5438 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5439 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5440 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5441 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5442 };
5443
5444 static const CostTblEntry SSE2BoolReduction[] = {
5445 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5446 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5447 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5448 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5449 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5450 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5451 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5452 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5453 };
5454
5455 // Handle bool allof/anyof patterns.
5456 if (ValVTy->getElementType()->isIntegerTy(1)) {
5457 InstructionCost ArithmeticCost = 0;
5458 if (LT.first != 1 && MTy.isVector() &&
5459 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5460 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5461 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5462 MTy.getVectorNumElements());
5463 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5464 ArithmeticCost *= LT.first - 1;
5465 }
5466
5467 if (ST->hasAVX512())
5468 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5469 return ArithmeticCost + Entry->Cost;
5470 if (ST->hasAVX2())
5471 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5472 return ArithmeticCost + Entry->Cost;
5473 if (ST->hasAVX())
5474 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5475 return ArithmeticCost + Entry->Cost;
5476 if (ST->hasSSE2())
5477 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5478 return ArithmeticCost + Entry->Cost;
5479
5480 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5481 }
5482
5483 unsigned NumVecElts = ValVTy->getNumElements();
5484 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5485
5486 // Special case power of 2 reductions where the scalar type isn't changed
5487 // by type legalization.
5488 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5489 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5490
5491 InstructionCost ReductionCost = 0;
5492
5493 auto *Ty = ValVTy;
5494 if (LT.first != 1 && MTy.isVector() &&
5495 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5496 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5497 Ty = FixedVectorType::get(ValVTy->getElementType(),
5498 MTy.getVectorNumElements());
5499 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5500 ReductionCost *= LT.first - 1;
5501 NumVecElts = MTy.getVectorNumElements();
5502 }
5503
5504 // Now handle reduction with the legal type, taking into account size changes
5505 // at each level.
5506 while (NumVecElts > 1) {
5507 // Determine the size of the remaining vector we need to reduce.
5508 unsigned Size = NumVecElts * ScalarSize;
5509 NumVecElts /= 2;
5510 // If we're reducing from 256/512 bits, use an extract_subvector.
5511 if (Size > 128) {
5512 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5513 ReductionCost +=
5514           getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind,
5515                          NumVecElts, SubTy);
5516 Ty = SubTy;
5517 } else if (Size == 128) {
5518 // Reducing from 128 bits is a permute of v2f64/v2i64.
5519 FixedVectorType *ShufTy;
5520 if (ValVTy->isFloatingPointTy())
5521 ShufTy =
5522 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5523 else
5524 ShufTy =
5525 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5526 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5527 std::nullopt, CostKind, 0, nullptr);
5528 } else if (Size == 64) {
5529 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5530 FixedVectorType *ShufTy;
5531 if (ValVTy->isFloatingPointTy())
5532 ShufTy =
5533 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5534 else
5535 ShufTy =
5536 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5537 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5538 std::nullopt, CostKind, 0, nullptr);
5539 } else {
5540 // Reducing from smaller size is a shift by immediate.
5541 auto *ShiftTy = FixedVectorType::get(
5542 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5543 ReductionCost += getArithmeticInstrCost(
5544 Instruction::LShr, ShiftTy, CostKind,
5545           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5546           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5547     }
5548
5549 // Add the arithmetic op for this level.
5550 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5551 }
5552
5553 // Add the final extract element to the cost.
5554 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5555 CostKind, 0, nullptr, nullptr);
5556}
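// Worked example (editor's sketch): vector.reduce.add on <8 x i32> with AVX
// hits the AVX1CostTbl entry for ADD/v8i32 via the pre-legalization lookup
// and returns 5, before any split cost or the shuffle ladder below runs.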
5557
5558 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5559                                           TTI::TargetCostKind CostKind,
5560                                           FastMathFlags FMF) {
5561 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5562 return getIntrinsicInstrCost(ICA, CostKind);
5563}
5564
5565 InstructionCost
5566 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5567                                    FastMathFlags FMF,
5568                                    TTI::TargetCostKind CostKind) {
5569 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5570
5571 MVT MTy = LT.second;
5572
5573 int ISD;
5574 if (ValTy->isIntOrIntVectorTy()) {
5575 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5576 : ISD::SMIN;
5577 } else {
5578 assert(ValTy->isFPOrFPVectorTy() &&
5579            "Expected floating point or integer vector type.");
5580 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5581 ? ISD::FMINNUM
5582 : ISD::FMINIMUM;
5583 }
5584
5585   // We use the Intel Architecture Code Analyzer (IACA) to measure the
5586   // throughput and use that as the cost.
5587
5588 static const CostTblEntry SSE2CostTbl[] = {
5589 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5590 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5591 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5592 };
5593
5594 static const CostTblEntry SSE41CostTbl[] = {
5595 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5596 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5597 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5598 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5599 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5600 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5601 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5602 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5603 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5604 {ISD::SMIN, MVT::v16i8, 6},
5605 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5606 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5607 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5608 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5609 };
5610
5611 static const CostTblEntry AVX1CostTbl[] = {
5612 {ISD::SMIN, MVT::v16i16, 6},
5613 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5614 {ISD::SMIN, MVT::v32i8, 8},
5615 {ISD::UMIN, MVT::v32i8, 8},
5616 };
5617
5618 static const CostTblEntry AVX512BWCostTbl[] = {
5619 {ISD::SMIN, MVT::v32i16, 8},
5620 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5621 {ISD::SMIN, MVT::v64i8, 10},
5622 {ISD::UMIN, MVT::v64i8, 10},
5623 };
5624
5625 // Before legalizing the type, give a chance to look up illegal narrow types
5626 // in the table.
5627 // FIXME: Is there a better way to do this?
5628 EVT VT = TLI->getValueType(DL, ValTy);
5629 if (VT.isSimple()) {
5630 MVT MTy = VT.getSimpleVT();
5631 if (ST->hasBWI())
5632 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5633 return Entry->Cost;
5634
5635 if (ST->hasAVX())
5636 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5637 return Entry->Cost;
5638
5639 if (ST->hasSSE41())
5640 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5641 return Entry->Cost;
5642
5643 if (ST->hasSSE2())
5644 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5645 return Entry->Cost;
5646 }
5647
5648 auto *ValVTy = cast<FixedVectorType>(ValTy);
5649 unsigned NumVecElts = ValVTy->getNumElements();
5650
5651 auto *Ty = ValVTy;
5652 InstructionCost MinMaxCost = 0;
5653 if (LT.first != 1 && MTy.isVector() &&
5654 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5655     // Type needs to be split. We need LT.first - 1 operations.
5656 Ty = FixedVectorType::get(ValVTy->getElementType(),
5657 MTy.getVectorNumElements());
5658 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5659 MinMaxCost *= LT.first - 1;
5660 NumVecElts = MTy.getVectorNumElements();
5661 }
5662
5663 if (ST->hasBWI())
5664 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5665 return MinMaxCost + Entry->Cost;
5666
5667 if (ST->hasAVX())
5668 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5669 return MinMaxCost + Entry->Cost;
5670
5671 if (ST->hasSSE41())
5672 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5673 return MinMaxCost + Entry->Cost;
5674
5675 if (ST->hasSSE2())
5676 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5677 return MinMaxCost + Entry->Cost;
5678
5679 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5680
5681 // Special case power of 2 reductions where the scalar type isn't changed
5682 // by type legalization.
5683 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5684 ScalarSize != MTy.getScalarSizeInBits())
5685 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5686
5687 // Now handle reduction with the legal type, taking into account size changes
5688 // at each level.
5689 while (NumVecElts > 1) {
5690 // Determine the size of the remaining vector we need to reduce.
5691 unsigned Size = NumVecElts * ScalarSize;
5692 NumVecElts /= 2;
5693 // If we're reducing from 256/512 bits, use an extract_subvector.
5694 if (Size > 128) {
5695 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5696 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
5697 CostKind, NumVecElts, SubTy);
5698 Ty = SubTy;
5699 } else if (Size == 128) {
5700 // Reducing from 128 bits is a permute of v2f64/v2i64.
5701 VectorType *ShufTy;
5702 if (ValTy->isFloatingPointTy())
5703       ShufTy =
5704           FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5705 else
5706 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5707 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5708 std::nullopt, CostKind, 0, nullptr);
5709 } else if (Size == 64) {
5710 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5711 FixedVectorType *ShufTy;
5712 if (ValTy->isFloatingPointTy())
5713 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5714 else
5715 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5716 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy,
5717 std::nullopt, CostKind, 0, nullptr);
5718 } else {
5719 // Reducing from smaller size is a shift by immediate.
5720 auto *ShiftTy = FixedVectorType::get(
5721 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5722 MinMaxCost += getArithmeticInstrCost(
5723 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5724           {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5725           {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5726     }
5727
5728     // Add the min/max op for this level.
5729 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5730 }
5731
5732 // Add the final extract element to the cost.
5733 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5734 CostKind, 0, nullptr, nullptr);
5735}
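// Editorial trace (approximate): vector.reduce.fmin on <8 x float> with AVX
// misses the integer min/max tables and walks the ladder above: one 128-bit
// subvector extract, two in-register shuffles, three v4f32 min operations,
// and a final element extract.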
5736
5737/// Calculate the cost of materializing a 64-bit value. This helper
5738/// method might only calculate a fraction of a larger immediate. Therefore it
5739/// is valid to return a cost of ZERO.
5740 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
5741   if (Val == 0)
5742 return TTI::TCC_Free;
5743
5744 if (isInt<32>(Val))
5745 return TTI::TCC_Basic;
5746
5747 return 2 * TTI::TCC_Basic;
5748}
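// e.g. (editor's note): 0 is free, 42 fits a sign-extended imm32 and costs
// TCC_Basic, and 0x1234567890 needs a movabsq-class materialization, modeled
// as 2 * TCC_Basic.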
5749
5750 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5751                                           TTI::TargetCostKind CostKind) {
5752   assert(Ty->isIntegerTy());
5753
5754 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5755 if (BitSize == 0)
5756 return ~0U;
5757
5758   // Never hoist constants larger than 128 bits, because this might lead to
5759 // incorrect code generation or assertions in codegen.
5760   // FIXME: Create a cost model for types larger than i128 once the codegen
5761 // issues have been fixed.
5762 if (BitSize > 128)
5763 return TTI::TCC_Free;
5764
5765 if (Imm == 0)
5766 return TTI::TCC_Free;
5767
5768 // Sign-extend all constants to a multiple of 64-bit.
5769 APInt ImmVal = Imm;
5770 if (BitSize % 64 != 0)
5771 ImmVal = Imm.sext(alignTo(BitSize, 64));
5772
5773 // Split the constant into 64-bit chunks and calculate the cost for each
5774 // chunk.
5775   InstructionCost Cost = 0;
5776   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5777 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5778 int64_t Val = Tmp.getSExtValue();
5779 Cost += getIntImmCost(Val);
5780 }
5781 // We need at least one instruction to materialize the constant.
5782 return std::max<InstructionCost>(1, Cost);
5783}
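// Editorial example: an i128 immediate is costed as two 64-bit chunks; for
// (i128)1 << 70 the low chunk is 0 (free) and the high chunk is 64
// (TCC_Basic), so the clamped total is 1.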
5784
5785 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
5786                                               const APInt &Imm, Type *Ty,
5787                                               TTI::TargetCostKind CostKind,
5788                                               Instruction *Inst) {
5789 assert(Ty->isIntegerTy());
5790
5791 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5792 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5793 // here, so that constant hoisting will ignore this constant.
5794 if (BitSize == 0)
5795 return TTI::TCC_Free;
5796
5797 unsigned ImmIdx = ~0U;
5798 switch (Opcode) {
5799 default:
5800 return TTI::TCC_Free;
5801 case Instruction::GetElementPtr:
5802 // Always hoist the base address of a GetElementPtr. This prevents the
5803 // creation of new constants for every base constant that gets constant
5804 // folded with the offset.
5805 if (Idx == 0)
5806 return 2 * TTI::TCC_Basic;
5807 return TTI::TCC_Free;
5808 case Instruction::Store:
5809 ImmIdx = 0;
5810 break;
5811 case Instruction::ICmp:
5812 // This is an imperfect hack to prevent constant hoisting of
5813 // compares that might be trying to check if a 64-bit value fits in
5814 // 32-bits. The backend can optimize these cases using a right shift by 32.
5815     // Ideally we would check the compare predicate here. There are also other
5816 // similar immediates the backend can use shifts for.
5817 if (Idx == 1 && Imm.getBitWidth() == 64) {
5818 uint64_t ImmVal = Imm.getZExtValue();
5819 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
5820 return TTI::TCC_Free;
5821 }
5822 ImmIdx = 1;
5823 break;
5824 case Instruction::And:
5825 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
5826 // by using a 32-bit operation with implicit zero extension. Detect such
5827 // immediates here as the normal path expects bit 31 to be sign extended.
5828 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32))
5829 return TTI::TCC_Free;
5830 ImmIdx = 1;
5831 break;
5832 case Instruction::Add:
5833 case Instruction::Sub:
5834 // For add/sub, we can use the opposite instruction for INT32_MIN.
5835 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
5836 return TTI::TCC_Free;
5837 ImmIdx = 1;
5838 break;
5839 case Instruction::UDiv:
5840 case Instruction::SDiv:
5841 case Instruction::URem:
5842 case Instruction::SRem:
5843 // Division by constant is typically expanded later into a different
5844 // instruction sequence. This completely changes the constants.
5845 // Report them as "free" to stop ConstantHoist from marking them as opaque.
5846 return TTI::TCC_Free;
5847 case Instruction::Mul:
5848 case Instruction::Or:
5849 case Instruction::Xor:
5850 ImmIdx = 1;
5851 break;
5852 // Always return TCC_Free for the shift value of a shift instruction.
5853 case Instruction::Shl:
5854 case Instruction::LShr:
5855 case Instruction::AShr:
5856 if (Idx == 1)
5857 return TTI::TCC_Free;
5858 break;
5859 case Instruction::Trunc:
5860 case Instruction::ZExt:
5861 case Instruction::SExt:
5862 case Instruction::IntToPtr:
5863 case Instruction::PtrToInt:
5864 case Instruction::BitCast:
5865 case Instruction::PHI:
5866 case Instruction::Call:
5867 case Instruction::Select:
5868 case Instruction::Ret:
5869 case Instruction::Load:
5870 break;
5871 }
5872
5873 if (Idx == ImmIdx) {
5874 uint64_t NumConstants = divideCeil(BitSize, 64);
5875     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5876     return (Cost <= NumConstants * TTI::TCC_Basic)
5877 ? static_cast<int>(TTI::TCC_Free)
5878 : Cost;
5879 }
5880
5881 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5882}
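// e.g. (editor's note): the immediate in 'and i64 %x, 4294967295' is reported
// free (a 32-bit AND with implicit zero extension), while a mask like
// 0xffffffff00000000 is charged 2 * TCC_Basic and remains a hoisting
// candidate.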
5883
5884 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
5885                                                 const APInt &Imm, Type *Ty,
5886                                                 TTI::TargetCostKind CostKind) {
5887 assert(Ty->isIntegerTy());
5888
5889 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5890 // There is no cost model for constants with a bit size of 0. Return TCC_Free
5891 // here, so that constant hoisting will ignore this constant.
5892 if (BitSize == 0)
5893 return TTI::TCC_Free;
5894
5895 switch (IID) {
5896 default:
5897 return TTI::TCC_Free;
5898 case Intrinsic::sadd_with_overflow:
5899 case Intrinsic::uadd_with_overflow:
5900 case Intrinsic::ssub_with_overflow:
5901 case Intrinsic::usub_with_overflow:
5902 case Intrinsic::smul_with_overflow:
5903 case Intrinsic::umul_with_overflow:
5904 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
5905 return TTI::TCC_Free;
5906 break;
5907 case Intrinsic::experimental_stackmap:
5908 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5909 return TTI::TCC_Free;
5910 break;
5911 case Intrinsic::experimental_patchpoint_void:
5912 case Intrinsic::experimental_patchpoint:
5913 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
5914 return TTI::TCC_Free;
5915 break;
5916 }
5917 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5918}
5919
5921 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
5922                                            const Instruction *I) {
5923   if (CostKind != TTI::TCK_RecipThroughput)
5924     return Opcode == Instruction::PHI ? 0 : 1;
5925 // Branches are assumed to be predicted.
5926 return 0;
5927}
5928
5929int X86TTIImpl::getGatherOverhead() const {
5930 // Some CPUs have more overhead for gather. The specified overhead is relative
5931 // to the Load operation. "2" is the number provided by Intel architects. This
5932 // parameter is used for cost estimation of Gather Op and comparison with
5933 // other alternatives.
5934   // TODO: Remove the explicit hasAVX512()? That would mean we would only
5935 // enable gather with a -march.
5936 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5937 return 2;
5938
5939 return 1024;
5940}
5941
5942int X86TTIImpl::getScatterOverhead() const {
5943 if (ST->hasAVX512())
5944 return 2;
5945
5946 return 1024;
5947}
5948
5949 // Return an average cost of a Gather / Scatter instruction; may be improved later.
5950InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
5951                                             TTI::TargetCostKind CostKind,
5952                                             Type *SrcVTy, const Value *Ptr,
5953 Align Alignment,
5954 unsigned AddressSpace) {
5955
5956 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5957 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5958
5959 // Try to reduce index size from 64 bit (default for GEP)
5960 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5961 // operation will use 16 x 64 indices which do not fit in a zmm and needs
5962   // to be split. Also check that the base pointer is the same for all lanes,
5963 // and that there's at most one variable index.
5964 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5965 unsigned IndexSize = DL.getPointerSizeInBits();
5966 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5967 if (IndexSize < 64 || !GEP)
5968 return IndexSize;
5969
5970 unsigned NumOfVarIndices = 0;
5971 const Value *Ptrs = GEP->getPointerOperand();
5972 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5973 return IndexSize;
5974 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
5975 if (isa<Constant>(GEP->getOperand(I)))
5976 continue;
5977 Type *IndxTy = GEP->getOperand(I)->getType();
5978 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5979 IndxTy = IndexVTy->getElementType();
5980 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5981 !isa<SExtInst>(GEP->getOperand(I))) ||
5982 ++NumOfVarIndices > 1)
5983 return IndexSize; // 64
5984 }
5985 return (unsigned)32;
5986 };
5987
5988   // Trying to reduce IndexSize to 32 bits for vectors of 16 elements.
5989 // By default the IndexSize is equal to pointer size.
5990 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5991 ? getIndexSizeInBits(Ptr, DL)
5992                            : DL.getPointerSizeInBits();
5993 
5994 auto *IndexVTy = FixedVectorType::get(
5995 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5996 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
5997 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
5998 InstructionCost::CostType SplitFactor =
5999 *std::max(IdxsLT.first, SrcLT.first).getValue();
6000 if (SplitFactor > 1) {
6001 // Handle splitting of vector of pointers
6002 auto *SplitSrcTy =
6003 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6004 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6005 Alignment, AddressSpace);
6006 }
6007
6008 // If we didn't split, this will be a single gather/scatter instruction.
6009   if (CostKind == TTI::TCK_CodeSize)
6010     return 1;
6011
6012 // The gather / scatter cost is given by Intel architects. It is a rough
6013   // number since we are looking at one instruction at a time.
6014 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6015 : getScatterOverhead();
6016 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6017 MaybeAlign(Alignment), AddressSpace,
6018 CostKind);
6019}
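// Rough estimate (editor's sketch): a v16f32 gather on AVX512 keeps 32-bit
// indices, so nothing splits and the returned cost is the gather overhead of
// 2 plus 16 times the scalar f32 load cost.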
6020
6021/// Calculate the cost of Gather / Scatter operation
6022 InstructionCost X86TTIImpl::getGatherScatterOpCost(
6023     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6024     Align Alignment, TTI::TargetCostKind CostKind,
6025 const Instruction *I = nullptr) {
6026 if ((Opcode == Instruction::Load &&
6027 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6028 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6029 Align(Alignment)))) ||
6030 (Opcode == Instruction::Store &&
6031 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6032 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6033 Align(Alignment)))))
6034 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6035 Alignment, CostKind, I);
6036
6037 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6038 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6039 if (!PtrTy && Ptr->getType()->isVectorTy())
6040 PtrTy = dyn_cast<PointerType>(
6041 cast<VectorType>(Ptr->getType())->getElementType());
6042 assert(PtrTy && "Unexpected type for Ptr argument");
6043 unsigned AddressSpace = PtrTy->getAddressSpace();
6044 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6045 AddressSpace);
6046}
6047
6048 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6049                                const TargetTransformInfo::LSRCost &C2) {
6050   // X86-specific here: instruction count has first priority.
6051 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
6052 C1.NumIVMuls, C1.NumBaseAdds,
6053 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6054 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
6055 C2.NumIVMuls, C2.NumBaseAdds,
6056 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6057}
6058
6059 bool X86TTIImpl::canMacroFuseCmp() {
6060   return ST->hasMacroFusion() || ST->hasBranchFusion();
6061}
6062
6063bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
6064 Type *ScalarTy = DataTy->getScalarType();
6065
6066 // The backend can't handle a single element vector w/o CFCMOV.
6067 if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6068 return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy);
6069
6070 if (!ST->hasAVX())
6071 return false;
6072
6073 if (ScalarTy->isPointerTy())
6074 return true;
6075
6076 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6077 return true;
6078
6079 if (ScalarTy->isHalfTy() && ST->hasBWI())
6080 return true;
6081
6082 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6083 return true;
6084
6085 if (!ScalarTy->isIntegerTy())
6086 return false;
6087
6088 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6089 return IntWidth == 32 || IntWidth == 64 ||
6090 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6091}
6092
6093bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
6094 return isLegalMaskedLoad(DataType, Alignment);
6095}
6096
6097bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
6098 unsigned DataSize = DL.getTypeStoreSize(DataType);
6099 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6100 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6101 // (the equivalent stores only require AVX).
6102 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6103 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6104
6105 return false;
6106}
6107
6108bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
6109 unsigned DataSize = DL.getTypeStoreSize(DataType);
6110
6111 // SSE4A supports nontemporal stores of float and double at arbitrary
6112 // alignment.
6113 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6114 return true;
6115
6116 // Besides the SSE4A subtarget exception above, only aligned stores are
6117   // available nontemporally on any other subtarget. And only stores with a size
6118 // of 4..32 bytes (powers of 2, only) are permitted.
6119 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6120 !isPowerOf2_32(DataSize))
6121 return false;
6122
6123 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6124 // loads require AVX2).
6125 if (DataSize == 32)
6126 return ST->hasAVX();
6127 if (DataSize == 16)
6128 return ST->hasSSE1();
6129 return true;
6130}
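// e.g. (editor's note): a 32-byte <8 x float> nontemporal store requires AVX
// (vmovntps) plus 32-byte alignment, while SSE4A additionally permits
// unaligned scalar float/double nontemporal stores (movntss/movntsd).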
6131
6132 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6133                                       ElementCount NumElements) const {
6134 // movddup
6135 return ST->hasSSE3() && !NumElements.isScalable() &&
6136 NumElements.getFixedValue() == 2 &&
6137 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6138}
6139
6140 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
6141   if (!isa<VectorType>(DataTy))
6142 return false;
6143
6144 if (!ST->hasAVX512())
6145 return false;
6146
6147 // The backend can't handle a single element vector.
6148 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6149 return false;
6150
6151 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6152
6153 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6154 return true;
6155
6156 if (!ScalarTy->isIntegerTy())
6157 return false;
6158
6159 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6160 return IntWidth == 32 || IntWidth == 64 ||
6161 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6162}
6163
6164 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
6165   return isLegalMaskedExpandLoad(DataTy, Alignment);
6166}
6167
6168bool X86TTIImpl::supportsGather() const {
6169 // Some CPUs have better gather performance than others.
6170   // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6171 // enable gather with a -march.
6172 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6173}
6174
6175 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
6176   // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6177   // A vector-4 gather/scatter instruction does not exist on KNL. We can extend
6178 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6179 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6180 // Check, maybe the gather/scatter instruction is better in the VariableMask
6181 // case.
6182 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6183 return NumElts == 1 ||
6184 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6185}
6186
6187 bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) {
6188   Type *ScalarTy = DataTy->getScalarType();
6189 if (ScalarTy->isPointerTy())
6190 return true;
6191
6192 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6193 return true;
6194
6195 if (!ScalarTy->isIntegerTy())
6196 return false;
6197
6198 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6199 return IntWidth == 32 || IntWidth == 64;
6200}
6201
6202 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
6203   if (!supportsGather() || !ST->preferGather())
6204 return false;
6205 return isLegalMaskedGatherScatter(DataTy, Alignment);
6206}
6207
6208bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6209 unsigned Opcode1,
6210 const SmallBitVector &OpcodeMask) const {
6211 // ADDSUBPS 4xf32 SSE3
6212 // VADDSUBPS 4xf32 AVX
6213 // VADDSUBPS 8xf32 AVX2
6214 // ADDSUBPD 2xf64 SSE3
6215 // VADDSUBPD 2xf64 AVX
6216 // VADDSUBPD 4xf64 AVX2
6217
6218 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6219 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6220 if (!isPowerOf2_32(NumElements))
6221 return false;
6222 // Check the opcode pattern. We apply the mask on the opcode arguments and
6223 // then check if it is what we expect.
6224 for (int Lane : seq<int>(0, NumElements)) {
6225 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6226 // We expect FSub for even lanes and FAdd for odd lanes.
6227 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6228 return false;
6229 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6230 return false;
6231 }
6232 // Now check that the pattern is supported by the target ISA.
6233 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6234 if (ElemTy->isFloatTy())
6235 return ST->hasSSE3() && NumElements % 4 == 0;
6236 if (ElemTy->isDoubleTy())
6237 return ST->hasSSE3() && NumElements % 2 == 0;
6238 return false;
6239}
6240
6241bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
6242 // AVX2 doesn't support scatter
6243 if (!ST->hasAVX512() || !ST->preferScatter())
6244 return false;
6245 return isLegalMaskedGatherScatter(DataType, Alignment);
6246}
6247
6248bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
6249 EVT VT = TLI->getValueType(DL, DataType);
6250 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6251}
6252
6253 bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) {
6254 // FDIV is always expensive, even if it has a very low uop count.
6255 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6256 if (I->getOpcode() == Instruction::FDiv)
6257 return true;
6258
6260}
6261
6262 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
6263 return false;
6264}
6265
6266 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6267 const Function *Callee) const {
6268 const TargetMachine &TM = getTLI()->getTargetMachine();
6269
6270 // Model this as a subset check on the subtarget features.
6271 const FeatureBitset &CallerBits =
6272 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6273 const FeatureBitset &CalleeBits =
6274 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6275
6276 // Check whether features are the same (apart from the ignore list).
6277 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6278 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6279 if (RealCallerBits == RealCalleeBits)
6280 return true;
6281
6282 // If the features are a subset, we need to additionally check for calls
6283 // that may become ABI-incompatible as a result of inlining.
6284 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6285 return false;
6286
6287 for (const Instruction &I : instructions(Callee)) {
6288 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6289 // Having more target features is fine for inline ASM.
6290 if (CB->isInlineAsm())
6291 continue;
6292
6293 SmallVector<Type *, 8> Types;
6294 for (Value *Arg : CB->args())
6295 Types.push_back(Arg->getType());
6296 if (!CB->getType()->isVoidTy())
6297 Types.push_back(CB->getType());
6298
6299 // Simple types are always ABI compatible.
6300 auto IsSimpleTy = [](Type *Ty) {
6301 return !Ty->isVectorTy() && !Ty->isAggregateType();
6302 };
6303 if (all_of(Types, IsSimpleTy))
6304 continue;
6305
6306 if (Function *NestedCallee = CB->getCalledFunction()) {
6307 // Assume that intrinsics are always ABI compatible.
6308 if (NestedCallee->isIntrinsic())
6309 continue;
6310
6311 // Do a precise compatibility check.
6312 if (!areTypesABICompatible(Caller, NestedCallee, Types))
6313 return false;
6314 } else {
6315 // We don't know the target features of the callee, so
6316 // conservatively assume it is incompatible.
6317 return false;
6318 }
6319 }
6320 }
6321 return true;
6322}
6323
6324 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6325 const Function *Callee,
6326 const ArrayRef<Type *> &Types) const {
6327 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6328 return false;
6329
6330 // If we get here, we know the target features match. If one function
6331 // considers 512-bit vectors legal and the other does not, consider them
6332 // incompatible.
6333 const TargetMachine &TM = getTLI()->getTargetMachine();
6334
6335 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6336 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6337 return true;
6338
6339 // Consider the arguments compatible if they aren't vectors or aggregates.
6340 // FIXME: Look at the size of vectors.
6341 // FIXME: Look at the element types of aggregates to see if there are vectors.
6342 return llvm::none_of(Types,
6343 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6344}
6345
6346 TTI::MemCmpExpansionOptions
6347 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6348 TTI::MemCmpExpansionOptions Options;
6349 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6350 Options.NumLoadsPerBlock = 2;
6351 // All GPR and vector loads can be unaligned.
6352 Options.AllowOverlappingLoads = true;
6353 if (IsZeroCmp) {
6354 // Only enable vector loads for equality comparison. Right now the vector
6355 // version is not as fast for three-way compare (see #33329).
6356 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6357 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6358 Options.LoadSizes.push_back(64);
6359 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6360 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6361 }
6362 if (ST->is64Bit()) {
6363 Options.LoadSizes.push_back(8);
6364 }
6365 Options.LoadSizes.push_back(4);
6366 Options.LoadSizes.push_back(2);
6367 Options.LoadSizes.push_back(1);
6368 return Options;
6369}
6370
6371 bool X86TTIImpl::prefersVectorizedAddressing() const {
6372 return supportsGather();
6373}
6374
6375 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6376 return false;
6377}
6378
6379 bool X86TTIImpl::enableInterleavedAccessVectorization() {
6380 // TODO: We expect this to be beneficial regardless of arch,
6381 // but there are currently some unexplained performance artifacts on Atom.
6382 // As a temporary solution, disable on Atom.
6383 return !(ST->isAtom());
6384}
6385
6386 // Get a cost estimate for interleaved load/store operations and strided loads.
6387 // \p Indices contains the indices for a strided load.
6388 // \p Factor - the factor of interleaving.
6389 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6390 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6391 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6392 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6393 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
6394 // VecTy for interleave memop is <VF*Factor x Elt>.
6395 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6396 // VecTy = <12 x i32>.
6397
6398 // Calculate the number of memory operations (NumOfMemOps), required
6399 // for load/store the VecTy.
6400 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6401 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6402 unsigned LegalVTSize = LegalVT.getStoreSize();
6403 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
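// For illustration: VecTy = <12 x i32> (VF = 4, Factor = 3) legalizes to
// v16i32 when 512-bit vectors are enabled, so VecTySize = 48 bytes,
// LegalVTSize = 64 bytes and NumOfMemOps = 1; at 256 bits the legal type is
// v8i32 and NumOfMemOps = ceil(48 / 32) = 2.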
6404
6405 // Get the cost of one memory operation.
6406 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6407 LegalVT.getVectorNumElements());
6408 InstructionCost MemOpCost;
6409 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6410 if (UseMaskedMemOp)
6411 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6412 AddressSpace, CostKind);
6413 else
6414 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
6415 AddressSpace, CostKind);
6416 
6417 unsigned VF = VecTy->getNumElements() / Factor;
6418 MVT VT =
6419 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6420 
6421 InstructionCost MaskCost;
6422 if (UseMaskedMemOp) {
6423 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6424 for (unsigned Index : Indices) {
6425 assert(Index < Factor && "Invalid index for interleaved memory op");
6426 for (unsigned Elm = 0; Elm < VF; Elm++)
6427 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6428 }
6429
6430 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6431
6432 MaskCost = getReplicationShuffleCost(
6433 I1Type, Factor, VF,
6434 UseMaskForGaps ? DemandedLoadStoreElts
6435 : APInt::getAllOnes(VecTy->getNumElements()),
6436 CostKind);
6437
6438 // The Gaps mask is invariant and created outside the loop, therefore the
6439 // cost of creating it is not accounted for here. However if we have both
6440 // a MaskForGaps and some other mask that guards the execution of the
6441 // memory access, we need to account for the cost of And-ing the two masks
6442 // inside the loop.
6443 if (UseMaskForGaps) {
6444 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6445 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6446 }
6447 }
6448
6449 if (Opcode == Instruction::Load) {
6450 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6451 // contain the cost of the optimized shuffle sequence that the
6452 // X86InterleavedAccess pass will generate.
6453 // The cost of loads and stores are computed separately from the table.
6454
6455 // X86InterleavedAccess supports only the following interleaved-access groups.
6456 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6457 {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6458 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6459 {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6460 };
6461
6462 if (const auto *Entry =
6463 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6464 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6465 // If an entry does not exist, fall back to the default implementation.
6466
6467 // The kind of shuffle depends on the number of loaded values.
6468 // If we load the entire data in one register, we can use a 1-src shuffle.
6469 // Otherwise, we'll merge 2 sources in each operation.
6470 TTI::ShuffleKind ShuffleKind =
6471 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6472
6473 InstructionCost ShuffleCost = getShuffleCost(
6474 ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6475
6476 unsigned NumOfLoadsInInterleaveGrp =
6477 Indices.size() ? Indices.size() : Factor;
6478 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6479 VecTy->getNumElements() / Factor);
6480 InstructionCost NumOfResults =
6481 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6482
6483 // About half of the loads may be folded into shuffles when we have only
6484 // one result. If we have more than one result, or the loads are masked,
6485 // we do not fold loads at all.
6486 unsigned NumOfUnfoldedLoads =
6487 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6488
6489 // Get the number of shuffle operations per result.
6490 unsigned NumOfShufflesPerResult =
6491 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6492
6493 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6494 // When we have more than one destination, we need additional instructions
6495 // to keep the sources intact.
6496 InstructionCost NumOfMoves = 0;
6497 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6498 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6499
6500 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6501 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6502 NumOfMoves;
6503
6504 return Cost;
6505 }
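// For illustration of the formula above: a Factor = 2 load of <32 x i32>
// with both members used on a 512-bit target gives NumOfMemOps = 2,
// NumOfResults = 2, NumOfShufflesPerResult = 1, NumOfUnfoldedLoads = 2 and
// NumOfMoves = 1, i.e. Cost = 2 * ShuffleCost + 2 * MemOpCost + 1.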
6506
6507 // Store.
6508 assert(Opcode == Instruction::Store &&
6509 "Expected Store Instruction at this point");
6510 // X86InterleavedAccess supports only the following interleaved-access groups.
6511 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6512 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6513 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6514 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
6515
6516 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6517 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6518 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6519 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6520 };
6521
6522 if (const auto *Entry =
6523 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6524 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6525 // If an entry does not exist, fall back to the default implementation.
6526
6527 // There are no strided stores at the moment, and a store can't be folded
6528 // into a shuffle.
6529 unsigned NumOfSources = Factor; // The number of values to be merged.
6530 InstructionCost ShuffleCost = getShuffleCost(
6531 TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr);
6532 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6533
6534 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6535 // We need additional instructions to keep the sources intact.
6536 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6537 InstructionCost Cost =
6538 MaskCost +
6539 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6540 NumOfMoves;
6541 return Cost;
6542}
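// For illustration of the store path above: a Factor = 2 store of <32 x i32>
// on a 512-bit target gives NumOfMemOps = 2, NumOfShufflesPerStore = 1 and
// NumOfMoves = 1, i.e. Cost = 2 * (MemOpCost + ShuffleCost) + 1.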
6543
6544 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6545 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6546 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6547 bool UseMaskForCond, bool UseMaskForGaps) {
6548 auto *VecTy = cast<FixedVectorType>(BaseTy);
6549
6550 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6551 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6552 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6553 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6554 return true;
6555 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6556 return ST->hasBWI();
6557 if (EltTy->isBFloatTy())
6558 return ST->hasBF16();
6559 return false;
6560 };
6561 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6562 return getInterleavedMemoryOpCostAVX512(
6563 Opcode, VecTy, Factor, Indices, Alignment,
6564 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6565
6566 if (UseMaskForCond || UseMaskForGaps)
6567 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6568 Alignment, AddressSpace, CostKind,
6569 UseMaskForCond, UseMaskForGaps);
6570
6571 // Get a cost estimate for interleaved load/store operations for SSE-AVX2.
6572 // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles that
6573 // allow computing the cost using a generic formula as a function of generic
6574 // shuffles. We therefore use a lookup table instead, filled according to
6575 // the instruction sequences that codegen currently generates.
6576
6577 // VecTy for interleave memop is <VF*Factor x Elt>.
6578 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6579 // VecTy = <12 x i32>.
6580 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6581
6582 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6583 // the VF=2, while v2i128 is an unsupported MVT vector type
6584 // (see MachineValueType.h::getVectorVT()).
6585 if (!LegalVT.isVector())
6586 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6587 Alignment, AddressSpace, CostKind);
6588
6589 unsigned VF = VecTy->getNumElements() / Factor;
6590 Type *ScalarTy = VecTy->getElementType();
6591 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6592 if (!ScalarTy->isIntegerTy())
6593 ScalarTy =
6594 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6595
6596 // Get the cost of all the memory operations.
6597 // FIXME: discount dead loads.
6598 InstructionCost MemOpCosts = getMemoryOpCost(
6599 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
6600
6601 auto *VT = FixedVectorType::get(ScalarTy, VF);
6602 EVT ETy = TLI->getValueType(DL, VT);
6603 if (!ETy.isSimple())
6604 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6605 Alignment, AddressSpace, CostKind);
6606
6607 // TODO: Complete for other data-types and strides.
6608 // Each combination of Stride, element bit width and VF results in a different
6609 // sequence; the cost tables are therefore accessed with:
6610 // Factor (stride) and VectorType=VFxiN.
6611 // The Cost accounts only for the shuffle sequence;
6612 // the cost of the loads/stores is accounted for separately.
6613 //
6614 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6615 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6616 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6617 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6618 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6619 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6620
6621 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6622 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6623 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6624
6625 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6626 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6627 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6628
6629 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6630 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6631 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6632 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6633
6634 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6635 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6636 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6637 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6638 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6639
6640 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6641 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6642 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6643 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6644 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6645
6646 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6647 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6648 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6649 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6650 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6651
6652 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6653 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6654 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6655 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6656
6657 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6658 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6659 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6660 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6661 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6662
6663 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6664 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6665 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6666 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6667 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6668
6669 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6670 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6671 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6672 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6673 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6674
6675 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6676 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6677 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6678 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6679
6680 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6681 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6682 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6683 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6684 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6685
6686 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6687 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6688 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6689 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6690 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6691
6692 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6693 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6694 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6695 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6696
6697 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6698 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6699 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6700
6701 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6702 };
6703
6704 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6705 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6706 };
6707
6708 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6709 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6710 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6711
6712 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6713 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6714
6715 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6716 };
6717
6718 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6719 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6720 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6721
6722 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6723 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6724 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6725
6726 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6727 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6728 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6729 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6730
6731 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6732 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6733 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6734 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6735 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6736
6737 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6738 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6739 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6740 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6741 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6742
6743 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6744 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6745 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6746 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6747 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6748
6749 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6750 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6751 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6752 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6753 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6754
6755 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6756 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6757 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6758 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6759
6760 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
6761 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
6762 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
6763 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
6764 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
6765
6766 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
6767 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
6768 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
6769 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
6770 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
6771
6772 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
6773 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
6774 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
6775 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
6776 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
6777
6778 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
6779 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
6780 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
6781 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
6782
6783 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
6784 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
6785 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
6786 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
6787 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
6788
6789 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
6790 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
6791 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
6792 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
6793 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
6794
6795 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
6796 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
6797 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
6798 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
6799
6800 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
6801 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
6802 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
6803 };
6804
6805 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
6806 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
6807 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
6808 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
6809
6810 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
6811 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
6812
6813 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
6814 };
6815
6816 if (Opcode == Instruction::Load) {
6817 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
6818 MemOpCosts](const CostTblEntry *Entry) {
6819 // NOTE: this is just an approximation!
6820 // It can over- or under-estimate the cost!
6821 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
6822 };
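// For illustration: a Factor = 3 load group where only 2 of the 3 members are
// used is charged MemOpCosts + ceil(2 * Entry->Cost / 3); with the
// {3, MVT::v8i8, 6} entry above that is MemOpCosts + 4.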
6823
6824 if (ST->hasAVX2())
6825 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
6826 ETy.getSimpleVT()))
6827 return GetDiscountedCost(Entry);
6828
6829 if (ST->hasSSSE3())
6830 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
6831 ETy.getSimpleVT()))
6832 return GetDiscountedCost(Entry);
6833
6834 if (ST->hasSSE2())
6835 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
6836 ETy.getSimpleVT()))
6837 return GetDiscountedCost(Entry);
6838 } else {
6839 assert(Opcode == Instruction::Store &&
6840 "Expected Store Instruction at this point");
6841 assert((!Indices.size() || Indices.size() == Factor) &&
6842 "Interleaved store only supports fully-interleaved groups.");
6843 if (ST->hasAVX2())
6844 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
6845 ETy.getSimpleVT()))
6846 return MemOpCosts + Entry->Cost;
6847
6848 if (ST->hasSSE2())
6849 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
6850 ETy.getSimpleVT()))
6851 return MemOpCosts + Entry->Cost;
6852 }
6853
6854 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6855 Alignment, AddressSpace, CostKind,
6856 UseMaskForCond, UseMaskForGaps);
6857}
6858
6859 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6860 StackOffset BaseOffset,
6861 bool HasBaseReg, int64_t Scale,
6862 unsigned AddrSpace) const {
6863 // Scaling factors are not free at all.
6864 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
6865 // will take 2 allocations in the out-of-order engine instead of 1
6866 // for plain addressing mode, i.e. inst (reg1).
6867 // E.g.,
6868 // vaddps (%rsi,%rdx), %ymm0, %ymm1
6869 // Requires two allocations (one for the load, one for the computation)
6870 // whereas:
6871 // vaddps (%rsi), %ymm0, %ymm1
6872 // Requires just 1 allocation, i.e., freeing allocations for other operations
6873 // and having fewer micro-operations to execute.
6874 //
6875 // For some X86 architectures, this is even worse because for instance for
6876 // stores, the complex addressing mode forces the instruction to use the
6877 // "load" ports instead of the dedicated "store" port.
6878 // E.g., on Haswell:
6879 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
6880 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
6881 TargetLoweringBase::AddrMode AM;
6882 AM.BaseGV = BaseGV;
6883 AM.BaseOffs = BaseOffset.getFixed();
6884 AM.HasBaseReg = HasBaseReg;
6885 AM.Scale = Scale;
6886 AM.ScalableOffset = BaseOffset.getScalable();
6887 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6888 // Scale represents reg2 * scale, thus account for 1
6889 // as soon as we use a second register.
6890 return AM.Scale != 0;
6891 return -1;
6892}
6893
6894 InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
6895 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
6896 return 14;
6897}
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1627
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:358
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1446
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:970
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:763
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:439
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:892
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:856
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:760
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:765
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:768
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:367
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:838
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:621
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:429
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:842
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
Definition: DerivedTypes.h:646
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:679
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:212
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
Definition: DerivedTypes.h:463
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Definition: DerivedTypes.h:517
Type * getElementType() const
Definition: DerivedTypes.h:436
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasSSE42() const
Definition: X86Subtarget.h:198
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasAVX() const
Definition: X86Subtarget.h:199
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool hasAVX2() const
Definition: X86Subtarget.h:200
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool isLegalNTStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLegalNTLoad(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment)
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isLegalMaskedLoad(Type *DataType, Align Alignment)
bool hasConditionalLoadStoreForType(Type *Ty=nullptr) const
bool supportsEfficientVectorElementLoadStore() const
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
bool prefersVectorizedAddressing() const
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment)
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment)
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
Calculate the cost of Gather / Scatter operation.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
unsigned getMaxInterleaveFactor(ElementCount VF)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
unsigned getAtomicMemIntrinsicMaxElementSize() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
InstructionCost getIntImmCost(int64_t)
Calculate the cost of materializing a 64-bit value.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getBranchMispredictPenalty() const
bool isExpensiveToSpeculativelyExecute(const Instruction *I)
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF)
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const
bool hasDivRemOp(Type *DataType, bool IsSigned)
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:2978
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:752
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:840
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:751
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:980
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:938
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1047
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:886
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:919
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:347
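The ISD opcodes listed above are how this file keys its cost tables: the IR opcode is first mapped to the corresponding SelectionDAG opcode. A minimal sketch, assuming TLI points at the target lowering object as elsewhere in the file:
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// ISD now holds e.g. ISD::ADD or ISD::SHL and can index a cost table.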
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
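A hedged sketch combining the matchers above; V is an arbitrary Value*, and the variable names are illustrative:
using namespace llvm::PatternMatch;
const APInt *SplatVal;
Value *PtrOp;
// Match a single-use load, capturing its pointer operand...
bool IsOneUseLoad = match(V, m_OneUse(m_Load(m_Value(PtrOp))));
// ...or an integer (splat) constant, tolerating poison lanes.
bool IsConstSplat = match(V, m_APIntAllowPoison(SplatVal));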
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
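A hedged sketch of the lookup idiom used throughout this file; the table contents are illustrative rather than real X86 costs, and ISD plus the legalized type pair LT are assumed to be computed as elsewhere in the file:
static const CostTblEntry ExampleTbl[] = {
  { ISD::SHL, MVT::v8i32, 1 },
  { ISD::ADD, MVT::v4i64, 1 },
};
if (const auto *Entry = CostTableLookup(ExampleTbl, ISD, LT.second))
  return LT.first * Entry->Cost;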
AddressSpace
Definition: NVPTXBaseInfo.h:21
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:555
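Two quick examples of alignDown's semantics, including the optional Skew parameter:
uint64_t A = alignDown(37, 16);    // 32: largest multiple of 16 <= 37
uint64_t B = alignDown(37, 16, 5); // 37: largest value <= 37 that is 5 (mod 16)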
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
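For contrast with alignDown above, a one-line PowerOf2Ceil example:
uint64_t P = PowerOf2Ceil(17); // 32, the next power of two >= 17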
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
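A combined illustration of the three range wrappers (all_of, any_of, none_of) over a hypothetical shuffle mask; SmallVector is used only to have a concrete range:
SmallVector<int, 8> Mask = {0, 1, -1, 3};
bool AllInBounds = all_of(Mask, [](int M) { return M >= 0; });  // false
bool HasSentinel = any_of(Mask, [](int M) { return M < 0; });   // true
bool NoBigIdx    = none_of(Mask, [](int M) { return M > 7; });  // true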
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
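A one-line divideCeil example, e.g. counting 128-bit registers needed for 192 bits:
uint64_t NumRegs = divideCeil(192, 128); // 2 == ceil(192 / 128)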
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
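A hedged sketch of the callback shape this function expects; the lambda bodies are placeholders, and Mask, NumSrc, and NumDst are assumed to be in scope:
processShuffleMasks(
    Mask, NumSrc, NumDst, /*NumOfUsedRegs=*/NumDst,
    []() { /* this destination register needs no input */ },
    [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DstReg) {
      // per-register shuffle with a single input register
    },
    [&](ArrayRef<int> RegMask, unsigned Src1, unsigned Src2) {
      // per-register shuffle combining two input registers
    });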
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
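Small examples for the two alignment helpers above:
uint64_t Padded = alignTo(13, Align(8));  // 16
Align C = commonAlignment(Align(16), 20); // Align(4): offset 20 is only 4-aligned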
InstructionCost Cost
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
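The conversion-table analogue of the CostTableLookup idiom; the table contents are illustrative only, and ISD, DstVT, and SrcVT are assumed to be computed as elsewhere in the file:
static const TypeConversionCostTblEntry ExampleTbl[] = {
  { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
};
if (const auto *Entry = ConvertCostTableLookup(ExampleTbl, ISD, DstVT, SrcVT))
  return Entry->Cost;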
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
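This accessor belongs to the per-cost-kind helper struct defined earlier in the file (named CostKindCosts in the source); a minimal sketch of its intended use:
CostKindCosts KindCosts;
KindCosts.LatencyCost = 3; // the other three kinds stay unset (~0U)
if (std::optional<unsigned> C = KindCosts[TargetTransformInfo::TCK_Latency]) {
  InstructionCost LatCost = *C; // 3
}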
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:359
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:314
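A hedged sketch of the usual EVT flow in the cost code, with TLI and DL assumed in scope:
EVT VT = TLI->getValueType(DL, Ty);
if (VT.isSimple() && VT.isVector()) {
  MVT SimpleVT = VT.getSimpleVT(); // safe only after isSimple()
  EVT EltVT = VT.getScalarType();  // element type of the vector
  TypeSize Bits = VT.getSizeInBits();
}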
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition: Alignment.h:141
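A one-liner showing the MaybeAlign-to-Align convenience:
MaybeAlign MA;             // undefined alignment
Align A = MA.valueOrOne(); // Align(1)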
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55